## Data stage

In [1]:
%matplotlib inline

In [2]:
# Packages required in the Data Stage
import util_funcs
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow import keras


In [3]:
## Setup experiment directories
BASE_DIR = 'store'
data_dir, model_dir, vocab_dir = util_funcs.set_experiment_dirs(BASE_DIR)
print(
    f'base directory: {BASE_DIR}\n\n'
    f'data: {data_dir}\n'
    f'model: {model_dir}\n'
    f'vocab: {vocab_dir}\n'
)

base directory: store

data: store/data
model: store/model
vocab: store/vocab



In [4]:
# Set the column width so to see the entire length of the `title` column
pd.set_option('display.max_colwidth', None)

## Read the data from store 
all_data = pd.read_csv(f'{data_dir}/all_news_data.csv')


In [5]:
## Check the proportion of classes
all_data.topic.value_counts(normalize=True).sort_index().mul(100).round(1).astype(str)+'%'

topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object

In [6]:
## Split all data into train, test and dev (or validation) sets - 60%, 20%, 20% split such that class proportions are maintained
## Stratified sampling
train_df, test_df = train_test_split(all_data, test_size=0.2, stratify = all_data['topic'])
train_df, dev_df = train_test_split(train_df, test_size=0.20, stratify = train_df['topic'])


In [7]:
## Check the proportion of classes in split datasets
print(train_df.topic.value_counts(normalize=True).sort_index().mul(100).round(1).astype(str)+'%')
print(test_df.topic.value_counts(normalize=True).sort_index().mul(100).round(1).astype(str)+'%')
print(dev_df.topic.value_counts(normalize=True).sort_index().mul(100).round(1).astype(str)+'%')


topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object
topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object
topic
BUSINESS         13.8%
ENTERTAINMENT    13.8%
HEALTH           13.8%
NATION           13.8%
SCIENCE           3.5%
SPORTS           13.8%
TECHNOLOGY       13.8%
WORLD            13.8%
Name: proportion, dtype: object


In [8]:
## Save the split datasets in store/data
# Save the datasets
util_funcs.save_data(train_df, data_dir, 'train_data.csv')
util_funcs.save_data(dev_df, data_dir, 'dev_data.csv')
util_funcs.save_data(test_df, data_dir, 'test_data.csv')

In [9]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])

## Modeling stage

[TextVectorization()](https://www.tensorflow.org/api_docs/python/tf/keras/layers/TextVectorization) layer for title vectorization </br>
[StringLookup()](https://www.tensorflow.org/api_docs/python/tf/keras/layers/StringLookup) layer to convert the labels/topics to numerical indices

In [18]:
# Working folder for the project
BASE_DIR = './store'

# Max length and vocabulary size used 
MAX_LENGTH = 20
VOCAB_SIZE = 10000

# # Get the subdirectories that contain the experiment files
# data_dir, model_dir, vocab_dir = lab_utils.set_experiment_dirs(BASE_DIR)

# # Load the train and test sets
# train_df = pd.read_csv(f'{data_dir}/train_data.csv')
# dev_df = pd.read_csv(f'{data_dir}/dev_data.csv')
# test_df = pd.read_csv(f'{data_dir}/test_data.csv')

topic_lookup = tf.keras.layers.StringLookup(vocabulary=f'{vocab_dir}/labels.txt', num_oov_indices=0)

## CREATE TEXT ENCODER`
## Setup the String lookup and the Text Vectorization tensors for labels and topics respectively
title_preprocessor = tf.keras.layers.TextVectorization(max_tokens=VOCAB_SIZE, output_sequence_length=MAX_LENGTH)
##Calling adapt() on a TextVectorization layer is an alternative to passing in a precomputed vocabulary on construction via the vocabulary argument.
# Extract the titles from the new training set
train_inputs = train_df['title']
title_preprocessor.adapt(train_inputs)

# Save the new vocabulary
util_funcs.save_vocab(title_preprocessor, vocab_dir)
util_funcs.save_labels(topic_lookup, vocab_dir)

In [19]:
vocab = np.array(title_preprocessor.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'in', 'to', 'be', 'will', 'the', 'of', 'why', '12',
       'another', 'iphone', 'story', 'growth', 'apples', 'chapter',
       'defining', 'for', 'and', 'on'], dtype='<U21')

In [20]:
# Convert the string datasets to Tensorflow datasets
train_ds = util_funcs.df_to_tfdata(train_df, topic_lookup, title_preprocessor, shuffle=True)
dev_ds = util_funcs.df_to_tfdata(dev_df, topic_lookup, title_preprocessor)
test_ds = util_funcs.df_to_tfdata(test_df, topic_lookup, title_preprocessor)


In [None]:
train_ds

In [21]:
for example, label in train_ds.take(1):
    print('texts: ', example.numpy()[:3])
    print()
    print('labels: ', label.numpy()[:3])
# encoded_example = title_preprocessor(example)[:3].numpy()
# encoded_example

texts:  [[1517   71 2808   76 8525  608  889   85 7959   33    6  389  964    0
     0    0    0    0    0    0]
 [ 938    1    1 7779    1   18 5755   32 2869    1    0    0    0    0
     0    0    0    0    0    0]
 [ 367    2    1  356 1598    7 1314    7    1 2355 1896   19    1    0
     0    0    0    0    0    0]]

labels:  [0 6 2]


In [22]:
# Parameters
EMBEDDING_DIM = 24
DENSE_DIM = 24
topic_size = topic_lookup.vocabulary_size()

## create the sequential deep learning model
model1 = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim = 10000, output_dim=EMBEDDING_DIM, input_length=20, name="embedding_1" ),
    tf.keras.layers.Dense(units= DENSE_DIM, activation='relu', name="dense_2"),
    tf.keras.layers.Flatten(name = "flatten_1"),
    tf.keras.layers.Dense(units= 8, activation = "softmax", name = "dense_3")
])

model1.compile(optimizer = 'adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [23]:
NUM_EPOCHS = 5
# Train the model. Use the dev set to check if your model is overfitting.
model1.fit(train_ds, epochs=NUM_EPOCHS, validation_data=dev_ds, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7cf719954810>

In [24]:
model1.evaluate(test_ds)



[0.6512677669525146, 0.809377133846283]

In [25]:
# Save the model
model1.save(model_dir)

INFO:tensorflow:Assets written to: store/model/assets


INFO:tensorflow:Assets written to: store/model/assets


### Error Analysis

The first iteration of our model will likely underperform and we would need to make adjustments to improve it. Error analysis helps us determine the part of  process that needs to be updated to see improvement in the model. Likewise, it helps us to avoid focusing on parts that do not greatly affect the results.


#### Prioritizing What to Work On

Looking at the performance of your model on different categories of the data will help you decide how to improve its performance. In this case, you will evaluate the model on each of the 8 classes it's trying to predict.

In [26]:
# Get the list of topics
topics = topic_lookup.get_vocabulary()

# Evaluate the model's performance for each topic
util_funcs.print_metric_per_topic(dev_df, topics, topic_lookup, title_preprocessor, model1)

ACCURACY PER TOPIC:

ENTERTAINMENT: 80.83 
HEALTH: 83.33 
TECHNOLOGY: 87.79 
WORLD: 64.29 
BUSINESS: 100.00 
SPORTS: 91.58 
NATION: 62.54 
SCIENCE: 75.99 


From the results, you can check which ones stand out. If you have a baseline such as human-level performance (HLP), you can measure how far each category is from that value, then focus your efforts on the category that will bring the biggest overall improvement.

On the other hand, this analysis can also help you spot errors. You might notice that performance on the `BUSINESS` topic seems suspiciously high compared to the rest. See if you can find why that is. 

In [27]:
train_b = train_df[train_df.topic =="BUSINESS"]
print(train_b.shape)
train_b['title'].unique()

(9600, 5)


array(["Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story"],
      dtype=object)

<br>
<br>

You might have noticed that the titles for all articles are the same: `"Why iPhone 12 Will Be Another 'Defining Chapter' In Apple's Growth Story"`. The model only learned this pattern, so it will likely not generalize well when real-world business-related titles come in. 

After some investigation, you find out that the previous developer accidentally overwrote the columns while fixing a character encoding. Luckily, there is a backup file (correct_all_news.csv in store) which contains the original values. You can now procede with a new experiment using the correct values. To do so, generate train, dev, and test sets again and save these datasets to a folder named `store2`.

In [28]:
# Set the experiment folder
BASE_DIR = './store2'

# Set the subdirectories that will contain the experiment files
data_dir, model_dir, vocab_dir = util_funcs.set_experiment_dirs(BASE_DIR)

# Load the backup CSV
#combined_df = pd.read_csv(f'./.backup.csv')
combined_df = pd.read_csv(f'./store/correct_all_news.csv')

# Generate train, dev, and test sets as you did before.
train_df, test_df = train_test_split(combined_df, test_size=0.2, stratify=combined_df['topic'])
train_df, dev_df = train_test_split(train_df, test_size=0.25, stratify=train_df['topic'])

# Save the datasets under the E3 folder
util_funcs.save_data(train_df, data_dir, 'train_data.csv')
util_funcs.save_data(dev_df, data_dir, 'dev_data.csv')
util_funcs.save_data(test_df, data_dir, 'test_data.csv')

In [29]:
# Generate a new vocabulary based on the new training set
train_inputs = train_df['title']
title_preprocessor.adapt(train_inputs)

# Save the new vocabulary and labels

util_funcs.save_vocab(title_preprocessor, vocab_dir)
util_funcs.save_labels(topic_lookup, vocab_dir)

In [30]:
import numpy as np
vocab = np.array(title_preprocessor.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'to', 'in', 'the', 'of', 'for', 'and', 'on', 'a',
       'covid19', 'with', 'new', 'as', 'coronavirus', 'is', 'after', 'at',
       'from', 'by'], dtype='<U25')

In [31]:
## Now, we convert the dataframes to tf data & train model again
NUM_EPOCHS = 5

# Convert the dataframes to numeric features. Remember to shuffle the training set.
train_ds = util_funcs.df_to_tfdata(train_df, topic_lookup, title_preprocessor,shuffle=True)
dev_ds = util_funcs.df_to_tfdata(dev_df, topic_lookup, title_preprocessor)
test_ds = util_funcs.df_to_tfdata(test_df, topic_lookup, title_preprocessor)

# Reset the model weights
model2 = util_funcs.model_reset_weights(model1)

# Train the model
model2.fit(train_ds, epochs = NUM_EPOCHS, validation_data = dev_ds, verbose=1)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7cf6e06d6390>

In [33]:
# Evaluate the model on the test set and write the results on the experiment tracker
model2.evaluate(test_ds)

# Save the model to model_dir
model2.save(model_dir)

INFO:tensorflow:Assets written to: ./store2/model/assets


INFO:tensorflow:Assets written to: ./store2/model/assets


In [35]:
##Now evaluate the model again on each topic. You should see the accuracy on the business articles drop from 100% 
##because the model has to learn more words related to the topic.
util_funcs.print_metric_per_topic(dev_df, topics, topic_lookup, title_preprocessor, model2)


ACCURACY PER TOPIC:

ENTERTAINMENT: 83.13 
HEALTH: 80.27 
TECHNOLOGY: 85.73 
WORLD: 62.33 
BUSINESS: 72.67 
SPORTS: 87.07 
NATION: 58.33 
SCIENCE: 74.97 


In [37]:
# Get examples in the dev set that predicted `NATION` but the ground truth label is different
util_funcs.get_errors(model2, dev_df, title_preprocessor, topic_lookup, 'NATION')

label: WORLD
prediction: NATION
title: Tropical Storm Isaias remnants to wash over Quebec, heavy rain

label: SCIENCE
prediction: NATION
title: Explained: A method proposed for converting PPE into biofuels

label: BUSINESS
prediction: NATION
title: Nigeria eyes record 12.65 trillion naira spending plan for 2021 - document

label: ENTERTAINMENT
prediction: NATION
title: Store owner slams ‘teen influencer’ over ‘entitled’ social media request: ‘Please stop asking’

label: WORLD
prediction: NATION
title: Kashamu: Backstory of Obasanjo’s letter to the dead

label: ENTERTAINMENT
prediction: NATION
title: Spotlight on business families amid Supreme Court’s ruling on HUFs

label: BUSINESS
prediction: NATION
title: Groundworks boss banned for false tax returns

label: BUSINESS
prediction: NATION

label: HEALTH
prediction: NATION
title: ESB: Fault In Calverstown.

label: WORLD
prediction: NATION
title: KUL now has connection to 30 cities in 20 countries

label: WORLD
prediction: NATION
title: $

Although some predictions are indeed mistakes, you might notice that some examples might also be related to two categories. For example, this title: `COVID-19 hospital admissions up slightly across St. Louis area` sounds like it can both be a `HEALTH` and `NATION` article. 

You need to ask if the human labellers who provided the ground truth have clear instructions on how to label such topics. If some of them label COVID articles as `HEALTH` while others pick `NATION`, then this ambiguity will likely affect the model negatively.

If a clear rule for choosing a single topic cannot be clearly defined, one way you can improve human-level performance is to allow labelers to select more than one topic. So, instead of just having this table when labelling:

| Title      | Topic |
| -----------| ----- |
| Title 1    |       |
| Title 2    |       |
| Title 3    |       |

They can have something like this instead where they can mark several categories for a title:

| Title      | ENTERTAINMENT | HEALTH | TECHNOLOGY | WORLD | BUSINESS | SPORTS | NATION | SCIENCE |
| -----------| ----- | ----- | ----- | ----- | ----- | ----- | ----- | ----- |
| Title 1    |       |       |       |       |       |       |       |       |
| Title 2    |       |       |       |       |       |       |       |       |
| Title 3    |       |       |       |       |       |       |       |       |

One other approach is to merge certain topics that are related to each other. So instead of having 8 classes, you can decide to only have 6: `ENTERTAINMENT`, `HEALTH`, `BUSINESS`, `SPORTS`, `WORLD and NATION` and `SCIENCE and TECHNOLOGY`. 

When making decisions like these, you need to get buy-in from the product/business owner because this will also impact other aspects of their operations. For example, this might mean that the article will appear in several parts of the News App, or their current system might break because some categories no longer exist.

As a proof-of-concept, let's check if the second to the top class prediction of the model corresponds to the ground truth labels. The model is originally compiled to only get the top prediction of the softmax output. You can recompile the model to reward it if it the ground truth is in the top two predictions. You can use the [SparseTopKCategoricalAccuracy](https://www.tensorflow.org/api_docs/python/tf/keras/metrics/SparseTopKCategoricalAccuracy) metric for that.

In [38]:
# Set the top-K accuracy to 2
model2.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=[tf.keras.metrics.SparseTopKCategoricalAccuracy(k=2)]             
             )

# Check the accuracy
model2.evaluate(dev_ds)



[0.8316344022750854, 0.8820960521697998]

In [40]:
# Print the accuracy per topic
util_funcs.print_metric_per_topic(dev_df, topics, topic_lookup, title_preprocessor, model2)

ACCURACY PER TOPIC:

ENTERTAINMENT: 90.73 
HEALTH: 90.53 
TECHNOLOGY: 92.47 
WORLD: 83.93 
BUSINESS: 85.23 
SPORTS: 91.67 
NATION: 83.53 
SCIENCE: 85.70 
