In [None]:
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install tensorflow_addons
!pip install tensorflow_text
#!pip install tf-models-official

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import tensorflow_text as text  # Registers the ops.
from keras.utils import to_categorical
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Plots
import matplotlib.pyplot as plt
import seaborn as sns


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 



In [None]:
# Load the "ag_news" dataset; later cells read its 'train' and 'test' splits.
from datasets import load_dataset
dataset = load_dataset("ag_news")



  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Draw a class-balanced subsample of the training split (SAMPLES_PER_CLASS rows
# per label); the test split is used in full.
SAMPLES_PER_CLASS = 4000
SAMPLE_SEED = 42  # fixed so Restart & Run All draws the same rows every time

df_train = (
    pd.DataFrame(dataset['train'])
    .groupby('label')
    .sample(n=SAMPLES_PER_CLASS, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
df_test = pd.DataFrame(dataset['test'])

In [None]:
# Display names for the four integer class ids; used when building the report labels.
TEXT_LABELS = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import multiprocessing as mp

# Make sure NLTK's stopword corpus is available locally, then keep the English
# list as a set.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text, stopword_set=None):
    """Return ``text`` with stopwords removed.

    The text is split on whitespace, every token whose lower-cased form is in
    the stopword collection is dropped, and the remaining tokens are re-joined
    with single spaces.

    Args:
        text: Input string.
        stopword_set: Optional collection of lower-case stopwords to filter
            against. When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The filtered string; empty if no token survives.
    """
    if stopword_set is None:
        # Reuse the precomputed set: calling stopwords.words('english') for
        # every token rebuilt the list each time and scanned it linearly.
        stopword_set = stop_words
    return " ".join(
        word for word in text.split() if word.lower() not in stopword_set
    )

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Store a stopword-free copy of each training text in a new 'clean_text' column.
# NOTE(review): the visible downstream cells build X_train from 'text', not
# 'clean_text' — confirm whether this column is still needed.
df_train['clean_text'] = df_train['text'].apply(remove_stopwords)

In [None]:
# Store a stopword-free copy of each test text in a new 'clean_text' column.
# NOTE(review): the visible downstream cells build X_test from 'text', not
# 'clean_text' — confirm whether this column is still needed.
df_test['clean_text']=df_test['text'].apply(remove_stopwords)

In [None]:
# Model inputs are the raw 'text' column of each frame.
X_train = df_train['text']
X_test = df_test['text']

# Targets: fit the label encoder on the training labels, apply the same mapping
# to the test labels, and one-hot encode both over the four classes.
le = LabelEncoder()
y_train = to_categorical(le.fit_transform(df_train['label']), num_classes=4)
y_test = to_categorical(le.transform(df_test['label']), num_classes=4)

In [None]:
# Standalone preprocessor + encoder pair; the next cells use `preprocessor` and
# `encoder` directly to inspect their outputs.
PREPROCESS_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER_HANDLE = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

text_input = tf.keras.layers.Input(shape=(), dtype=tf.string)

preprocessor = hub.KerasLayer(PREPROCESS_HANDLE)
encoder_inputs = preprocessor(text_input)

encoder = hub.KerasLayer(ENCODER_HANDLE, trainable=True)
outputs = encoder(encoder_inputs)

# pooled_output:   [batch_size, 768]
# sequence_output: [batch_size, seq_length, 768]
pooled_output, sequence_output = outputs["pooled_output"], outputs["sequence_output"]

In [None]:
# Verify that the preprocessor works: run it on a single sentence and display
# the first 25 positions of each returned tensor for that sentence.
hub_inputs = preprocessor(['ID for each word, with zero padding at the end.'])
{key: value[0, :25].numpy() for key, value in hub_inputs.items()}

{'input_word_ids': array([  101,  8909,  2005,  2169,  2773,  1010,  2007,  5717, 11687,
         4667,  2012,  1996,  2203,  1012,   102,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=int32),
 'input_mask': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], dtype=int32),
 'input_type_ids': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0], dtype=int32)}

In [None]:
# Run the standalone encoder in inference mode on the sample batch and report
# the shape of each of its two outputs.
result = encoder(hub_inputs, training=False)

pooled_shape = result['pooled_output'].shape
sequence_shape = result['sequence_output'].shape
print("Pooled output shape:", pooled_shape)
print("Sequence output shape:", sequence_shape)

Pooled output shape: (1, 768)
Sequence output shape: (1, 128, 768)


In [None]:
# Training hyper-parameters.
epochs = 5
batch_size = 128
eval_batch_size = 32

# Length of the learning-rate schedule. Keras runs a trailing partial batch when
# the sample count is not a multiple of batch_size, so round the per-epoch step
# count up; flooring would make the schedule shorter than the real training run.
train_data_size = df_train.shape[0]
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1 * num_train_steps)  # warm up over the first 10% of steps

# AdamW optimizer whose schedule is driven by the step counts computed above.
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                      num_train_steps=num_train_steps,
                      num_warmup_steps=num_warmup_steps,
                      optimizer_type='adamw')

In [None]:
def build_model(num_classes, optimizer, max_len=512):
    """Assemble and compile a BERT-based classifier that consumes raw strings.

    Pipeline: string input -> TF-Hub BERT preprocessing -> trainable BERT
    encoder -> pooled output -> Dropout(0.1) -> Dense(64, relu) -> Dropout(0.1)
    -> softmax over ``num_classes``.

    Args:
        num_classes: Number of units in the final softmax layer.
        optimizer: Optimizer passed to ``model.compile``.
        max_len: Not referenced in the body; retained so existing calls that
            pass it keep working.

    Returns:
        The compiled Keras model (categorical cross-entropy loss, accuracy metric).
    """
    preprocess_url = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
    encoder_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    tokenized = hub.KerasLayer(preprocess_url, name='preprocessing')(raw_text)
    bert_outputs = hub.KerasLayer(encoder_url, trainable=True, name='BERT_encoder')(tokenized)

    # Classification head on top of the pooled representation.
    hidden = bert_outputs['pooled_output']
    head_layers = (
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.1),
    )
    for layer in head_layers:
        hidden = layer(hidden)
    probabilities = tf.keras.layers.Dense(
        num_classes, activation='softmax', name='classifier')(hidden)

    classifier = tf.keras.models.Model(inputs=raw_text, outputs=probabilities)
    classifier.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [None]:
# Build the four-class classifier, compiled with the optimizer created earlier.
model = build_model(num_classes=4, optimizer=optimizer)

In [None]:
# Print the layer-by-layer architecture of the compiled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_mask': (Non  0           ['text[0][0]']                   
                                e, 128),                                                          
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [None]:
# Callbacks: write the model with the best validation accuracy to 'model.h5',
# and stop once validation accuracy has failed to improve for `patience` epochs.
# NOTE(review): validation_data below is the test split, so checkpoint selection
#   and early stopping are driven by the same data the final report uses —
#   consider a separate validation split.
# NOTE(review): patience=5 with epochs=5 appears unable to trigger early stopping.
# NOTE(review): the best checkpoint is saved, but the visible prediction cell uses
#   the in-memory `model` as left by the last epoch.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
)
training_callbacks = [checkpoint, early_stopping]

history = model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    callbacks=training_callbacks,
    validation_data=(X_test, y_test),
    validation_batch_size=eval_batch_size,
)

Epoch 1/5
Epoch 1: val_accuracy improved from -inf to 0.91566, saving model to model.h5
Epoch 2/5
Epoch 2: val_accuracy improved from 0.91566 to 0.92513, saving model to model.h5
Epoch 3/5
Epoch 3: val_accuracy improved from 0.92513 to 0.92816, saving model to model.h5
Epoch 4/5
Epoch 4: val_accuracy did not improve from 0.92816
Epoch 5/5
Epoch 5: val_accuracy did not improve from 0.92816


In [None]:
# Predict on the test texts; the model's final layer is a softmax over the classes.
y_proba = model.predict(X_test, batch_size=eval_batch_size)



In [None]:
# Pick the index of the highest-scoring class for every test example.
y_pred = np.argmax(y_proba, axis=1)

In [None]:
# Convert predicted and true class ids into their display names for reporting.
predicted_ids = le.inverse_transform(y_pred)
y_pred_labels = list(map(TEXT_LABELS.__getitem__, predicted_ids))
y_true_labels = list(map(TEXT_LABELS.__getitem__, df_test['label']))

In [None]:
# Per-class precision/recall/F1, followed by the raw confusion matrix.
# No explicit label order is passed; in the recorded output the matrix rows and
# columns line up with the report's row order (Business, Sci/Tech, Sports, World).
print(classification_report(y_true=y_true_labels, y_pred=y_pred_labels))
print(confusion_matrix(y_true_labels, y_pred_labels))

              precision    recall  f1-score   support

    Business       0.91      0.88      0.90      1900
    Sci/Tech       0.89      0.91      0.90      1900
      Sports       0.98      0.98      0.98      1900
       World       0.92      0.94      0.93      1900

    accuracy                           0.93      7600
   macro avg       0.93      0.93      0.93      7600
weighted avg       0.93      0.93      0.93      7600

[[1681  156    4   59]
 [ 108 1722   11   59]
 [   5    4 1864   27]
 [  53   46   22 1779]]
