### Retrain a fine-tuned Albert model on a new batch of data

In [None]:
import pandas as pd
import numpy as np
import string
from unidecode import unidecode
import tensorflow as tf 
from sklearn.utils import class_weight
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import cloudpickle

### Fine-tuning code can be found at https://github.com/ksv-muralidhar/hugging_face_tf_fine_tuning/blob/main/albert_lime_sentiment_analysis.ipynb

In [2]:
# https://www.kaggle.com/datasets/kritanjalijain/amazon-reviews?select=train.csv
data = pd.read_csv("sentiment_train.csv", names=['category', 'title', 'desc'])

In [3]:
data.shape

(3600000, 3)

In [4]:
# Assuming it to be new batch of data
data = data.sample(n=50000, random_state=11)

In [5]:
data

Unnamed: 0,category,title,desc
1804224,2,You have to have this kit,Thought maybe the price of this kit was too hi...
1804656,2,a must buy for your guy,The best cologne I've ever smelled....one spra...
1594094,2,Gloriously frustrating!!!,Good resource for interesting tidbits. You'll ...
2518751,2,Exactly what is advertised at half the price.,This is the pantone book that is very importan...
1600297,1,The funny little woman and her adventure,This little Japanese woman loses a rice dumpli...
...,...,...,...
2475094,1,Not recommended for heavy bags,I have an unusually heavy laptop bag for work ...
2381779,2,Great Sheet!,I've been very pleased with this sheet and all...
1071675,1,Beginners only,I found the book to be valuable for a beginner...
1602950,1,"The music was great, but the film bombed.","Not really Shah Rukh's best, but a decent watc..."


In [6]:
data ['text'] = data['title'] + " " + data['desc']
data = data[['text', 'category']].copy()

In [7]:
data = data.reset_index(drop=True)

In [8]:
data

Unnamed: 0,text,category
0,You have to have this kit Thought maybe the pr...,2
1,a must buy for your guy The best cologne I've ...,2
2,Gloriously frustrating!!! Good resource for in...,2
3,Exactly what is advertised at half the price. ...,2
4,The funny little woman and her adventure This ...,1
...,...,...
49995,Not recommended for heavy bags I have an unusu...,1
49996,Great Sheet! I've been very pleased with this ...,2
49997,Beginners only I found the book to be valuable...,1
49998,"The music was great, but the film bombed. Not ...",1


In [9]:
data['text'][0]

"You have to have this kit Thought maybe the price of this kit was too high but not so. I really think I got my monies worth. I have owned the vacuum cleaner for 3 years and put off buying the kit. What was I thinking? Excellent product just like the vacuum cleaner. I was very pleased when the kit arrived in just 4 days from Mike's Tools. Thank you!"

In [10]:
data.dropna(inplace=True)

In [11]:
with open("sentiment_preprocessor_labelencoder.bin", "rb") as model_file_obj:
    text_preprocessor, label_encoder = cloudpickle.load(model_file_obj)

In [12]:
data['text'] = text_preprocessor.preprocess(data['text'])

In [13]:
data['text'][0]

'kit thought maybe price kit too high but not really think got monies worth owned vacuum cleaner years put buying kit thinking excellent product just like vacuum cleaner very pleased kit arrived just days mike tools thank'

In [14]:
label_dist = pd.Series(data['category']).value_counts()
label_dist

category
2    25036
1    24959
Name: count, dtype: int64

In [15]:
data['category'] = data['category'].replace({1: 'negative', 2: 'positive'})

In [16]:
label_dist = pd.Series(data['category']).value_counts()
label_dist

category
positive    25036
negative    24959
Name: count, dtype: int64

In [17]:
data['target'] = label_encoder.transform(data['category'])

In [18]:
data

Unnamed: 0,text,category,target
0,kit thought maybe price kit too high but not r...,positive,1
1,must buy guy best cologne ever smelled one spr...,positive,1
2,gloriously frustrating good resource interesti...,positive,1
3,exactly advertised half price pantone book ver...,positive,1
4,funny little woman adventure little japanese w...,negative,0
...,...,...,...
49995,not recommended heavy bags unusually heavy lap...,negative,0
49996,great sheet very pleased sheet carter products...,positive,1
49997,beginners only found book valuable beginner bu...,negative,0
49998,music great but film bombed not really shah ru...,negative,0


In [19]:
x = data['text'].copy()
y = data['target'].copy()

In [20]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [21]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((39996,), (9999,), (39996,), (9999,))

In [22]:
x_train, x_test, y_train, y_test = x_train.to_list(), x_test.to_list(), y_train.to_list(), y_test.to_list()

In [23]:
classes_ = sorted([*y.unique()]).copy()
classes_

[0, 1]

In [24]:
from transformers import AlbertTokenizerFast

In [25]:
model_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizerFast.from_pretrained(model_checkpoint)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

In [26]:
print(x_train[0])
print(tokenizer.tokenize(x_train[0]))
print(tokenizer(x_train[0]))

value nicely constructed easly cleaned first fried turkey great also used wings night yum
['▁value', '▁nicely', '▁constructed', '▁ea', 's', 'ly', '▁cleaned', '▁first', '▁fried', '▁turkey', '▁great', '▁also', '▁used', '▁wings', '▁night', '▁', 'yum']
{'input_ids': [2, 1923, 24050, 2096, 13507, 18, 102, 13143, 64, 10317, 3567, 374, 67, 147, 3433, 343, 13, 18105, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [27]:
strategy = tf.distribute.MirroredStrategy()

In [28]:
BATCH_SIZE = 32 * strategy.num_replicas_in_sync
N_TOKENS = 150
N_CLASSES = len(classes_)

In [29]:
train_tokens = tokenizer(x_train, max_length=N_TOKENS, padding="max_length", truncation=True, return_tensors="tf", return_attention_mask=True, return_token_type_ids=False)
test_tokens = tokenizer(x_test, max_length=N_TOKENS, padding="max_length", truncation=True, return_tensors="tf", return_attention_mask=True, return_token_type_ids=False)

In [30]:
train_tokens[:5]

[Encoding(num_tokens=150, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=150, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=150, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=150, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing]),
 Encoding(num_tokens=150, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])]

In [31]:
train_tf_data = tf.data.Dataset.from_tensor_slices((dict(train_tokens), to_categorical(y_train)))
test_tf_data = tf.data.Dataset.from_tensor_slices((dict(test_tokens), to_categorical(y_test)))

In [32]:
del(data)
del(train_tokens)
del(test_tokens)

In [33]:
train_tf_data=train_tf_data.prefetch(tf.data.AUTOTUNE)
test_tf_data=test_tf_data.prefetch(tf.data.AUTOTUNE)

In [34]:
for i in train_tf_data.take(1):
    print(i)

({'input_ids': <tf.Tensor: shape=(150,), dtype=int32, numpy=
array([    2,  1923, 24050,  2096, 13507,    18,   102, 13143,    64,
       10317,  3567,   374,    67,   147,  3433,   343,    13, 18105,
           3,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,  

In [35]:
from transformers import TFAlbertModel, AlbertConfig
from tensorflow.keras.layers import Input, Dense, Dropout, Average, BatchNormalization
import transformers

In [36]:
# loading the previously fine-tuned model instead of creating a new one and fine-tuning from scratch
previous_model = tf.keras.models.load_model('sentiment_classifier_hf_albert.h5', custom_objects={"TFAlbertModel": transformers.TFAlbertModel})
model_checkpoint = "albert-base-v2"
tokenizer = AlbertTokenizerFast.from_pretrained(model_checkpoint)
previous_model.compile(metrics=["categorical_accuracy"], loss="categorical_crossentropy")

In [37]:
# loading previously fine-tuned model to retrain it
model = tf.keras.models.load_model('sentiment_classifier_hf_albert.h5', custom_objects={"TFAlbertModel": transformers.TFAlbertModel})

In [38]:
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5), metrics=["categorical_accuracy"], loss="categorical_crossentropy")

In [39]:
model.fit(train_tf_data.shuffle(len(train_tf_data)).batch(BATCH_SIZE), validation_data=test_tf_data.shuffle(len(test_tf_data)).batch(BATCH_SIZE), 
          epochs=1)



<keras.callbacks.History at 0x7e089ce353c0>

#### Training accuracy of the retrained model in the 1st training step was 0.93 as opposed to 0.45 of the previously fine-tuned model

#### COMPARE PREVIOUS AND RETRAINED MODELS

In [40]:
def compare_models(previous_model, current_model, test_data, BATCH_SIZE):
    '''
    Compare previous model and retrained model performance on test data from recent batch
    '''
    previous_model_metric = previous_model.evaluate(test_data.batch(BATCH_SIZE), verbose=0)[1]
    current_model_metric = current_model.evaluate(test_data.batch(BATCH_SIZE), verbose=0)[1]
    print(f'Previous model metric: {previous_model_metric}\nCurrent model metric: {current_model_metric}')
    if previous_model_metric >= current_model_metric:
        print('Model performance degraded after retraining')
    else:
        print('Model performance improved after retraining')

In [41]:
compare_models(previous_model=previous_model, current_model=model, 
               test_data=test_tf_data, BATCH_SIZE=BATCH_SIZE)

Previous model metric: 0.9403940439224243
Current model metric: 0.9313931465148926
Model performance degraded after retraining


#### Since, we trained for only 1 epoch, the model isn't fully trained and the model performance degraded.