# Training an IMDB sentiment classifier with Hugging Face's pretrained DistilBERT

With examples from: 

- https://huggingface.co/docs/transformers/tasks/sequence_classification
- https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator
- https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca
- https://www.kaggle.com/code/satyampd/imdb-sentiment-analysis-using-bert-w-huggingface/notebook


In [None]:
import pandas

# data from https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

# Read the Kaggle CSV into a DataFrame.
imdb_data = pandas.read_csv(
    filepath_or_buffer='../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv',
)
# Trailing expression: the notebook renders the first rows as a preview.
imdb_data.head()

In [None]:
import pandas as pd

def convert_label(data, label_col):
    """Return a copy of ``data`` with ``label_col`` encoded as integers.

    Cells equal to the string ``'positive'`` become 1; every other value
    becomes 0. The input frame is left untouched, so re-running the notebook
    cell does not re-encode already-converted labels into all zeros.

    Args:
        data: DataFrame that contains the label column.
        label_col: Name of the column to encode.

    Returns:
        A new DataFrame equal to ``data`` except that ``label_col`` holds
        int64 values 0/1.
    """
    converted = data.copy()
    # Vectorised comparison instead of a per-row lambda; the explicit dtype
    # keeps the labels int64 regardless of platform.
    converted[label_col] = (converted[label_col] == 'positive').astype('int64')

    return converted
# Raise the display limit to 1000 rows so the 120-row sample below is not truncated.
pd.set_option('display.max_rows', 1000)
# Encode the 'sentiment' column of the loaded reviews as 1 / 0.
data_preproc = convert_label(data=imdb_data, label_col='sentiment')
# Trailing expression: a random 120-row preview of the converted frame.
data_preproc.sample(n=120)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for the "distilbert-base-uncased" checkpoint; it is used
# both for building the training data and for the prediction cells.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def convert_tokenize(dataset):
    """Tokenize every entry of ``dataset['review']`` with the module-level ``tokenizer``.

    Each review is encoded on its own, with truncation enabled and
    ``'max_length'`` padding at ``max_length=128``.

    Args:
        dataset: Frame-like object with a ``'review'`` column of texts.

    Returns:
        The result of applying the tokenizer to each review: one tokenizer
        output per row, in the original row order.
    """
    def encode(text):
        return tokenizer(text, truncation=True, padding='max_length', max_length=128)

    reviews = dataset['review']
    return reviews.apply(encode)

# Only the first 1000 reviews are tokenized.
first_reviews = data_preproc[:1000]
data_tokenized = convert_tokenize(first_reviews)
# Trailing expression: show one randomly chosen tokenized review.
data_tokenized.sample()

In [None]:
from sklearn.model_selection import train_test_split

# Shuffle the tokenized reviews and hold out 33% of them for validation.
train_tok, val_tok = train_test_split(
    data_tokenized,
    shuffle=True,
    test_size=0.33,
)

In [None]:
import tensorflow as tf
import pandas as pd
from tqdm import tqdm 

def convert_to_tfdata(data):
    """Wrap tokenized reviews and their labels in a ``tf.data.Dataset``.

    Args:
        data: pandas Series whose values expose ``'input_ids'`` and
            ``'attention_mask'`` entries. Its index labels must also exist in
            the module-level ``data_preproc`` frame, which supplies the
            ``'sentiment'`` label for each sample.

    Returns:
        An unbatched dataset yielding ``(features, label)`` pairs, where
        ``features`` is a dict with int32 ``'input_ids'`` and
        ``'attention_mask'`` vectors and ``label`` is an int64 scalar.
    """
    def gen():
        # Fetch the label column once per pass instead of materialising a
        # whole row of data_preproc for every sample.
        sentiments = data_preproc['sentiment']
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0 and would raise AttributeError there.
        for i, d in data.items():
            yield ({
                'input_ids': d['input_ids'],
                'attention_mask': d['attention_mask'],
            }, sentiments.loc[i])

    return tf.data.Dataset.from_generator(
            gen,
            ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.int64),
            ({'input_ids': tf.TensorShape([None]), 'attention_mask': tf.TensorShape([None])}, tf.TensorShape([])),
    )

# Build the training and validation datasets the same way: wrap the split in a
# tf.data.Dataset, then group samples into batches of 32.
train, val = (
    convert_to_tfdata(split).batch(32)
    for split in (train_tok, val_tok)
)
    

In [None]:
from transformers import TFAutoModelForSequenceClassification

# Load the "distilbert-base-uncased" checkpoint with a two-label classification head.
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
# The loss is configured with from_logits=True, i.e. it expects unnormalised
# scores from the model and integer class ids as targets.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-8, clipnorm=1.0),
)


In [None]:
# Print the Keras summary of the compiled model.
model.summary()

In [None]:
# Show the dataset descriptions of both splits, one per line.
print(train, val, sep='\n')

In [None]:
# Fine-tune for five epochs, evaluating on the validation split along the way;
# the returned history object is plotted in a later cell.
history = model.fit(
    train,
    validation_data=val,
    epochs=5,
)

In [None]:
from matplotlib import pyplot as plt

# Raw per-epoch metrics recorded by model.fit.
metrics_by_epoch = history.history
print(metrics_by_epoch)

# Training curve first, validation curve second, matching the legend order below.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(metrics_by_epoch[curve])

plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()

In [None]:
# Persist the fine-tuned model under the path 'training-100.tf'.
model.save('training-100.tf')

In [None]:
#pred_sentences = imdb_data.loc[:5, 'review'].values

#print(pred_sentences, type(pred_sentences))

# Hand-picked texts (book and movie reviews) used to probe the fine-tuned model.
pred_sentences = [
    "I have read all of Tom Holland's non-fiction except for 'Dynasty', and have liked and learned something from it all. Not so here: unlike what's written on the cover, this work is not much about the making of the Western mind - at least not in the way that books like 'The Unintended Reformation', 'A Secular Age', and others are. It is a decent, but not first-rate, social history of Christianity in one volume from an agnostic-atheist standpoint, with space constraints leading to superficial coverage and a questionable selection of events to cover vignette-style.",
    'Movie is ok. I enjoyed it in the theatre. Very entertaining story plot and acting. The performance from Robert Downey Jr is just superb.',
    'The movie is ok I guess. Something nice to go with popcorn and stuff.',
    "The central thesis of this book devolves into the Goodness Gracious Me sketch about the Indian father. Everyone was Christian! Diderot? Christian! Voltaire? Christian! Karl Marx? Christian! Charlie Hebdo? Christian! (The Nazis were not Christian, but) Harvey Weinstein (yes, really)? Christian! The Women's March? Christian! It is almost always asserted rather than argued for properly, and it makes the last third of the book very dull indeed.",
    " Tom Holland has a great easy to read writing style, but the argument in this book was ... not his finest. The first half that explores the origins of Christian ideas and the growth / making of the Catholic Church is very interesting. Well researched, well told. I found the connections between Persian religious tradition and early Christianity especially compelling. ",
    " It's not the first book on Systems Theory I've read, but even if this one is described as a primer, it was not time wasted (definitely).",
]

# Encode all sentences as one TensorFlow batch, truncated to at most 128 tokens.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = model(tf_batch)
# The first output element is treated as per-class scores and normalised into
# probabilities along the last axis.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
print(tf_predictions)

# Class id 0 maps to 'Negative' and 1 to 'Positive'.
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_id in zip(pred_sentences, label):
    print(sentence, ": ", labels[class_id])

In [None]:
# Short movie-review style sentences, ranging from clearly positive to lukewarm/negative.
pred_sentences = [
    'Movie is ok. I enjoyed it in the theatre. Very entertaining story plot and acting. The performance from Robert Downey Jr is just superb.',
    'The movie is ok I guess. Something nice to go with popcorn and stuff.',
    'Was expecting something better. Passed out sleeping half way of the movie',
    'It really could have been better. I feel the movie was a good material wasted. Overall, so, so.',
]

# Encode all sentences as one TensorFlow batch, truncated to at most 128 tokens.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = model(tf_batch)
# The first output element is treated as per-class scores and normalised into
# probabilities along the last axis.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
print(tf_predictions)

# Class id 0 maps to 'Negative' and 1 to 'Positive'.
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_id in zip(pred_sentences, label):
    print(sentence, ": ", labels[class_id])