### Installing and Importing of Packages

In [1]:
!pip install -q transformers
!pip install tensorflow



In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, pipeline

### Reading and Wrangling of Data

In [3]:
data = pd.read_json('train.json')
data.head()

Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7401 entries, 0 to 7400
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviews     7401 non-null   object
 1   sentiments  7401 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 115.8+ KB


In [6]:
data['reviews'] = data['reviews'].astype('string')

### Train Test Split

In [7]:
# First split: Separate out the training data
train_data, temp_data = train_test_split(data, test_size=0.3, stratify=data['sentiments'])

# Second split: Divide the remaining data into validation and test sets
val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['sentiments'])

### Pre-training Performance (Before Fine-tuning)

In [8]:
# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize and truncate the reviews
tokenized_data = tokenizer(test_data['reviews'].tolist(), truncation=True, padding='max_length', max_length=512, return_tensors="tf")
truncated_reviews = tokenizer.batch_decode(tokenized_data["input_ids"].numpy(), skip_special_tokens=True)

# Predict the sentiments
sentiment_pipeline = pipeline("sentiment-analysis", framework="tf")
results = sentiment_pipeline(truncated_reviews)

print(results)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
All PyTorch model weights were used when initializing TFDistilBertForSequenceClassification.

All the weights of TFDistilBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForSequenceClassification for predictions without further training.


[{'label': 'POSITIVE', 'score': 0.9988687634468079}, {'label': 'POSITIVE', 'score': 0.9995132684707642}, {'label': 'POSITIVE', 'score': 0.9997289776802063}, {'label': 'POSITIVE', 'score': 0.9997246861457825}, {'label': 'POSITIVE', 'score': 0.9995505213737488}, {'label': 'NEGATIVE', 'score': 0.9969690442085266}, {'label': 'NEGATIVE', 'score': 0.9993168115615845}, {'label': 'NEGATIVE', 'score': 0.9973822236061096}, {'label': 'POSITIVE', 'score': 0.9995887875556946}, {'label': 'NEGATIVE', 'score': 0.8857846260070801}, {'label': 'POSITIVE', 'score': 0.998029887676239}, {'label': 'POSITIVE', 'score': 0.9871737957000732}, {'label': 'POSITIVE', 'score': 0.9996613264083862}, {'label': 'POSITIVE', 'score': 0.999056875705719}, {'label': 'POSITIVE', 'score': 0.999539852142334}, {'label': 'POSITIVE', 'score': 0.9996011853218079}, {'label': 'NEGATIVE', 'score': 0.9994627833366394}, {'label': 'POSITIVE', 'score': 0.9909000396728516}, {'label': 'POSITIVE', 'score': 0.9989479184150696}, {'label': 'POS

In [9]:
true_labels = test_data['sentiments'].tolist()
probabilities = [[1 - r['score'], r['score']] if r['label'] == 'POSITIVE' else [r['score'], 1 - r['score']] for r in results]

predicted_labels = tf.argmax(probabilities, axis=-1).numpy()
positive_class_probabilities = [r['score'] if r['label'] == 'POSITIVE' else 1 - r['score'] for r in results]

# Cross-entropy loss
loss = tf.keras.losses.SparseCategoricalCrossentropy()
print("Loss:", loss(true_labels, probabilities).numpy())

# Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

# ROC AUC
roc_auc = roc_auc_score(true_labels, positive_class_probabilities)
print("ROC AUC:", roc_auc)

# Confusion Matrix
cm = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:\n", cm)

Loss: 0.50376594
Accuracy: 0.8694869486948695
ROC AUC: 0.9536253268100747
Confusion Matrix:
 [[147  16]
 [129 819]]


### Training of Model (Fine-tuning)

In [10]:
train_texts = train_data['reviews'].tolist()
train_labels = train_data['sentiments'].tolist()

val_texts = val_data['reviews'].tolist()
val_labels = val_data['sentiments'].tolist()

# Initialize the model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Tokenize the reviews
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

# Convert to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

# Slice datasets into batches
train_dataset = train_dataset.shuffle(len(train_texts), seed=1).batch(16).repeat()
val_dataset = val_dataset.batch(64)

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
earlystop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=3, restore_best_weights=True)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Train the model
model.fit(train_dataset, epochs=20, callbacks=[earlystop], validation_data=val_dataset, steps_per_epoch=324)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/20
 18/324 [>.............................] - ETA: 1:29:57 - loss: 0.4531 - accuracy: 0.8160

KeyboardInterrupt: 

### Post-training Performance (After Fine-tuning)

In [None]:
input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']
batch_size = 32
predictions = []

# Predict sentiments in batches to avoid RAM overload
for i in range(0, len(input_ids), batch_size):
    batch_input_ids = input_ids[i:i + batch_size]
    batch_attention_mask = attention_mask[i:i + batch_size]
    batch_predictions = model.predict({'input_ids': batch_input_ids, 'attention_mask': batch_attention_mask})
    batch_predictions = batch_predictions['logits']
    predictions.extend(batch_predictions)



In [None]:
predictions = np.array(predictions, dtype=np.float32)
probabilities = tf.nn.softmax(predictions, axis=-1)

predicted_labels = tf.argmax(probabilities, axis=-1).numpy()
positive_class_probabilities = probabilities[:, 1]

# Cross-entropy loss
loss = tf.keras.losses.SparseCategoricalCrossentropy()
print("Loss:", loss(true_labels, probabilities).numpy())

# Accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print("Accuracy:", accuracy)

# ROC AUC
roc_auc = roc_auc_score(true_labels, positive_class_probabilities)
print("ROC AUC:", roc_auc)

# Confusion Matrix
cm = confusion_matrix(true_labels, predicted_labels)
print("Confusion Matrix:\n", cm)

Loss: 0.15977573
Accuracy: 0.9387938793879388
ROC AUC: 0.9627824803913956
Confusion Matrix:
 [[112  51]
 [ 17 931]]


### Prediction of Sentiments in "test.json"

In [None]:
data = pd.read_json('test.json')
data['reviews'] = data['reviews'].astype('string')

# Tokenize the reviews
tokenized_data = tokenizer(data['reviews'].tolist(), truncation=True, padding='max_length', max_length=512, return_tensors="tf")

input_ids = tokenized_data['input_ids']
attention_mask = tokenized_data['attention_mask']
batch_size = 32
predictions = []

# Predict sentiments in batches to avoid RAM overload
for i in range(0, len(input_ids), batch_size):
    batch_input_ids = input_ids[i:i + batch_size]
    batch_attention_mask = attention_mask[i:i + batch_size]
    batch_predictions = model.predict({'input_ids': batch_input_ids, 'attention_mask': batch_attention_mask})
    batch_predictions = batch_predictions['logits']
    predictions.extend(batch_predictions)

predictions = np.array(predictions, dtype=np.float32)
probabilities = tf.nn.softmax(predictions, axis=-1)

predicted_labels = tf.argmax(probabilities, axis=-1).numpy()

# Print the predicted labels
print(predicted_labels)

[0 1 1 ... 1 1 1]


### Export the Results to "submission.csv"

In [None]:
data['predicted_sentiments'] = predicted_labels
submission_df = data[['reviews', 'predicted_sentiments']]
submission_df.to_csv('submission.csv', index=False)