# Sentiment Analysis With DistilBert

## Import libraries

In [1]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from wordcloud import WordCloud, STOPWORDS

import nltk
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

import tensorflow.compat.v1 as tf
from tensorflow.keras.callbacks import TensorBoard
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers.schedules import PolynomialDecay

from datasets import load_dataset
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!





# load Data

In [3]:
import numpy as np
from datasets import load_dataset

# Load a subset of the 'amazon_polarity'
amazon_train = load_dataset('amazon_polarity', split='train[:20000]')
amazon_test = load_dataset('amazon_polarity', split='test[:2000]')

print("Train Dataset : ", amazon_train.shape)
print("Test Dataset : ", amazon_test.shape)

Downloading readme:   0%|          | 0.00/6.81k [00:00<?, ?B/s]

Train Dataset :  (20000, 3)
Test Dataset :  (2000, 3)


# Load Fine-Tuned DisilBERT Model


In [4]:
# Initialize BERT tokenizer and model
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = TFDistilBertForSequenceClassification.from_pretrained('../Code/fine_tuned_distilbert', num_labels=2)




Some layers from the model checkpoint at ../Code/fine_tuned_distilbert were not used when initializing TFDistilBertForSequenceClassification: ['dropout_79']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at ../Code/fine_tuned_distilbert and are newly initialized: ['dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0 (unused)
                                                                 
Total params: 66955010 (255.41 MB)
Trainable params: 66955010 (255.41 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Preprocess the data
def preprocess_data(data):
    inputs = tokenizer(data['content'], truncation=True)
    return inputs

In [8]:
# Tokenize text
tokenized_datasets = amazon_train.map(preprocess_data, batched=True)
tokenized_test_datasets = amazon_test.map(preprocess_data, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Create datasets
tf_train_dataset = tokenized_datasets.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'label'],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)


tf_validation_dataset = tokenized_test_datasets.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'label'],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [9]:
# Make prediction
class_preds = np.argmax(model.predict(tf_validation_dataset)["logits"], axis=1)



In [10]:
!pip install datasets evaluate transformers[sentencepiece] -q


[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
from datasets import load_metric

# Define the metric you want to use (e.g., accuracy)
metric_name = "accuracy"
metric = load_metric("accuracy")

# Define the true labels for the validation set
true_labels = tokenized_test_datasets["label"]

# Evaluate the model predictions
results = metric.compute(predictions=class_preds, references=true_labels)

# Print the results
print(f"{metric_name}: {results['accuracy']}")

In [15]:
# Create lists to store positive and negative sentences
positive_sentences = [amazon_test['content'][i] for i, pred in enumerate(class_preds) if pred == 1]
negative_sentences = [amazon_test['content'][i] for i, pred in enumerate(class_preds) if pred == 0]

In [16]:
# Print some examples
print("Positive Reviews:")
for i in range(5):  # Print first 5 positive sentences
    print(positive_sentences[i])
    print("\n")

Positive Reviews:
I love love love the squeem!!! The before and after pictures are definitely real!!! You can immediately tell the difference when you put it on it definitely improves your posture, you eat much less since it squeezes the he'll outta you but its not an uncomfortable squeeze.


We visited the Dingle peninsula of Ireland this fall and had to see this movie that was filmed there. Turns out that, in addition to having great scenes of the Dingle countryside, it also has a very good story line. And, terrific acting and some Irish history thrown in for good measure.


Quite often, bringing a classic story like "Moby Dick" to the screen is a thankless task. But at least John Huston's 1956 version has a lot of class as well any number of great performances (even Moby Dick looked a lot more realistic in 1956!). This version just doesn't make it - skip it and buy a copy of Gregory Peck's rendition instead. You'll be glad you did.


I've studied philosophy to a good degree and ther

In [17]:
# Print some examples
print("Negative Reviews:")
for i in range(5):  # Print first 5 positive sentences
    print(negative_sentences[i])
    print("\n")

Negative Reviews:
My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"


Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too many of those kinds of songs in my other video game soundtracks. I must admit that one of the so

In [18]:
def predict_sentiment(input_text):
    # Preprocess input text
    inputs = tokenizer(input_text, truncation=True, padding=True, return_tensors='tf')

    # Get model prediction
    logits = model(inputs)["logits"]
    predicted_class = np.argmax(logits, axis=1)[0]

    # Determine sentiment label
    sentiment_label = "positive" if predicted_class == 1 else "negative"

    return sentiment_label

In [27]:

# Postive test sample
input_text = "I'm very disappointed with this product. It constantly malfunctions and doesn't live up to its advertised capabilities. I regret buying it."
predicted_sentiment = predict_sentiment(input_text)
print(f"The sentiment is {predicted_sentiment}")

The sentiment is negative


In [28]:

# Negative test sample
input_text = "I'm very disappointed with this product. It constantly malfunctions and doesn't live up to its advertised capabilities. I regret buying it."
predicted_sentiment = predict_sentiment(input_text)
print(f"The sentiment is {predicted_sentiment}")

The sentiment is negative
