## Training (Fine-tuning) BERT on the dataset
This notebook extracts the data to be used for the paper 3 titled: 'Most popular Topics in Positive and Negative Sentiments in Amazon Movies and TV Reviews Dataset'
#### Author: Rishikesh Kakde

#### Import Required Libraries

In [1]:
# Import libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset as HFDataset

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Check if a GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Print the selected device
print(f"Using device: {device}")

Using device: cuda


#### Load the Labeled Training Dataset

In [3]:
# Load the labeled dataset
training_data_path = "training_dataset_labeled.csv"
df = pd.read_csv(training_data_path)

# Display the first few rows
print("Training Data Preview:")
print(df.head())

# Check class distribution
print("Sentiment Distribution:")
print(df['sentiment'].value_counts())

Training Data Preview:
   rating                                               text      timestamp  \
0     1.0  The Kids wanted it and watched.  I thought it ...  1453930439000   
1     3.0  My 7 year old grandson had me watch this.  It ...  1402404892000   
2     5.0  Very good movie and enjoyed it very much.  I l...  1398870334000   
3     5.0  Great series, loved it.  Very easy to get righ...  1397309345000   
4     5.0  This film has become my favorite movie. Denzel...  1566075686049   

  sentiment  
0  negative  
1   neutral  
2  positive  
3  positive  
4   neutral  
Sentiment Distribution:
sentiment
neutral     1277
positive     567
negative     156
Name: count, dtype: int64


#### Map Sentiments to Labels

In [4]:
# Map the sentiments to numeric labels for training
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['label'] = df['sentiment'].map(sentiment_mapping)

# Ensure the 'text' and 'label' columns are present
df = df[['text', 'label']].dropna()

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)

#### Preprocess Text Data Using BERT Tokenizer

In [5]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

#### Create a Dataset Class

In [6]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]).to(device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(device)
        return item

    
# Create the datasets
train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

#### Load Pre-Trained BERT Model for Fine-Tuning

In [7]:
# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Move the model to the GPU
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

#### Set Up Training Arguments and Trainer

In [8]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=8,  
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Create a data collator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # The pre-trained BERT model
    args=training_args,                  # Training arguments
    train_dataset=train_dataset,         # Training dataset
    eval_dataset=val_dataset,            # Validation dataset
    tokenizer=tokenizer,                 # Tokenizer
    data_collator=data_collator          # Data collator
)

#### Train the Model

In [10]:
# Fine-tune the model on the training dataset
trainer.train()

# Save the trained model
trainer.save_model("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")

Epoch,Training Loss,Validation Loss
1,0.7666,0.586901
2,0.29,0.492879
3,0.5313,0.474424


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

#### Load the Analysis Dataset and Perform Sentiment Classification

In [19]:
# ## Step 1: Load the Analysis Dataset
analysis_data_path = "analysis_dataset.csv"
analysis_df = pd.read_csv(analysis_data_path)

# ## Step 2: Set Up Batch Processing
batch_size = 32  # Process in batches of 32
analysis_texts = analysis_df['text'].tolist()
num_batches = len(analysis_texts) // batch_size + 1

# ## Step 3: Prepare Sentiment Mapping and Model for Prediction
# Reverse mapping for sentiment labels
reverse_sentiment_mapping = {v: k for k, v in sentiment_mapping.items()}

# Ensure the model is in evaluation mode
model.eval()

# ## Step 4: Predict Sentiments in Batches
predictions = []
for i in range(num_batches):
    # Extract batch texts
    batch_texts = analysis_texts[i * batch_size : (i + 1) * batch_size]
    
    # Tokenize and move to GPU
    encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=512, return_tensors="pt")
    encodings = {key: val.to(device) for key, val in encodings.items()}
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**encodings)
        batch_predictions = torch.argmax(outputs.logits, dim=1).cpu().tolist()
        predictions.extend(batch_predictions)

# ## Step 5: Map Predictions Back to Sentiment Labels
analysis_df['predicted_sentiment'] = [reverse_sentiment_mapping[pred] for pred in predictions]

# ## Step 6: Save Results to CSV
analysis_df.to_csv('analysis_dataset_with_sentiments.csv', index=False)
print("Analysis dataset with sentiments saved as 'analysis_dataset_with_sentiments.csv'.")


Analysis dataset with sentiments saved as 'analysis_dataset_with_sentiments.csv'.


In [20]:
analysis_df.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase',
       'predicted_sentiment'],
      dtype='object')

#### Comparison with EMPATH Predictions

In [12]:
# ## Step 1: Load the Analysis Dataset
# Load the analysis dataset
analysis_data_path = "analysis_dataset.csv"
analysis_df = pd.read_csv(analysis_data_path)

# Display the first few rows
print("Analysis Data Preview:")
print(analysis_df.head())


Analysis Data Preview:
   rating                                              title  \
0     5.0                                         Five Stars   
1     5.0                                         Five Stars   
2     3.0                       Some decent moments...but...   
3     4.0  Decent Depiction of Lower-Functioning Autism, ...   
4     5.0                                    What Love Is...   

                                                text images        asin  \
0           Amazon, please buy the show! I'm hooked!     []  B013488XFS   
1                         My Kiddos LOVE this show!!     []  B00CB6VTDS   
2  Annabella Sciorra did her character justice wi...     []  B096Z8Z3R6   
3  ...there should be more of a range of characte...     []  B09M14D9FZ   
4  ...isn't always how you expect it to be, but w...     []  B001H1SVZC   

  parent_asin                       user_id      timestamp  helpful_vote  \
0  B013488XFS  AGGZ357AO26RQZVRLGU4D4N52DZQ  1440385637000       

In [15]:
# ## Step 2: Classify Reviews Using EMPATH
from empath import Empath

# Initialize the Empath tool
lexicon = Empath()

# Function to classify sentiment based on the review text
def assign_sentiment_with_empath(text):
    if not isinstance(text, str) or len(text.strip()) == 0:
        return None  # Handle empty or missing text
    analysis = lexicon.analyze(text, categories=['positive_emotion', 'negative_emotion'])
    positive_score = analysis.get('positive_emotion', 0)
    negative_score = analysis.get('negative_emotion', 0)
    
    # Determine sentiment based on scores
    if positive_score > negative_score:
        return 'positive'
    elif negative_score > positive_score:
        return 'negative'
    else:
        return 'neutral'

# Apply the sentiment analysis function to the 'text' column
analysis_df['ground_truth_sentiment'] = analysis_df['text'].apply(assign_sentiment_with_empath)

# Drop rows where sentiment could not be assigned
analysis_df = analysis_df.dropna(subset=['ground_truth_sentiment'])

# Display the sentiment distribution
print("Sentiment Distribution (Empath Ground Truth):")
print(analysis_df['ground_truth_sentiment'].value_counts())


Sentiment Distribution (Empath Ground Truth):
ground_truth_sentiment
neutral     767
positive    520
negative    213
Name: count, dtype: int64


In [16]:
# ## Step 4: Compare Model Predictions to Ground Truth
# Ensure both ground truth and model predictions exist in the dataset
if 'predicted_sentiment' in analysis_df.columns:
    # Calculate the accuracy
    accuracy = (analysis_df['predicted_sentiment'] == analysis_df['ground_truth_sentiment']).mean()
    print(f"Model Accuracy Compared to EMPATH Ground Truth: {accuracy * 100:.2f}%")
    
    # Display a confusion matrix for detailed analysis
    from sklearn.metrics import confusion_matrix, classification_report

    # Generate the confusion matrix
    cm = confusion_matrix(analysis_df['ground_truth_sentiment'], analysis_df['predicted_sentiment'], labels=['positive', 'neutral', 'negative'])
    print("Confusion Matrix:")
    print(cm)

    # Generate a classification report
    report = classification_report(analysis_df['ground_truth_sentiment'], analysis_df['predicted_sentiment'], labels=['positive', 'neutral', 'negative'])
    print("Classification Report:")
    print(report)
else:
    print("Model predictions are not available. Run the BERT model to generate 'predicted_sentiment' column.")


Model predictions are not available. Run the BERT model to generate 'predicted_sentiment' column.
