#### Import the necessary libraries

In [1]:
import os   # Importing the os module to interact with the operating system and manage file paths and directories
import pandas as pd     # Importing pandas for data manipulation
from sklearn.model_selection import train_test_split    # Importing train_test_split from sklearn for splitting the data into training and validation sets

# Importing the BERT tokenizer to convert text reviews into token IDs suitable for BERT
# Importing AdamW as the optimizer for training BERT models, which includes weight decay for regularization
from transformers import BertTokenizer, AdamW

import torch        # Importing torch, the core library for deep learning in PyTorch
print(torch.__version__)    # Print the version of PyTorch being used for this project

# Importing necessary classes from PyTorch to handle data batching and sampling
# DataLoader: Used to load batches of data during training and evaluation
# TensorDataset: Converts input features and labels into tensors that can be processed by the model
# RandomSampler: Randomly samples data (typically for training) to introduce randomness and reduce overfitting
# SequentialSampler: Samples data sequentially (typically for evaluation) without shuffling
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler

  from .autonotebook import tqdm as notebook_tqdm


2.5.1+cu118


#### Check if CUDA is available

In [2]:
print(torch.cuda.is_available())  # Should return True if CUDA is available
print(torch.cuda.device_count())  # Should be > 0 if CUDA is enabled

True
1


#### Load train data


In [3]:
# Load the training data from a JSON file into a pandas DataFrame
train_df = pd.read_json('Data Samples/train.json')

# Print the shape of the DataFrame to display the number of rows and columns
print(train_df.shape)

# Display the first few rows of the DataFrame to inspect the contents and structure of the dataset
train_df.head()

(7401, 2)


Unnamed: 0,reviews,sentiments
0,I bought this belt for my daughter in-law for ...,1
1,The size was perfect and so was the color. It...,1
2,"Fits and feels good, esp. for doing a swim rac...",1
3,These socks are absolutely the best. I take pi...,1
4,Thank you so much for the speedy delivery they...,1


#### We first need to tokenize the reviews and prepare the data for BERT. 
##### BERT expects tokenized input in the form of input IDs, attention masks, and possibly token type IDs.

In [4]:
# Instantiate the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Preprocess the reviews
def encode_reviews(reviews, tokenizer, max_length=256):
    return tokenizer.batch_encode_plus(
        reviews, 
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        return_attention_mask=True,  # Return attention masks
        padding='max_length',  # Pad to max_length
        truncation=True,  # Truncate longer sequences
        max_length=max_length,  # Maximum length of sequences
        return_tensors='pt'  # Return PyTorch tensors
    )

#### Encode reviews and Prepare Labels

In [5]:
train_encodings = encode_reviews(train_df['reviews'].tolist(), tokenizer)

# Step 1: Prepare Labels
labels = torch.tensor(train_df['sentiments'].values)

#### Step 2: Split Data into Training and Validation Sets

In [6]:
# Step 2: Split Data into Training and Validation Sets
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    train_encodings['input_ids'], labels, test_size=0.1, random_state=42
)
train_masks, val_masks, _, _ = train_test_split(
    train_encodings['attention_mask'], labels, test_size=0.1, random_state=42
)

#### Step 3: Create DataLoaders for Efficient Batching

In [7]:
# Step 3: Create DataLoaders for Efficient Batching
batch_size = 16 # Set the batch size, which determines how many samples are processed in each batch

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

#### Import the DistilBERT model and set the number of labels

`https://huggingface.co/docs/transformers/model_doc/distilbert`

In [8]:
# Importing the DistilBERT model for sequence classification from the transformers library
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', 
    
    # Binary classification (0 for negative sentiment, 1 for positive sentiment)
    num_labels=2
)
model.to('cuda' if torch.cuda.is_available() else 'cpu')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


#### Step 5: Set Up Optimizer and Training Loop

##### Set the number of epochs according to the size of your dataset.
##### For example, 
##### - Small Datasets (Typically around 1,000 to 10,000 samples): 3-5
##### - Medium to Large Datasets (Typically around 10,000 to 500,000 samples): 2-3
##### - Very Large Datasets (Typically 500,000 samples or more): 1-2

In [9]:
# Step 5: Set Up Optimizer and Training Loop
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)    # AdamW is an optimization algorithm, widely used for training transformer-based models like BERT

# Training Loop
epochs = 3  # Number of training epochs, i.e., how many times the entire training dataset is passed through the model
            # Adjust this value to change the number of iterations the model will train for

# Training
for epoch in range(epochs):
    print(f"Starting epoch {epoch + 1}")
    
    # Set the model to training mode (enables features like dropout)
    model.train()

    # Variable to accumulate total training loss for this epoch
    total_train_loss = 0

    for batch in train_dataloader:

        # Move the input tensors (input IDs, attention masks, and labels) to the GPU (if available) or CPU
        b_input_ids, b_input_mask, b_labels = tuple(t.to('cuda' if torch.cuda.is_available() else 'cpu') for t in batch)

        # Zero out any previously accumulated gradients
        model.zero_grad()

        # Forward pass: Compute model outputs (predictions and loss)
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss     # Extract the loss value from the model's output
        total_train_loss += loss.item()     # Add the loss value to the running total
        
        # Backward pass: Compute the gradients of the loss with respect to the model parameters
        loss.backward()

        # Update the model parameters using the gradients calculated during the backward pass
        optimizer.step()

    # Calculate the average training loss for the epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average training loss for epoch {epoch + 1}: {avg_train_loss:.2f}")

    # Validation
    # Set the model to evaluation mode (disables dropout and other training-specific operations)
    model.eval()

    # Variable to accumulate total validation loss for this epoch
    total_val_loss = 0

    # Disable gradient calculations to save memory during the validation step
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to('cuda' if torch.cuda.is_available() else 'cpu') for t in batch)

            # Forward pass for validation: Compute model outputs and loss without computing gradients
            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    # Calculate the average validation loss for the epoch
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Validation loss for epoch {epoch + 1}: {avg_val_loss:.2f}")



Starting epoch 1
Average training loss for epoch 1: 0.21
Validation loss for epoch 1: 0.14
Starting epoch 2
Average training loss for epoch 2: 0.09
Validation loss for epoch 2: 0.15
Starting epoch 3
Average training loss for epoch 3: 0.04
Validation loss for epoch 3: 0.21


#### Save the model into the 'Models' folder

In [10]:
# Step 6: Create the "Models" folder if it doesn't exist
if not os.path.exists('Models'):
    os.makedirs('Models')
    print("The 'Models' folder has been created.")
else:
    print("The 'Models' folder already exists.")

# Step 7: Define the path for saving the model
model_save_path = os.path.join('Models', 'bert_sentiment_model.pt')

# Check if the model file already exists
if os.path.exists(model_save_path):
    print(f"The model file '{model_save_path}' already exists. It will be updated.")
else:
    print(f"The model file '{model_save_path}' does not exist and will be created.")

# Step 8: Save the model's state dictionary
torch.save(model.state_dict(), model_save_path)

# Optional: Save the tokenizer configuration if needed for reloading the model
tokenizer_save_path = os.path.join('Models', 'tokenizer')

# Check if the tokenizer directory already exists
if os.path.exists(tokenizer_save_path):
    print(f"The tokenizer configuration at '{tokenizer_save_path}' already exists. It will be updated.")
else:
    print(f"The tokenizer configuration at '{tokenizer_save_path}' does not exist and will be created.")
    
tokenizer.save_pretrained(tokenizer_save_path)

# Confirmation message
print(f"Model has been saved to {model_save_path}")
print(f"Tokenizer has been saved to {tokenizer_save_path}")

The 'Models' folder already exists.
The model file 'Models\bert_sentiment_model.pt' already exists. It will be updated.
The tokenizer configuration at 'Models\tokenizer' already exists. It will be updated.
Model has been saved to Models\bert_sentiment_model.pt
Tokenizer has been saved to Models\tokenizer


### Now we can apply this model onto our test data samples (test.json)

#### Load the Test Data

In [11]:
test_df = pd.read_json('Data Samples/test.json')
print(test_df.shape)
test_df.head()

(1851, 1)


Unnamed: 0,reviews
0,I bought 2 sleepers. sleeper had holes in the...
1,I dare say these are just about the sexiest th...
2,"everything about the transaction (price, deliv..."
3,"Not bad for just a shirt. Very durable, and m..."
4,These are truly wrinkle free and longer than t...


#### Import the necessary functions for model evaluation

In [12]:
# Importing the necessary functions for model evaluation
# classification_report provides a detailed report on precision, recall, and F1-score for each class
# accuracy_score calculates the overall accuracy of the model.
from sklearn.metrics import classification_report, accuracy_score
import torch

# Step 1: Preprocess Test Data
test_encodings = encode_reviews(test_df['reviews'].tolist(), tokenizer)

#### Prepare the Test DataLoader

In [13]:
# Step 3: Prepare the Test DataLoader
test_inputs = test_encodings['input_ids']
test_masks = test_encodings['attention_mask']
test_data = TensorDataset(test_inputs, test_masks)

# Use SequentialSampler for the test data (no shuffling needed)
test_dataloader = DataLoader(test_data, sampler=SequentialSampler(test_data), batch_size=batch_size)


#### Step 4: Make Predictions on the Test Set by running the model on the test data

In [14]:
# Step 4: Make Predictions on the Test Set
model.eval()
predictions = []

# Run the model on the test data
with torch.no_grad():
    for batch in test_dataloader:
        b_input_ids, b_input_mask = tuple(t.to('cuda' if torch.cuda.is_available() else 'cpu') for t in batch)

        # Forward pass
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs.logits

        # Get the predicted sentiment (0 or 1 for binary classification)
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)

# Step 5: Add the Predicted Sentiments to the DataFrame
test_df['sentiments'] = predictions

# View the predictions with the reviews
print(test_df.head())

                                             reviews  sentiments
0  I bought 2 sleepers.  sleeper had holes in the...           0
1  I dare say these are just about the sexiest th...           1
2  everything about the transaction (price, deliv...           1
3  Not bad for just a shirt.  Very durable, and m...           1
4  These are truly wrinkle free and longer than t...           1


#### Now that the predictions have been made, we can export this to a csv file for review analysis
#### Create the `submission.csv` file

In [15]:
# Step 6: Create the "Results" folder if it doesn't exist
if not os.path.exists('Results'):
    os.makedirs('Results')
    print("The 'Results' folder has been created.")
else:
    print("The 'Results' folder already exists.")

# Step 7: Save the test dataframe with predictions to 'submission.csv'
submission_file = os.path.join('Results', 'submission.csv')

# Check if the submission file already exists
if os.path.exists(submission_file):
    print("The file 'submission.csv' already exists. It will be updated with new data.")
else:
    print("The file 'submission.csv' does not exist and will be created.")
    
test_df.to_csv(submission_file, index=False)

# Confirm the file has been saved
print(f"Predictions have been saved to {submission_file}")


The 'Results' folder already exists.
The file 'submission.csv' already exists. It will be updated with new data.
Predictions have been saved to Results\submission.csv


#### If you would like to reload the saved model and tokenizer for inference or fine-tuning on another dataset, use the following code (Select All and Uncomment). 
#### This will load the model weights from the saved state dictionary and configure the tokenizer to match the settings used during training.

In [16]:
# # Step 1: Define paths to the model and tokenizer
# model_load_path = 'Models/bert_sentiment_model.pt'
# tokenizer_load_path = 'Models/tokenizer'

# # Step 2: Check if model and tokenizer paths exist, and load them if available
# if os.path.exists(model_load_path) and os.path.exists(tokenizer_load_path):
#     print("Found model and tokenizer files. Proceeding to load them.")
    
#     # Reload the tokenizer from the saved configuration
#     tokenizer = BertTokenizer.from_pretrained(tokenizer_load_path)
#     print("Tokenizer loaded successfully.")
    
#     # Step 3: Reload the model with the same architecture and load the saved state
#     model = DistilBertForSequenceClassification.from_pretrained(
#         'distilbert-base-uncased', 
#         num_labels=2
#     )
    
#     # Load the state dictionary into the model
#     model.load_state_dict(torch.load(model_load_path))
#     print("Model loaded successfully.")
    
#     # Step 4: Send the model to GPU if available
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     model.to(device)
    
#     # Set the model to evaluation mode (disables dropout layers)
#     model.eval()
    
#     print("Model and tokenizer successfully loaded for inference.")
# else:
#     if not os.path.exists(model_load_path):
#         print(f"Model file '{model_load_path}' not found.")
#     if not os.path.exists(tokenizer_load_path):
#         print(f"Tokenizer configuration '{tokenizer_load_path}' not found.")
#     print("Please ensure both model and tokenizer files are available in the 'Models' folder before running this code.")