# **Importing required modules**
* pip install pandas
* pip install torch
* pip install scikit-learn
* pip install transformers
* pip install textblob
* pip install matplotlib

In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from textblob import TextBlob 

### **Importing file**
* Change the path passed to `read_json` to the file's location on your machine

In [None]:
# pd.read_json already yields a DataFrame, so the result is used directly
training_data = pd.read_json('data_files/train.json')

# **Cleaning Training Data**

### **Checking that there are no missing (null) values in the 'reviews' or 'sentiments' columns**
* Output shows that there are no None values

In [None]:
# Count the missing values in each column (isna is the same check as isnull)
training_data.isna().sum()

### **Checking that there are no duplicate data in the training dataset**
* Output shows there are 216 instances of duplicate data

In [None]:
# Count rows that repeat an earlier row (the first occurrence is not counted)
training_data.duplicated(keep='first').sum()

### **Removing duplicate data from training dataset**

In [None]:
# Keep the first occurrence of every row and discard later repeats
training_data = training_data.drop_duplicates(keep='first')

### **Checking and correcting simple spelling mistakes**
* e.g. (Reccomend -> Recommend)
* **WARNING: TAKES VERY LONG TO RUN (30 mins++)** 

In [None]:
def corrected_spelling(text):
    """Return `text` as a string after TextBlob's spelling correction."""
    blob = TextBlob(text)
    corrected_blob = blob.correct()
    return str(corrected_blob)

# Spell-correct every review, then write the result to disk so that this
# slow step does not have to be repeated
corrected_reviews = training_data['reviews'].apply(corrected_spelling)
training_data['reviews'] = corrected_reviews
training_data.to_json(path_or_buf='data_files/cleaned_train.json', orient='records', lines=True)  # Save point

### **Start here if you do not want to re-preprocess data**
* Loads saved pre-processed data

In [None]:
# Reload the pre-processed reviews (one JSON record per line); read_json
# already yields a DataFrame
training_data = pd.read_json('data_files/cleaned_train.json', lines=True)

### **Checking for class imbalance**
* Training data has 85% positive reviews
* Class imbalance - impact can be reduced via class weights in training phase

In [None]:
# Number of reviews per sentiment label
sentiment_counts = training_data.sentiments.value_counts()
print(sentiment_counts)

# Percentage of positive reviews (label 1), derived from the data itself
# rather than from hard-coded counts, so it stays correct if the dataset changes
positive_share = sentiment_counts.get(1, 0) / sentiment_counts.sum() * 100
print(positive_share, "%")

## **Splitting Training Dataset into a training dataset and testing dataset for binary sentiment classification**
* Random Seed = 42
* Train = 75%, Test = 25% is used but Train = 80%, Test = 20% is another possibility

In [None]:
# Stratify on the label so that the train and test splits keep the same
# positive/negative ratio as the full (imbalanced) dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(
    training_data['reviews'],
    training_data['sentiments'],
    test_size=0.25,
    random_state=42,
    stratify=training_data['sentiments'],
)

## **Importing pretrained bert model and tokenizer**

In [None]:
# The tokenizer and the 2-label classifier are both loaded from the same checkpoint name
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForSequenceClassification.from_pretrained(pretrained_name, num_labels=2)

### **Convert the cleaned data into tensors that the training process can consume**

In [None]:
# Tokenizer settings shared by both splits (truncation and padding enabled,
# max_length of 128)
encode_options = dict(truncation=True, padding=True, max_length=128)

# Tokenize the reviews of each split into BERT's input format
train_encodings = tokenizer(train_texts.tolist(), **encode_options)
test_encodings = tokenizer(test_texts.tolist(), **encode_options)

# Turn the label Series into torch tensors
train_labels = torch.tensor(train_labels.tolist())
test_labels = torch.tensor(test_labels.tolist())

# Create a dataset class to load the data into PyTorch
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is a mapping from field name to a per-sample sequence and
    `labels` is a sized, indexable collection aligned with it.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is the number of labels
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one sample: every encoding field converted to a tensor,
        # plus the matching label under the 'labels' key
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a dataset object
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels)

## **Training Process & parameters**
* **WARNING: THIS TAKES 1-2 HRS ON MY PC**

In [None]:
from transformers import TrainerCallback
import matplotlib.pyplot as plt
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import logging


# Route log records (message text only) to training.log
logging.basicConfig(format='%(message)s', level=logging.INFO, filename='training.log')

# Ask PyTorch to release any cached, unused GPU memory before training starts
torch.cuda.empty_cache()

training_args = TrainingArguments(
    # --- where outputs are written ---
    output_dir='./results',          # model checkpoints
    logging_dir='./logs',            # log files
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.1,
    # NOTE(review): with roughly 7.2k rows, a 75% training split and batch
    # size 32, three epochs is about 500 optimizer steps on a single device,
    # so a 500-step warmup would cover almost the whole run - confirm intended.
    warmup_steps=500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # --- logging / evaluation cadence ---
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
)


class LossLoggerCallback(TrainerCallback):
    """Trainer callback that records training and evaluation losses.

    Attributes:
        train_losses: training-loss values, appended each time the Trainer
            emits a log record that contains a 'loss' entry.
        eval_losses: evaluation-loss values, appended after each evaluation
            whose metrics contain 'eval_loss'.
    """

    def __init__(self):
        super().__init__()
        self.train_losses = []
        self.eval_losses = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record the training loss from a log record, if present."""
        # Evaluation records carry 'eval_loss' instead of 'loss', so they are skipped here
        if logs and 'loss' in logs:
            self.train_losses.append(logs['loss'])

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Record the evaluation loss from the evaluation metrics, if present."""
        if metrics and 'eval_loss' in metrics:
            self.eval_losses.append(metrics['eval_loss'])
            print(f"Evaluation loss appended: {metrics['eval_loss']}")  #debug

# Keep the callback in a variable so its recorded losses can be read after training
loss_logger = LossLoggerCallback()
trainer_callbacks = [loss_logger]

# NOTE(review): the held-out test split serves as eval_dataset here, is used
# for best-checkpoint selection (load_best_model_at_end=True) and is later
# used for the final metrics, which may make those metrics optimistic.
trainer = Trainer(
    args=training_args,              # training arguments defined above
    model=model,                     # the pre-trained BERT model
    train_dataset=train_dataset,     # training dataset
    eval_dataset=test_dataset,       # evaluation dataset
    callbacks=trainer_callbacks,
)

# Run the fine-tuning loop
trainer.train()

## **Performing analysis on test set & calculating Accuracy, Precision, Recall, F1-Score for further fine-tuning**

In [None]:
#display training and validation graphs

# Read both curves from the Trainer's log history together with the step at
# which each value was recorded, instead of rebuilding the x-axis from an
# assumed fixed 100-step interval.
train_steps, train_losses = [], []
eval_steps, eval_losses = [], []

for log in trainer.state.log_history:
    if 'loss' in log:
        train_steps.append(log['step'])
        train_losses.append(log['loss'])
    if 'eval_loss' in log:
        eval_steps.append(log['step'])
        eval_losses.append(log['eval_loss'])

plt.figure(figsize=(10, 6))

#plot training loss against the steps at which it was logged
plt.plot(train_steps, train_losses, label='Training Loss', color='blue')

#plot evaluation loss against the steps at which it was evaluated
plt.plot(eval_steps, eval_losses, label='Evaluation Loss', color='orange')

#labels, title & legend
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training and Evaluation Loss Comparison')

plt.legend()

#show plot
plt.tight_layout()
plt.show()



In [None]:
#run the trained model over the test dataset
predictions = trainer.predict(test_dataset)

#pick the index of the highest-scoring class for every sample
logits = torch.from_numpy(predictions.predictions)
preds = logits.argmax(dim=-1)

#accuracy plus precision/recall/F1 (binary averaging)
accuracy = accuracy_score(test_labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(
    test_labels,
    preds,
    average='binary',
)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

# **Save model and tokenizer**
* **DO NOT RUN if you have not trained the model above** 

In [None]:
# Write the model first, then the tokenizer, into the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained('./saved_model')

## **Load model and tokenizer**
* **Start here to skip data pre-processing & model training** 

In [None]:
# Restore the tokenizer and the classifier from the saved directory
saved_model_dir = './saved_model'
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)
model = BertForSequenceClassification.from_pretrained(saved_model_dir)

# **Import test.json for prediction**

In [None]:
# pd.read_json already yields a DataFrame, so the result is used directly
prediction_data = pd.read_json('data_files/test.json')

# **Performing prediction and storing to new data frame**

In [None]:
def predict_sentiment(review):
    """Predict the sentiment of a single review with the loaded BERT model.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        review: the review text to classify.

    Returns:
        int: 0 for negative, 1 for positive.
    """
    # Tokenize the review and place the tensors on the same device as the
    # model; otherwise inference fails when the model sits on the GPU
    # (e.g. directly after training).
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = inputs.to(model.device)

    # Inference only: switch off dropout and skip gradient bookkeeping
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Return Sentiment Value (0 for negative, 1 for positive)
    return torch.argmax(probs).item()

# Predict a sentiment for every review. The review text is read from the
# first column by position; .iloc[:, 0] avoids the deprecated positional
# lookup `row[0]` on a label-indexed row and the per-row .iloc overhead.
prediction_list = [predict_sentiment(review) for review in prediction_data.iloc[:, 0]]

# Add the predicted sentiments as a new column of a new dataframe
finalized_prediction = prediction_data.assign(sentiment=prediction_list)

# **Store the finalized prediction in CSV File**

In [None]:
# Write the dataframe with the predictions to a CSV file
submission_path = 'submission.csv'
finalized_prediction.to_csv(path_or_buf=submission_path)