##### **Mount G-drive folder and access files**

In [None]:
from google.colab import drive
drive.mount('/content/gdrive',force_remount=True)
!ls '/content/gdrive/My Drive/Text - A3'

Mounted at /content/gdrive
data  results  word2vec_model.bin
time: 6.54 s (started: 2025-01-03 15:33:52 +00:00)


In [None]:
# Project locations on the mounted Drive: the project root (bound to
# model_folder_path) and its `data` subfolder, plus the two CSV file names.
model_folder_path = '/content/gdrive/My Drive/Text - A3/'
folder_path = model_folder_path + 'data'
name_of_train_data, name_of_test_data = 'train.csv', 'test.csv'

time: 625 µs (started: 2025-01-03 15:33:59 +00:00)


In [None]:
# Full paths to the train/test CSVs: data folder and file name joined with '/'.
path_to_train_data = '/'.join((folder_path, name_of_train_data))
path_to_test_data = '/'.join((folder_path, name_of_test_data))

time: 586 µs (started: 2025-01-03 15:33:59 +00:00)


##### **Installing dependencies**

In [None]:
!pip install ipython-autotime gdown

time: 4.95 s (started: 2025-01-03 15:33:59 +00:00)


##### **Importing dependencies**

In [None]:
%load_ext autotime
import pandas as pd
import numpy as np
import nltk
import os
import zipfile
import tarfile
import re
import gdown
import gzip
import shutil

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# from nltk.corpus import stopwords
# from nltk import word_tokenize
# from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# from google.colab import files
# from scipy.sparse import hstack
# from gensim.models import Word2Vec

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 2.93 ms (started: 2025-01-03 15:34:04 +00:00)


##### **Baseline DistilBERT - no preliminary cleaning**

In [None]:
# Baseline: classify the raw (uncleaned) test reviews with a DistilBERT SST-2
# sentiment pipeline, print binary classification metrics, and export them to
# an Excel workbook that is then downloaded from the Colab runtime.
from google.colab import files  # used at the end of the cell to download the workbook

test_df = pd.read_csv(path_to_test_data)

classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

# Inputs are truncated to at most 512 tokens instead of raising on long reviews.
predictions = classifier(list(test_df["review"]), truncation=True, max_length=512)

# One mapping serves both sides: pipeline labels are lower-cased before lookup,
# and the same dict is applied to the `sentiment` column of the test data.
label_mapping = {'positive': 1, 'negative': 0}
predicted_labels = [label_mapping[p["label"].lower()] for p in predictions]

test_df["sentiment_binary"] = test_df["sentiment"].map(label_mapping)

# Series.map yields NaN for any value that is not a key of the mapping; stop
# here with the offending values rather than failing later inside sklearn.
unmapped_labels = test_df.loc[test_df["sentiment_binary"].isna(), "sentiment"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'sentiment' column (expected {list(label_mapping)}): {list(unmapped_labels)}"
    )

y_true = test_df["sentiment_binary"]
class_names = ['negative', 'positive']  # index order matches the 0/1 encoding above

accuracy = accuracy_score(y_true, predicted_labels)
precision = precision_score(y_true, predicted_labels)
recall = recall_score(y_true, predicted_labels)
f1 = f1_score(y_true, predicted_labels)

print("\nBaseline Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_true, predicted_labels, target_names=class_names))

# Single-row table of the headline metrics for the 'Metrics' sheet.
results = {
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1],
}

results_df = pd.DataFrame(results)

# Same report as printed above, as a dict so it can be written out as a table.
report_dict = classification_report(y_true, predicted_labels, target_names=class_names, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# The path is defined once so the file written and the file downloaded cannot drift apart.
output_path = '/content/3_Baseline_DistilBERT_nocleaning.xlsx'
with pd.ExcelWriter(output_path) as writer:
    results_df.to_excel(writer, sheet_name='Metrics', index=False)
    report_df.to_excel(writer, sheet_name='Classification Report', index=True)

files.download(output_path)

Device set to use cuda:0



Baseline Evaluation Metrics:
Accuracy: 0.8891
Precision: 0.9143
Recall: 0.8603
F1 Score: 0.8865

Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.92      0.89      9935
    positive       0.91      0.86      0.89     10065

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

time: 3min 37s (started: 2025-01-03 14:59:06 +00:00)


In [None]:
# Re-read the test CSV so this cell starts from the file contents rather than
# from a `test_df` already modified by an earlier run.
test_df = pd.read_csv(path_to_test_data)

def clean_review(review):
    """Remove HTML-style tags and URLs from one review and normalise whitespace.

    Args:
        review: The review text as a ``str``.

    Returns:
        The text with ``<...>`` spans and tokens starting with ``http`` or
        ``www`` removed, runs of whitespace collapsed to single spaces, and
        no leading or trailing whitespace.
    """
    # Substitute a space (not the empty string) for each tag so the words on
    # either side of e.g. "<br />" are not fused into one token.
    review = re.sub(r'<.*?>', ' ', review)
    # `http\S+` already matches https URLs, and the pattern has no ^/$ anchors,
    # so neither a separate `https` branch nor re.MULTILINE is needed.
    review = re.sub(r'http\S+|www\S+', '', review)
    # split()/join collapses the gaps left by the removals and strips both ends.
    return ' '.join(review.split())

# Same evaluation as the uncleaned baseline, but on reviews passed through
# clean_review first; results go to a separate Excel workbook.
test_df["review"] = test_df["review"].apply(clean_review)

classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")

# Inputs are truncated to at most 512 tokens instead of raising on long reviews.
predictions = classifier(list(test_df["review"]), truncation=True, max_length=512)

# Pipeline labels are lower-cased before lookup; the same dict encodes the
# `sentiment` column of the test data.
label_mapping = {'positive': 1, 'negative': 0}
predicted_labels = [label_mapping[p["label"].lower()] for p in predictions]

test_df["sentiment_binary"] = test_df["sentiment"].map(label_mapping)

# Series.map yields NaN for any value that is not a key of the mapping; stop
# here with the offending values rather than failing later inside sklearn.
unmapped_labels = test_df.loc[test_df["sentiment_binary"].isna(), "sentiment"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'sentiment' column (expected {list(label_mapping)}): {list(unmapped_labels)}"
    )

y_true = test_df["sentiment_binary"]
class_names = ['negative', 'positive']  # index order matches the 0/1 encoding above

accuracy = accuracy_score(y_true, predicted_labels)
precision = precision_score(y_true, predicted_labels)
recall = recall_score(y_true, predicted_labels)
f1 = f1_score(y_true, predicted_labels)

print("\nBaseline Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_true, predicted_labels, target_names=class_names))

# Single-row table of the headline metrics for the 'Metrics' sheet.
results = {
    'Accuracy': [accuracy],
    'Precision': [precision],
    'Recall': [recall],
    'F1 Score': [f1],
}

results_df = pd.DataFrame(results)

# Same report as printed above, as a dict so it can be written out as a table.
report_dict = classification_report(y_true, predicted_labels, target_names=class_names, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()

# The path is defined once so the file written and the file downloaded cannot drift apart.
output_path = '/content/4_Baseline_DistilBERT_final.xlsx'
with pd.ExcelWriter(output_path) as writer:
    results_df.to_excel(writer, sheet_name='Metrics', index=False)
    report_df.to_excel(writer, sheet_name='Classification Report', index=True)

from google.colab import files
files.download(output_path)

Device set to use cuda:0



Baseline Evaluation Metrics:
Accuracy: 0.8904
Precision: 0.9163
Recall: 0.8610
F1 Score: 0.8878

Detailed Classification Report:
              precision    recall  f1-score   support

    negative       0.87      0.92      0.89      9935
    positive       0.92      0.86      0.89     10065

    accuracy                           0.89     20000
   macro avg       0.89      0.89      0.89     20000
weighted avg       0.89      0.89      0.89     20000



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

time: 3min 50s (started: 2025-01-03 15:34:08 +00:00)
