In [21]:
# Imports the necessary libraries
import torch
import pandas as pd
import numpy as np
import re
import string
import nltk
import pickle
import json
import pyarrow as pa
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset, Features, Value
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Makes sure the required NLTK data is downloaded
# (tokenizer models, stop-word lists and the WordNet lemmatizer data)
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# Loads the dataset
df = pd.read_csv("/Users/sylviaperez-montero/Desktop/Project/Amazon Data.csv", low_memory=False)
# Snapshot of the raw data; the context manager guarantees the file is
# flushed and closed instead of leaking an open handle.
with open("dataset.pkl", "wb") as raw_snapshot:
    pickle.dump(df, raw_snapshot)

# Drops unnecessary columns
columns_to_drop = [
    'asins', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen',
    'reviews.id', 'reviews.didPurchase', 'name',
    'reviews.userCity', 'reviews.userProvince', 'reviews.sourceURLs'
]
# Only drop the columns that are actually present so a slightly different
# export of the CSV does not raise.
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
# Snapshot of the column-reduced data
with open("cleaned_dataset.pkl", "wb") as cleaned_snapshot:
    pickle.dump(df, cleaned_snapshot)

# Defines text preprocessing function
# Translation table that deletes every ASCII punctuation character; built once.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)
# Lazily filled cache so the stop-word list and lemmatizer are created once,
# not once per review.
_NLP_RESOURCES = {}


def _get_nlp_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _NLP_RESOURCES:
        _NLP_RESOURCES["stop_words"] = frozenset(stopwords.words("english"))
        _NLP_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _NLP_RESOURCES["stop_words"], _NLP_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps: lowercase, remove digits, remove punctuation, collapse whitespace,
    tokenize, drop English stop words, lemmatize.

    Args:
        text: The raw review. Missing values (anything ``pd.isnull`` reports
            as null) yield an empty string; other non-string values are
            converted with ``str()`` before processing.

    Returns:
        The cleaned text, or ``""`` when the input is missing.
    """
    if pd.isnull(text):
        return ""
    # str() keeps numeric-looking cells from crashing on .lower()
    text = str(text).lower()
    text = re.sub(r"\d+", "", text)  # Removes numbers
    text = text.translate(_PUNCTUATION_TABLE)  # Removes punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Removes extra spaces
    stop_words, lemmatizer = _get_nlp_resources()
    tokens = word_tokenize(text)  # Tokenization
    # Removes stopwords and lemmatizes the remaining tokens
    return " ".join(lemmatizer.lemmatize(word) for word in tokens if word not in stop_words)

# Applies text preprocessing
# Fail early with a clear message: without this column the next steps cannot
# run (previously this surfaced later as an opaque KeyError from dropna).
if 'reviews.text' not in df.columns:
    raise KeyError("Column 'reviews.text' not found in the dataset; cannot build 'cleaned_reviews_text'.")
df["cleaned_reviews_text"] = df["reviews.text"].apply(preprocess_text)

# Drops rows with missing values in relevant columns
df = df.dropna(subset=["cleaned_reviews_text", "reviews.rating"])
# preprocess_text returns "" for missing reviews (and for reviews left empty
# after cleaning), so dropna alone never removes them; filter them explicitly.
# .copy() makes the filtered frame independent so the assignment below is safe.
df = df[df["cleaned_reviews_text"] != ""].copy()

# Converts 'reviews.rating' to integer
df["reviews.rating"] = df["reviews.rating"].astype(int)

# Maps a star rating to a sentiment class id
def transform_labels(rating):
    """Return 0 (Negative) for ratings <= 2, 1 (Neutral) for a rating of 3, else 2 (Positive)."""
    if rating <= 2:
        return 0
    if rating == 3:
        return 1
    return 2

# Attaches the sentiment class id derived from each star rating
df["label"] = df["reviews.rating"].apply(transform_labels)

# Builds the Hugging Face Dataset with an explicit schema:
# the cleaned text as a string and the class id as a 32-bit integer
features = Features(
    {
        "cleaned_reviews_text": Value("string"),
        "label": Value("int32"),
    }
)
model_frame = df[["cleaned_reviews_text", "label"]]
hf_dataset = HFDataset.from_pandas(model_frame, features=features, preserve_index=False)

# Loads the pretrained uncased DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function applied to batches of the dataset
def tokenize_function(examples):
    """Tokenize the ``cleaned_reviews_text`` entries of ``examples``.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    texts = examples["cleaned_reviews_text"]
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=256,
    )

# Applies Tokenization (batched, so the tokenizer receives lists of texts)
hf_dataset = hf_dataset.map(tokenize_function, batched=True)

# Train-Test Split: 80% train / 20% test, fixed seed for reproducibility
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

# Loads the pretrained DistilBERT model with a 3-label classification head
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

# Defines training arguments
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    eval_strategy="epoch",  # evaluate at the end of every epoch
    save_strategy="epoch",  # must match eval_strategy for best-model loading
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=500,
    metric_for_best_model="accuracy",
    # Without this flag metric_for_best_model is never acted on and the model
    # saved after training is simply the last epoch; with it, the checkpoint
    # with the best eval accuracy is reloaded when training finishes.
    load_best_model_at_end=True,
)

# Imports the per-class report and confusion-matrix helpers used for evaluation
from sklearn.metrics import classification_report, confusion_matrix

# Defines performance metrics
def compute_metrics(pred):
    """Compute overall and per-class classification metrics for an evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (class scores; the
            argmax over the last axis is taken as the predicted class id) and
            ``label_ids`` (the true class ids).

    Returns:
        A dict with ``"accuracy"``, support-weighted ``"f1"``, ``"precision"``
        and ``"recall"``, plus ``"class_metrics"``: a nested dict keyed by
        class name with ``"Precision"``, ``"Recall"`` and ``"F1-score"``.
        NOTE: ``"class_metrics"`` is a dict, not a scalar, so scalar-only
        loggers (e.g. TensorBoard) drop that entry; it remains available in
        the returned metrics.
    """
    class_names = ["Negative", "Neutral", "Positive"]
    # Class ids 0..2 in the same order as class_names
    class_ids = list(range(len(class_names)))

    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids

    # Computes overall metrics.
    # zero_division=0 keeps the score at 0 for a class that is never
    # predicted (same value as the default) without emitting a warning on
    # every evaluation.
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted", zero_division=0)
    precision = precision_score(labels, preds, average="weighted", zero_division=0)
    recall = recall_score(labels, preds, average="weighted", zero_division=0)

    # Computes per-class metrics.
    # Passing labels= pins the report to all three classes, so target_names
    # always lines up even when a class is absent from both labels and preds.
    class_report = classification_report(
        labels,
        preds,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0,
    )

    # Extracts per-class precision, recall, and F1-score
    class_metrics = {
        name: {
            "Precision": class_report[name]["precision"],
            "Recall": class_report[name]["recall"],
            "F1-score": class_report[name]["f1-score"],
        }
        for name in class_names
    }

    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "class_metrics": class_metrics,
    }
    
# Trainer setup: fine-tunes on the train split, evaluates on the test split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    compute_metrics=compute_metrics,
)

# Trains the model
trainer.train()

# Saves the model
trainer.save_model("./distilbert_model")

# Evaluates the model (metric keys come back prefixed with "eval_")
results = trainer.evaluate()

# Confusion Matrix
predictions = trainer.predict(hf_dataset["test"])
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids
# Pin the class ids so the matrix is always 3x3, even if a class is absent
# from both the labels and the predictions; otherwise the fixed 3-name
# DataFrame construction below would fail on a smaller matrix.
conf_matrix = confusion_matrix(labels, preds, labels=[0, 1, 2])

# Print overall performance
print("\n DistilBERT Performance:")
print(f"Model achieved an accuracy of {results['eval_accuracy']*100:.2f}% on the validation dataset.")

# Print per-class performance
class_metrics = results["eval_class_metrics"]
for class_name, metrics in class_metrics.items():
    print(f"Class {class_name}: Precision={metrics['Precision']*100:.2f}%, Recall={metrics['Recall']*100:.2f}%, F1-score={metrics['F1-score']*100:.2f}%")

# Print confusion matrix (rows are true classes, columns are predicted classes)
print("\n Confusion Matrix:")
print(conf_matrix)

# Save confusion matrix to a CSV
conf_matrix_df = pd.DataFrame(conf_matrix, index=["Negative", "Neutral", "Positive"], columns=["Predicted Negative", "Predicted Neutral", "Predicted Positive"])
conf_matrix_df.to_csv("confusion_matrix.csv")

[nltk_data] Downloading package punkt to /Users/sylviaperez-
[nltk_data]     montero/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/sylviaperez-
[nltk_data]     montero/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sylviaperez-
[nltk_data]     montero/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Map:   0%|          | 0/34627 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Class Metrics
1,0.2609,0.25859,0.933728,0.901728,0.871848,0.933728,"{'Negative': {'Precision': 0.0, 'Recall': 0.0, 'F1-score': 0.0}, 'Neutral': {'Precision': 0.0, 'Recall': 0.0, 'F1-score': 0.0}, 'Positive': {'Precision': 0.9337279815189142, 'Recall': 1.0, 'F1-score': 0.9657283655641007}}"
2,0.2518,0.258755,0.935749,0.914442,0.894192,0.935749,"{'Negative': {'Precision': 0.38513513513513514, 'Recall': 0.3433734939759036, 'F1-score': 0.3630573248407643}, 'Neutral': {'Precision': 0.0, 'Recall': 0.0, 'F1-score': 0.0}, 'Positive': {'Precision': 0.9477722041900265, 'Recall': 0.9933508582031854, 'F1-score': 0.9700264250660626}}"
3,0.2228,0.23448,0.936038,0.920671,0.91146,0.936038,"{'Negative': {'Precision': 0.4714285714285714, 'Recall': 0.39759036144578314, 'F1-score': 0.43137254901960786}, 'Neutral': {'Precision': 0.25757575757575757, 'Recall': 0.05802047781569966, 'F1-score': 0.0947075208913649}, 'Positive': {'Precision': 0.9523809523809523, 'Recall': 0.9896397092933354, 'F1-score': 0.9706529157503602}}"


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Trainer is attempting to log a value of "{'Negative': {'Precision': 0.0, 'Recall': 0.0, 'F1-score': 0.0}, 'Neutral': {'Precision': 0.0, 'Recall': 0.0, 'F1-score': 0.0}, 'Positive': {'Precision': 0.9337279815189142, 'Recall': 1.0, 'F1-score': 0.9657283655641007}}" of type <class 'dict'> for key "eval/class_metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(

Trainer is attempting to log a value of "{'Negative': {'Precision': 0.4714285714285714, 'Recall': 0.39759036144578314, 'F1-score': 0.43137254901960786}, 'Neutral': {'Precision': 0.25757575757575757, 'Recall': 0.05802047781569966, 'F1-score': 0.0947075208913649}, 'Positive': {'Precision': 0.9523809523809523, 'Recall': 0.9896397092933354, 'F1-score': 0.9706529157503602}}" of type <class 'dict'> for key "eval/class_metrics" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.



✅ DistilBERT Performance:
Model achieved an accuracy of 93.60% on the validation dataset.
Class Negative: Precision=47.14%, Recall=39.76%, F1-score=43.14%
Class Neutral: Precision=25.76%, Recall=5.80%, F1-score=9.47%
Class Positive: Precision=95.24%, Recall=98.96%, F1-score=97.07%

✅ Confusion Matrix:
[[  66   12   88]
 [  44   17  232]
 [  30   37 6400]]
