In [4]:
# Imports the necessary libraries
import torch
import pandas as pd
import numpy as np
import re
import string
import nltk
import pickle
import json
import pyarrow as pa
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset as HFDataset, Features, Value
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Loads the dataset
df = pd.read_csv(r"C:\Users\igriz\.cache\kagglehub\datasets\datafiniti\consumer-reviews-of-amazon-products\versions\5\1429_1.csv", low_memory=False)
pickle.dump(df, open("dataset.pkl", "wb"))
# Drops unnecessary columns
columns_to_drop = [
    'asins', 'reviews.date', 'reviews.dateAdded', 'reviews.dateSeen',
    'reviews.id', 'reviews.didPurchase', 'name',
    'reviews.userCity', 'reviews.userProvince', 'reviews.sourceURLs'
]
df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])
pickle.dump(df, open("cleaned_dataset.pkl", "wb"))




In [5]:
# Defines text preprocessing function
def preprocess_text(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"\d+", "", text)  # Removes numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Removes punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Removes extra spaces
    tokens = word_tokenize(text)  # Tokenization
    stop_words = set(stopwords.words("english"))  # Removes stopwords
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()  # Lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

In [6]:
# Applies text preprocessing
if 'reviews.text' in df.columns:
    df["cleaned_reviews_text"] = df["reviews.text"].apply(preprocess_text)

In [7]:
# Drops rows with missing values in relevant columns
df = df.dropna(subset=["cleaned_reviews_text", "reviews.rating"])
# Converts 'reviews.rating' to integer
df["reviews.rating"] = df["reviews.rating"].astype(int)
# Label transformations (1-2 → Negative, 3 → Neutral, 4-5 → Positive)
def transform_labels(rating):
    return 0 if rating <= 2 else (1 if rating == 3 else 2)
df["label"] = df["reviews.rating"].apply(transform_labels)

In [8]:
# ConvertS to Hugging Face Dataset
features = Features({
    "cleaned_reviews_text": Value("string"),
    "label": Value("int32")
})
hf_dataset = HFDataset.from_pandas(df[["cleaned_reviews_text", "label"]], features=features, preserve_index=False)

In [9]:
# Tokenizer Initialization
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [10]:
# Define Tokenization Function
def tokenize_function(examples):
    return tokenizer(examples["cleaned_reviews_text"], padding="max_length", truncation=True, max_length=256)

In [11]:
# Applies Tokenization
hf_dataset = hf_dataset.map(tokenize_function, batched=True)
# Train-Test Split
hf_dataset = hf_dataset.train_test_split(test_size=0.2, seed=42)

Map:   0%|          | 0/34627 [00:00<?, ? examples/s]

In [12]:
# Loads the pretrained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
# Defines training arguments
training_args = TrainingArguments(
    output_dir="./distilbert_results",
    eval_strategy="epoch",  # :white_check_mark: FIXED
    save_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=200,
    metric_for_best_model="accuracy"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Defines performance metrics
from sklearn.metrics import classification_report, confusion_matrix

In [14]:
# Defines performance metrics
def compute_metrics(pred):
    preds = pred.predictions.argmax(-1)
    labels = pred.label_ids
    # Computes overall metrics
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    precision = precision_score(labels, preds, average="weighted")
    recall = recall_score(labels, preds, average="weighted")
    # Computes per-class metrics
    class_report = classification_report(labels, preds, target_names=["Negative", "Neutral", "Positive"], output_dict=True)
    # Extracts per-class precision, recall, and F1-score
    class_metrics = {
        "Negative": {"Precision": class_report["Negative"]["precision"], "Recall": class_report["Negative"]["recall"], "F1-score": class_report["Negative"]["f1-score"]},
        "Neutral": {"Precision": class_report["Neutral"]["precision"], "Recall": class_report["Neutral"]["recall"], "F1-score": class_report["Neutral"]["f1-score"]},
        "Positive": {"Precision": class_report["Positive"]["precision"], "Recall": class_report["Positive"]["recall"], "F1-score": class_report["Positive"]["f1-score"]}
    }
    return {
        "accuracy": accuracy,
        "f1": f1,
        "precision": precision,
        "recall": recall,
        "class_metrics": class_metrics
    }
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_dataset["train"],
    eval_dataset=hf_dataset["test"],
    compute_metrics=compute_metrics  # FIXED
)

In [28]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.device_count())  # Number of GPUs
print(torch.cuda.get_device_name(0))  # GPU model name


False
0


AssertionError: Torch not compiled with CUDA enabled

In [26]:
# Trains the model
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Saves the model
trainer.save_model("./distilbert_model")
# Evaluates the model
results = trainer.evaluate()
# Confusion Matrix
predictions = trainer.predict(hf_dataset["test"])
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids
conf_matrix = confusion_matrix(labels, preds)
# Print overall performance
print("\n DistilBERT Performance:")
print(f"Model achieved an accuracy of {results['eval_accuracy']*100:.2f}% on the validation dataset.")
# Print per-class performance
class_metrics = results["eval_class_metrics"]
for class_name, metrics in class_metrics.items():
    print(f"Class {class_name}: Precision={metrics['Precision']*100:.2f}%, Recall={metrics['Recall']*100:.2f}%, F1-score={metrics['F1-score']*100:.2f}%")
# Print confusion matrix
print("\n Confusion Matrix:")
print(conf_matrix)
# Save confusion matrix to a CSV
conf_matrix_df = pd.DataFrame(conf_matrix, index=["Negative", "Neutral", "Positive"], columns=["Predicted Negative", "Predicted Neutral", "Predicted Positive"])
conf_matrix_df.to_csv("confusion_matrix.csv")