In [1]:
!pip install transformers datasets torch accelerate -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from google.colab import files

In [15]:
# Read the reviews CSV, keep only the 'Text' and 'Score' columns, and discard
# every row that has a missing value in either of them.
df = pd.read_csv('/content/Reviews.csv')[['Text', 'Score']].dropna()


def score_to_label(score):
    """Map a numeric review score to an integer sentiment class id.

    Returns:
        0 (negative) when ``score`` is 2 or lower,
        2 (neutral) when ``score`` equals 3,
        1 (positive) for every other score.
    """
    # Low scores are handled first so the remaining cases only need to
    # distinguish "exactly 3" from "anything else".
    if score <= 2:
        return 0
    return 2 if score == 3 else 1

# Attach the integer class id derived from each review's score.
df['label'] = df['Score'].apply(score_to_label)

# Work on a fixed-seed random subset of at most 20,000 reviews. Capping n at
# len(df) keeps this cell from raising ValueError when the CSV holds fewer
# rows than the requested sample size.
df = df.sample(n=min(20000, len(df)), random_state=42)

# Split into train and eval (80% / 20%, fixed seed for repeatability)
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)


In [16]:
# Step 2: Load Tokenizer and Model
model_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Class ids follow score_to_label: 0 = negative, 1 = positive, 2 = neutral.
# Recording the names in the model config means they are stored with the
# model when it is saved, instead of living only in notebook code.
id2label = {0: "Negative", 1: "Positive", 2: "Neutral"}
label2id = {name: idx for idx, name in id2label.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(id2label),  # 3 classes: neg, pos, neutral
    id2label=id2label,
    label2id=label2id,
)

# Step 3: Preprocess Data
def preprocess_function(examples):
    """Tokenize the 'Text' field of a batch of examples.

    The module-level ``tokenizer`` is called with truncation enabled and
    padding to a maximum length of 128; its result is returned unchanged.
    """
    texts = examples['Text']
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=128,
    )

# Columns exposed as torch tensors once tokenization is done.
tensor_columns = ['input_ids', 'attention_mask', 'label']

# Tokenize the training split (in batches) and switch it to torch output.
train_dataset = train_dataset.map(preprocess_function, batched=True)
train_dataset.set_format('torch', columns=tensor_columns)

# Same treatment for the evaluation split.
eval_dataset = eval_dataset.map(preprocess_function, batched=True)
eval_dataset.set_format('torch', columns=tensor_columns)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

In [7]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [18]:
# Step 4: Define Training Arguments (Optimized for Colab/Kaggle)
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling is deprecated in recent transformers
    # releases, which an unpinned install will pick up.
    eval_strategy="epoch",
    save_strategy="epoch",  # same cadence as evaluation, so each checkpoint has a score
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    warmup_steps=50,
    gradient_accumulation_steps=2,  # 8 x 2 = 16 examples per optimizer step per device
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by compute_metrics
    logging_steps=50,
    save_total_limit=1,
    report_to="none",
)


# Step 5: Load the accuracy metric used by compute_metrics
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level accuracy metric.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class
    for each row is the arg-max over the last axis of the logits. Returns
    whatever ``accuracy_metric.compute`` returns for those predictions.
    """
    raw_scores, gold_labels = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted, references=gold_labels)



In [19]:
# Step 6: Initialize Trainer with the model, its training arguments, both
# dataset splits and the metric callback.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Step 7: Fine-Tune the Model
print("Starting training...")
trainer.train()

# Step 8: Save the Model and tokenizer side by side in one directory
final_dir = "./sentiment_model/final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_dir)
print("Model and tokenizer saved to ./sentiment_model/final")


Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3085,0.363795,0.8655
2,0.2473,0.390903,0.8715
3,0.1827,0.432628,0.8665
4,0.1644,0.509646,0.868
5,0.0824,0.556629,0.87


Model and tokenizer saved to ./sentiment_model/final


In [30]:
# Load libraries (assuming already installed from training)
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the saved tokenizer and model from disk; the model is moved to the
# selected device as part of loading.
model_path = "./sentiment_model/final"
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path).to(device)

# Inference function with confidence scores
def predict_sentiment(review, model, tokenizer, device=None):
    """Classify a single review and report the predicted class with its probability.

    Args:
        review: Review text to classify.
        model: Classification model. It is called with the tokenizer output
            unpacked as keyword arguments and must return an object exposing
            a ``logits`` attribute.
        tokenizer: Callable that encodes ``review``; its result must provide
            a ``.to(device)`` method.
        device: Device the encoded inputs are moved to. When omitted, the
            device of the model's first parameter is used (or "cpu" if the
            model has no parameters), so inputs always land where the model
            is rather than depending on a notebook-level variable.

    Returns:
        dict with keys ``"sentiment"`` ("Negative", "Positive" or "Neutral")
        and ``"confidence"`` (softmax probability of the predicted class).
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    model.eval()
    inputs = tokenizer(review, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
        prediction = torch.argmax(logits, dim=-1).item()
        probs = torch.softmax(logits, dim=-1)[0]  # probabilities for the single input row

    # Class ids match the training labels: 0 = negative, 1 = positive, 2 = neutral.
    sentiment = {0: "Negative", 1: "Positive", 2: "Neutral"}[prediction]
    return {"sentiment": sentiment, "confidence": float(probs[prediction])}



In [31]:
# A handful of hand-written reviews to spot-check the classifier.
test_reviews = [
    "This product is amazing, totally worth the price!",
    "Terrible quality, broke in two days.",
    "It’s okay, nothing special.",
    "Super fast delivery but the packaging was meh.",
    "Worst purchase ever, complete waste of money!"
]

print("Testing the Sentiment Classifier:")
for text in test_reviews:
    outcome = predict_sentiment(text, model, tokenizer)
    label, score = outcome['sentiment'], outcome['confidence']
    # Echo the review, then its predicted class with a two-decimal confidence.
    print(f"Review: {text}")
    print(f"Sentiment: {label} (Confidence: {score:.2f})\n")

Testing the Sentiment Classifier:
Review: This product is amazing, totally worth the price!
Sentiment: Positive (Confidence: 1.00)

Review: Terrible quality, broke in two days.
Sentiment: Negative (Confidence: 0.96)

Review: It’s okay, nothing special.
Sentiment: Neutral (Confidence: 0.63)

Review: Super fast delivery but the packaging was meh.
Sentiment: Neutral (Confidence: 0.47)

Review: Worst purchase ever, complete waste of money!
Sentiment: Negative (Confidence: 0.96)

