In [22]:
#Imports

import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import re

In [23]:
# Load a fixed-size slice of each Amazon Polarity split (5000 train / 1000 test rows)
DATASET_NAME = "amazon_polarity"
train_dataset = load_dataset(DATASET_NAME, split='train[:5000]')
test_dataset = load_dataset(DATASET_NAME, split='test[:1000]')

In [24]:
# Convert to pandas DataFrame
# Dataset.to_pandas() converts the underlying Arrow table directly, instead of
# letting the DataFrame constructor iterate the dataset one Python dict per row.
train_df = train_dataset.to_pandas()
test_df = test_dataset.to_pandas()

In [25]:
# Map labels: 0 -> Complaint, anything else (i.e. 1) -> Praise
def _polarity_name(label_id):
    """Return the human-readable category for a numeric polarity label."""
    if label_id == 0:
        return 'Complaint'
    return 'Praise'


for _frame in (train_df, test_df):
    _frame['category'] = _frame['label'].apply(_polarity_name)

In [26]:
#Clean text
def clean_text(text):
    """Normalize a raw review string before tokenization.

    The text is lower-cased, every character that is not an ASCII letter,
    a digit or whitespace is deleted (so "don't" becomes "dont" and accented
    letters are dropped), runs of whitespace are collapsed to a single space,
    and leading/trailing whitespace is removed.

    Args:
        text: The raw text. ``None`` and float NaN (pandas' missing-value
            marker) are treated as empty input; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned string; ``''`` for missing or empty input.
    """
    # Missing values would otherwise raise AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # After lower-casing, an explicit A-Z range is redundant.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    # Collapse whitespace, then drop the single space that may remain at either end.
    return re.sub(r'\s+', ' ', text).strip()

# Add a 'cleaned' column holding the normalized review body to both splits
for _frame in (train_df, test_df):
    _frame['cleaned'] = _frame['content'].apply(clean_text)

In [27]:
# Encode labels
# The encoder is fitted on the training categories only; the same fitted
# mapping is then applied to both splits.
le = LabelEncoder().fit(train_df['category'])
train_df['label_enc'] = le.transform(train_df['category'])
test_df['label_enc'] = le.transform(test_df['category'])

In [28]:
# Convert to Hugging Face Dataset
def _frame_to_hf(frame):
    """Build a Dataset with 'text' and 'label' columns from a prepared frame."""
    model_inputs = frame[['cleaned', 'label_enc']]
    model_inputs = model_inputs.rename(columns={'cleaned': 'text', 'label_enc': 'label'})
    return Dataset.from_pandas(model_inputs)


train_dataset_hf = _frame_to_hf(train_df)
test_dataset_hf = _frame_to_hf(test_df)

In [29]:
# Tokenize text
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the 'text' entries of a batch.

    Sequences are truncated to at most 64 tokens and padded to the longest
    sequence in the batch.

    Args:
        batch: Mapping with a 'text' key holding the strings to encode.

    Returns:
        The tokenizer output for the batch.
    """
    texts = batch['text']
    encoded = tokenizer(texts, truncation=True, max_length=64, padding=True)
    return encoded

# Tokenize both splits in batches
train_dataset_hf = train_dataset_hf.map(tokenize, batched=True)
test_dataset_hf = test_dataset_hf.map(tokenize, batched=True)

# Expose only the model inputs and the label, as torch tensors
for _split in (train_dataset_hf, test_dataset_hf):
    _split.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [30]:
#Define model
# Derive the id <-> name mapping from the fitted LabelEncoder so the model
# config (and therefore the saved checkpoint) carries the real category names
# rather than generic placeholder labels, and so num_labels always matches the
# number of encoded classes.
id2label = {idx: str(name) for idx, name in enumerate(le.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
#Define metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth element (support) is not reported.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
        'f1': weighted_scores[2],
    }

In [32]:
#Define training arguments (optimized for speed)
# Training length is governed by max_steps alone. num_train_epochs is
# intentionally not set: a positive max_steps overrides it, so specifying both
# would only suggest a longer run than the one that actually happens.
training_args = TrainingArguments(
    output_dir='./results',
    max_steps=200,                      # hard cap on the number of optimizer steps
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy='epoch',
    save_strategy='epoch',              # kept identical to eval_strategy for load_best_model_at_end
    logging_dir='./logs',
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',   # key produced by compute_metrics
    fp16=False,
)

In [33]:
#Trainer
# NOTE(review): eval_dataset is the test split, and training_args selects the
# best checkpoint by its accuracy, so the reported test metrics do not come
# from data unseen during model selection - consider a separate validation split.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset_hf,
    eval_dataset=test_dataset_hf,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

In [34]:
# Train the model
# Runs fine-tuning with the `trainer` built earlier in the notebook. As the
# last expression in the cell, the returned TrainOutput is shown as the output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,0.3624,0.350346,0.856,0.85732,0.856,0.855888


TrainOutput(global_step=200, training_loss=0.4209760570526123, metrics={'train_runtime': 733.9146, 'train_samples_per_second': 4.36, 'train_steps_per_second': 0.273, 'total_flos': 52986959462400.0, 'train_loss': 0.4209760570526123, 'epoch': 0.6389776357827476})

In [35]:
# Evaluate
# Scores the model on the eval_dataset given to the Trainer (test_dataset_hf)
# and returns a dict of 'eval_'-prefixed metrics, shown as the cell output.
trainer.evaluate()

{'eval_loss': 0.3503459095954895,
 'eval_accuracy': 0.856,
 'eval_precision': 0.8573203004622496,
 'eval_recall': 0.856,
 'eval_f1': 0.8558876040203177,
 'eval_runtime': 35.3259,
 'eval_samples_per_second': 28.308,
 'eval_steps_per_second': 1.783,
 'epoch': 0.6389776357827476}

In [36]:
# Save model & tokenizer side by side so the directory can be reloaded as one unit
model_dir = "customer_feedback_bert"
os.makedirs(model_dir, exist_ok=True)
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(model_dir)
print(f"Model and tokenizer saved to {model_dir}")

Model and tokenizer saved to customer_feedback_bert
