<a href="https://colab.research.google.com/github/jgphelan/DellBERT/blob/main/FusionTech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DellBERT

A sentiment and topic classifier based on collected customer reviews.

**Make sure to upload and put the .csv file into the sample_data directory!**

Name the file "FusionTech Online Reviews Data Set.csv".

## Environment

In [None]:
# Notebook shell commands (IPython `!` syntax): set up the package environment
# before the imports below run.
# getting weird env issues so just uninstalled a bunch of conflicting packages
!pip uninstall -y tsfresh thinc gensim pyLDAvis
# text-cleaning dependencies (emoji is pinned to 2.11.0)
!pip install -q pandas emoji==2.11.0 unidecode langdetect
# pin the numeric stack to exact versions; --no-cache-dir makes pip skip its cache
!pip install --no-cache-dir -q \
      "numpy==2.0.0" \
      "scipy==1.14.0" \
      "scikit-learn==1.5.0"
# keyword extraction (KeyBERT) and the sentence-embedding backend it loads
!pip install -q keybert sentence-transformers
# Hugging Face training stack with minimum versions
!pip install -q --upgrade "transformers>=4.42.0" "accelerate>=0.29.0" \
                            "peft>=0.10.0" "datasets>=2.19.0" evaluate

import numpy as np, scipy, sklearn, pandas as pd, re, emoji, string, gc, torch, platform, transformers
from unidecode import unidecode
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from evaluate import load as load_metric
from torch.nn import functional as F
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training




# Report whether CUDA is visible and which accelerator was assigned
# (the notebook is intended for a GPU runtime such as a T4).
has_cuda = torch.cuda.is_available()
if has_cuda:
    accelerator = torch.cuda.get_device_name(0)
else:
    accelerator = "CPU"
print("PyTorch CUDA? ", has_cuda, "| GPU:", accelerator)

## Data Loading, Pre-Processing, Cleaning





In [None]:
# Read the raw export, keep only the review body and its star rating, give
# both columns the names used by the rest of the notebook, then apply the
# basic filters: drop rows missing either field and collapse repeated review
# bodies to their first occurrence.
df = (
    pd.read_csv("sample_data/FusionTech Online Reviews Data Set.csv")
      .loc[:, ["text", "rating"]]
      .rename(columns={"text": "review_text", "rating": "stars"})
      .dropna(subset=["review_text", "stars"])
      .drop_duplicates(subset="review_text")
)

# english filtering
from langdetect import DetectorFactory

# langdetect's detector is randomised internally; without a fixed seed the
# same review can be classified differently from run to run, which changes
# the rows that survive this filter. Seed it once before the first detect().
DetectorFactory.seed = 0


def is_english(txt: str) -> bool:
    """Return True when langdetect classifies `txt` as English.

    The value is passed through ``str()`` first (as ``normalise`` also does)
    so a non-string cell is detected on its text form instead of raising.
    Strings langdetect cannot analyse (empty / gibberish) count as
    non-English.
    """
    try:
        return detect(str(txt)) == "en"
    except LangDetectException:          # empty / gibberish strings
        return False


# keep only the English reviews and renumber the rows from 0
df = df[df["review_text"].apply(is_english)].reset_index(drop=True)

# emoji / non-ASCII stripper
def normalise(t):
    """Return a cleaned, lower-cased, single-spaced version of review `t`.

    Steps, in order: remove emoji, transliterate the remaining text to ASCII,
    lower-case, drop URLs, drop any character that is not a word character,
    whitespace or ASCII punctuation, then collapse whitespace runs.

    Emoji are removed *before* ``unidecode`` runs. ``unidecode`` returns
    ASCII-only text, so if it ran first the emoji step could never match
    anything and any emoji that unidecode transliterates would leak into
    the cleaned text as plain characters.
    """
    t = emoji.replace_emoji(str(t), "")  # remove emojis while they are still emoji
    t = unidecode(t)                     # strips accents / transliterates to ASCII
    t = t.lower()
    t = re.sub(r"https?://\S+", "", t)   # URLs
    # keep only word chars, whitespace and ASCII punctuation
    t = re.sub(r"[^\w\s" + re.escape(string.punctuation) + "]", "", t)
    t = re.sub(r"\s+", " ", t).strip()   # collapse whitespace runs
    return t

# Normalise every review and measure its length in whitespace-separated tokens.
df["clean_text"] = df["review_text"].apply(normalise)
df["token_cnt"] = df["clean_text"].str.split().str.len()

# Keep reviews of 5 to 512 tokens (both bounds inclusive).
length_ok = (df["token_cnt"] >= 5) & (df["token_cnt"] <= 512)
df = df[length_ok].reset_index(drop=True)


# three-class sentiment labels
def _stars_to_label(stars):
    """Map a star rating to positive (>= 4), negative (<= 2) or neutral."""
    if stars >= 4:
        return "positive"
    if stars <= 2:
        return "negative"
    return "neutral"


df["sentiment"] = df["stars"].apply(_stars_to_label)

# Persist only the columns the later cells read back.
export = df[["clean_text", "sentiment", "stars"]]
export.to_csv("clean_reviews.csv", index=False)
print("Saved:", df.shape)

# Release the frames; later cells reload from the CSV.
del export
del df
gc.collect()

EDA

Topics

In [None]:
# load data
df = pd.read_csv("clean_reviews.csv")

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# extra SW: domain words that carry no topical signal
extra_sw = ["fusiontech", "br", "amazon", "abc"]

# CountVectorizer accepts either the string "english" or an explicit list.
# A list that merely *contains* "english" only removes that literal token,
# so the built-in English list is merged with the extra words explicitly.
stop_words = sorted(ENGLISH_STOP_WORDS.union(extra_sw))

# doc term matrix: ignore terms in fewer than 10 docs or in more than 40% of docs
cv = CountVectorizer(lowercase=True, stop_words=stop_words, min_df=10, max_df=0.40)

dtm = cv.fit_transform(df["clean_text"]) # shape (n_docs, n_terms)
vocab = cv.get_feature_names_out()

# fit a 10-topic LDA model

lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=20, random_state=42, n_jobs=-1)
doc_topic = lda.fit_transform(dtm)

# attach dom topic to each review
df["topic_id"]   = doc_topic.argmax(axis=1)    # index of highest-prob topic
df["topic_prob"] = doc_topic.max(axis=1)       # its probability

df.to_csv("with_topics.csv", index=False)
print("Saved with_topics.csv  →", df.shape, "rows")

def print_topics(model, vocab, topn=10):
    """Print the `topn` highest-weighted vocabulary terms of every topic.

    `model.components_` is iterated row by row (one row per topic) and
    `vocab` is indexed with the positions of the largest weights, so it must
    support NumPy-style integer-array indexing. Terms are printed from the
    heaviest to the lightest, separated by two spaces.
    """
    for topic_no, term_weights in enumerate(model.components_):
        # argsort is ascending: the tail holds the heaviest terms
        heaviest = np.argsort(term_weights)[-topn:]
        top_terms = vocab[heaviest[::-1]]
        joined = "  ".join(top_terms)
        print(f"Topic {topic_no:2d}: {joined}")

# show the top terms of each topic of the LDA model fitted above
print_topics(lda, vocab)


## LLM Naming Topics based on the LDA above

Auto-named each time LDA runs

In [None]:
# KeyBERT keyword extractor, constructed with the "all-MiniLM-L6-v2" model name
kw_model = KeyBERT("all-MiniLM-L6-v2")

# Product / domain words that must never appear in a final topic label.
generic_sw = {
    "gaming", "game",
    "laptop", "laptops", "computer", "pc", "device",
    "fusiontech", "amazon", "andromeda",
}

# Words stripped while tidying a raw key-phrase: fillers are removed
# everywhere, verbs only from the front of the phrase.
filler = {"a", "an", "the", "and", "with", "has", "have", "get"}
verb_sw = {"buy", "bought", "run", "runs", "running"}


def polish(raw: str) -> str:
    """Turn a raw key-phrase into a 2-3 word Title-Case label.

    Filler words are dropped, leading verbs are skipped, and at most the
    first three remaining words are kept. A two-word phrase whose second
    word ends in "ed" is flipped. Returns "" when fewer than two words
    survive the cleanup.
    """
    kept = [word for word in raw.lower().split() if word not in filler]

    # skip past any verbs at the front of the phrase
    start = 0
    while start < len(kept) and kept[start] in verb_sw:
        start += 1
    kept = kept[start:]

    # a label needs at least two words
    if len(kept) < 2:
        return ""

    # "<noun> <...ed>" reads better as "<...ed> <noun>"
    if len(kept) == 2 and kept[1].endswith("ed"):
        kept.reverse()

    return " ".join(word.title() for word in kept[:3])   # max 3 words

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

vocab   = cv.get_feature_names_out()
labels  = {}

# A stop-word *list* that merely contains "english" only removes that literal
# token, so the built-in English list is merged with the generic words here.
kw_stop_words = sorted(ENGLISH_STOP_WORDS.union(generic_sw))

for k in range(lda.n_components):
    # one best-fit review for topic k (argmax is a row position, hence iloc)
    best_idx  = doc_topic[:, k].argmax()
    best_text = df["clean_text"].iloc[best_idx]
    # KeyBERT to raw phrase; the result can be empty when no candidate
    # phrase survives stop-word filtering, so do not index it blindly
    keywords = kw_model.extract_keywords(
                    best_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=kw_stop_words,
                    top_n=1
               )
    raw_phrase = keywords[0][0] if keywords else ""
    # polish + fallback to the two heaviest non-generic LDA terms
    phrase = polish(raw_phrase)
    if len(phrase.split()) < 2:
        top = [w for w in vocab[lda.components_[k].argsort()[::-1]]
               if w not in generic_sw][:2]
        phrase = " ".join(w.title() for w in top)

    phrase = phrase or f"Topic {k}"
    # two topics with the same label would be merged by any later group-by
    # on the label, so disambiguate a repeated label with the topic number
    if phrase in labels.values():
        phrase = f"{phrase} ({k})"
    labels[k] = phrase

print(labels)

# map into CSV + save
# The numeric ids are recomputed from doc_topic rather than read back from
# the column, so re-running this cell after the column already holds label
# strings still works.
dominant_topic = pd.Series(doc_topic.argmax(axis=1), index=df.index)
df["topic_id"] = dominant_topic.map(labels)
df.to_csv("with_topics.csv", index=False)


## Aggregate Topics
Produces summary .CSV for Customer Service Team

In [None]:
# reload the reviews together with the topic labels written by the previous cell
df = pd.read_csv("with_topics.csv")

# Same star -> sentiment rule as the cleaning cell, repeated here so this
# cell can run on its own.
def star_to_sentiment(x):
    """Map a star rating to a sentiment label.

    Ratings of 4 and above are "positive", 2 and below are "negative",
    everything else is "neutral".
    """
    return "positive" if x >= 4 else ("negative" if x <= 2 else "neutral")

df["sentiment"] = df["stars"].apply(star_to_sentiment)

# Every sentiment class gets a column even when it never occurs in the data;
# otherwise the rename is a silent no-op for the missing class and the sort
# on "n_negative" below raises KeyError.
sentiment_cols = ["negative", "neutral", "positive"]

# aggregate counts for each topic
summary = (
    df.groupby("topic_id")["sentiment"]
      .value_counts()
      .unstack(fill_value=0)                 # columns: sentiment classes present
      .reindex(columns=sentiment_cols, fill_value=0)  # always all three
      .rename(columns={
          "negative": "n_negative",
          "neutral" : "n_neutral",
          "positive": "n_positive"
      })
      .assign(total=lambda t: t.sum(axis=1)) # add a total column
      .reset_index()
      .sort_values("n_negative", ascending=False)  # surface pain-points first
)

# save new csv
summary.to_csv("topic_sentiment_summary.csv", index=False)

# test print
print("\n", summary.head(10))

## Sentiment Analysis Training

This next task fine-tunes BERT (here, DistilBERT with LoRA adapters) for sentiment classification of future unlabeled reviews from a wide variety of sources. Given a review, the model outputs one of three labels: negative, neutral, or positive.


2.1 Load and Encode

In [None]:
df = pd.read_csv("clean_reviews.csv")          # clean_text | sentiment | stars
label2id = {"negative":0, "neutral":1, "positive":2}
df["label"] = df["sentiment"].map(label2id)

# Handle Class imbalance
# Count every label id, including ids that never occur, so the weight vector
# always has one entry per class in 0,1,2 order.
class_counts = (
    df["label"].value_counts()
      .reindex(sorted(label2id.values()), fill_value=0)
)
print(class_counts)   # negative / neutral / positive

# An empty class would give an infinite inverse-frequency weight; stop here
# with a clear message instead of failing later inside the loss.
empty_classes = [name for name, idx in label2id.items() if class_counts[idx] == 0]
if empty_classes:
    raise ValueError(f"No training examples for class(es): {empty_classes}")

# inverse-frequency weights, rescaled so they sum to the number of classes
weights = 1.0 / torch.tensor(class_counts.to_numpy(), dtype=torch.float)
weights = weights / weights.sum() * len(class_counts)    # mean≈1
print("Class weights:", weights)

2.2 Tokenizer etc.

In [None]:
# Tokenizer for the "distilbert-base-uncased" checkpoint, used by encode().
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def encode(batch):
    """Tokenise ``batch["clean_text"]``.

    Each entry is truncated and padded to exactly 128 tokens
    (``padding="max_length"``); the tokenizer's output is returned as is.
    """
    texts = batch["clean_text"]
    return tok(texts, max_length=128, padding="max_length", truncation=True)

## Stratified 5-Fold CV with class weighted loss

In [None]:
# Collects one trainer.evaluate() result per fold.
fold_results = []

# Metric handles: accuracy plus two F1 loaders, one used for the weighted
# average and one for the macro average.
metric_acc = load_metric("accuracy")
metric_f1w = load_metric("f1")
metric_f1m = load_metric("f1")

# 5 shuffled, label-stratified folds with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `weights` tensor."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy for one batch.

        "labels" is popped from `inputs` before the forward pass, so the
        model receives only the remaining entries. `num_items_in_batch` is
        accepted but not used. Returns ``(loss, outputs)`` when
        `return_outputs` is true, otherwise the loss alone.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        # move the class weights to the device the model reports
        class_weights = weights.to(model.device)
        loss = F.cross_entropy(outputs.logits, targets, weight=class_weights)
        if return_outputs:
            return loss, outputs
        return loss

# fp16 mixed precision requires a GPU; TrainingArguments rejects fp16=True on
# a CPU-only runtime, so enable it only when CUDA is actually available.
use_fp16 = torch.cuda.is_available()


def compute_metrics(eval_pred):
    """Return accuracy, weighted F1 and macro F1 for one evaluation pass.

    `eval_pred` unpacks into (logits, labels); predictions are the argmax
    over the last logits axis.
    """
    logits, labels = eval_pred
    preds = logits.argmax(-1)
    return {
        "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
        "f1_weighted": metric_f1w.compute(predictions=preds, references=labels,
                                          average="weighted")["f1"],
        "f1_macro":    metric_f1m.compute(predictions=preds, references=labels,
                                          average="macro")["f1"]
    }


for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["label"]), 1):
    print(f"\n—— Fold {fold} ——")
    # tokenised train / validation splits for this fold
    train_ds = Dataset.from_pandas(df.iloc[train_idx][["clean_text","label"]]).map(encode, batched=True)
    val_ds   = Dataset.from_pandas(df.iloc[val_idx][["clean_text","label"]]).map(encode, batched=True)

    # fresh base model + LoRA adapter per fold so no weights carry over
    base = AutoModelForSequenceClassification.from_pretrained(
                "distilbert-base-uncased", num_labels=3)
    # NOTE(review): the base model is not loaded in k-bit precision here;
    # confirm this helper is wanted for its other effects (e.g. gradient
    # checkpointing) rather than for quantised training.
    base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)
    model = get_peft_model(base, LoraConfig(
                r=16, lora_alpha=32, lora_dropout=0.05,
                bias="none", target_modules=["q_lin","v_lin"],
                task_type="SEQ_CLS"))

    args = TrainingArguments(
        output_dir=f"chk_fold{fold}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="no",
        fp16=use_fp16,
        report_to="none"
    )

    trainer = WeightedLossTrainer(model=model, args=args,
                      train_dataset=train_ds,
                      eval_dataset=val_ds,
                      compute_metrics=compute_metrics)
    trainer.train()
    # final evaluation on the held-out fold; kept for the summary cell
    res = trainer.evaluate()
    fold_results.append(res)
    print(res)

Summary

In [None]:
# One row per fold, one column per value returned by trainer.evaluate().
cv_df = pd.DataFrame(fold_results)

metric_cols = ["eval_accuracy", "eval_f1_weighted", "eval_f1_macro"]

print("\nMean across folds:")
fold_means = cv_df[metric_cols].mean()
print(fold_means)

Final Training on all data

In [None]:
class WeightedLossTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the global `weights` tensor.

    Declared again in this cell so final training can run without the
    cross-validation cell.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the class-weighted cross-entropy for one batch.

        "labels" is popped from `inputs` before the forward pass, so the
        model receives only the remaining entries. `num_items_in_batch` is
        accepted but not used. Returns ``(loss, outputs)`` when
        `return_outputs` is true, otherwise the loss alone.
        """
        targets = inputs.pop("labels")
        outputs = model(**inputs)
        # move the class weights to the device the model reports
        class_weights = weights.to(model.device)
        loss = F.cross_entropy(outputs.logits, targets, weight=class_weights)
        if return_outputs:
            return loss, outputs
        return loss

# tokenise the complete labelled set; no hold-out split in this final run
full_ds = Dataset.from_pandas(df[["clean_text","label"]]).map(encode, batched=True)

base = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3)
# NOTE(review): the base model is not loaded in k-bit precision here; confirm
# this helper is wanted for its other effects (e.g. gradient checkpointing).
base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)
model = get_peft_model(base, LoraConfig(
            r=16, lora_alpha=32, lora_dropout=0.05,
            bias="none", target_modules=["q_lin","v_lin"],
            task_type="SEQ_CLS"))


trainer = WeightedLossTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="sentiment_final",
        per_device_train_batch_size=16,
        num_train_epochs=3,
        # fp16 mixed precision requires a GPU; TrainingArguments rejects
        # fp16=True on a CPU-only runtime, so enable it only with CUDA
        fp16=torch.cuda.is_available(),
        save_total_limit=1,
        report_to="none"
    ),
    train_dataset=full_ds
)
trainer.train()
# write the trained PEFT model and the tokenizer files into the same
# directory that the demo cell loads from
model.save_pretrained("sentiment_final")
tok.save_pretrained("sentiment_final")
print("Final model saved to sentiment_final/")

## Demo

In [None]:
from peft import PeftModel

# Where the adapter was saved, and the backbone checkpoint it was trained on.
MODEL_DIR = "sentiment_final"
BASE_CKPT = "distilbert-base-uncased"  # same backbone used during training

# Prefer the GPU when one is visible.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# load tokenizer
tok = AutoTokenizer.from_pretrained(BASE_CKPT)

# Base model with a 3-label classification head
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_CKPT, num_labels=3
)
base_model = base_model.to(DEVICE)

# Attach the saved LoRA adapter and switch to inference mode
model = PeftModel.from_pretrained(base_model, MODEL_DIR)
model = model.to(DEVICE)
model.eval()

# class index -> label name (inverse of the label2id mapping used in training)
id2label = dict(enumerate(["negative", "neutral", "positive"]))

def predict_sent(texts):
    """Predict a sentiment label for each review in `texts`.

    `texts` is handed to the tokenizer as one batch (padded to the longest
    item, truncated to 128 tokens) and scored without gradient tracking.
    Returns a list of label names looked up in `id2label`, one per input.
    An empty collection returns ``[]`` without calling the tokenizer or the
    model.
    """
    # nothing to score; a bare string is not a collection, so it is still
    # forwarded to the tokenizer as before
    if not isinstance(texts, str) and len(texts) == 0:
        return []
    enc = tok(texts, padding=True, truncation=True, max_length=128,
              return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**enc).logits
    # highest-scoring class per row, as plain Python ints
    preds = logits.argmax(-1).cpu().tolist()
    return [id2label[p] for p in preds]

# Demo: a handful of hand-written reviews covering all three sentiments
sample_reviews = [
    "Battery dies after 40 minutes – extremely disappointed.",
    "Works great for everyday tasks. But then it gets too hot, not terrible though.",
    "Absolutely love the performance and build quality! 10/10 purchase.",
    "Keyboard is okay, but the fan noise gets a bit loud under load.",
    "Arrived DOA: black screen and constant beeping. Had to return it.",
]

results = predict_sent(sample_reviews)

# Side-by-side table of each review and its predicted label; left as the
# cell's last expression so the notebook renders it.
pd.DataFrame(
    list(zip(sample_reviews, results)),
    columns=["review_text", "predicted_sentiment"],
)