<a href="https://colab.research.google.com/github/jgphelan/DellBERT/blob/main/FusionTech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DellBERT

A Sentiment and Topic Classifier based on collected costumer reviews.

**Make sure to upload and put the .csv file into the sample_data directory!**

Name the file "FusionTech Online Reviews Data Set.csv".

## Environment

In [3]:
# getting weird env issues so just uninstalled a bunch of conflicting packages
!pip uninstall -y tsfresh thinc gensim pyLDAvis
!pip install -q pandas emoji==2.11.0 unidecode langdetect
!pip install --no-cache-dir -q \
      "numpy==2.0.0" \
      "scipy==1.14.0" \
      "scikit-learn==1.5.0"
!pip install -q keybert sentence-transformers
!pip install -q --upgrade "transformers>=4.42.0" "accelerate>=0.29.0" \
                            "peft>=0.10.0" "datasets>=2.19.0" evaluate

import numpy as np, scipy, sklearn, pandas as pd, re, emoji, string, gc, torch, platform, transformers
from unidecode import unidecode
from langdetect import detect, LangDetectException
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from keybert import KeyBERT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from evaluate import load as load_metric
from torch.nn import functional as F
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training




# make sure we're running a T4 instance
print("PyTorch CUDA? ", torch.cuda.is_available(),
      "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

Found existing installation: tsfresh 0.21.0
Uninstalling tsfresh-0.21.0:
  Successfully uninstalled tsfresh-0.21.0
Found existing installation: thinc 8.3.6
Uninstalling thinc-8.3.6:
  Successfully uninstalled thinc-8.3.6
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m433.8/433.8 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m134.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Exception ignored on calling ctypes callback function: <function ThreadpoolController._find_libraries_with_dl_iterate_phdr.<locals>.match_library_callback at 0x78cb4965f240>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1005, in match_library_callback
    self._make_controller_from_path(filepath)
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 1187, in _make_controller_from_path
    lib_controller = controller_class(
                     ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/threadpoolctl.py", line 114, in __init__
    self.dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/ctypes/__init__.py", line 376, in __init__
    self._handle = _dlopen(self._name, mode)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^
OSError: dlopen() error


PyTorch CUDA?  True | GPU: Tesla T4


## Data Loading, Pre-Processing, Cleaning





In [4]:
df = pd.read_csv("sample_data/FusionTech Online Reviews Data Set.csv")[["text", "rating"]] \
       .rename(columns={"text":"review_text", "rating":"stars"})

# basic filters
df.dropna(subset=["review_text", "stars"], inplace=True)
df.drop_duplicates(subset="review_text", inplace=True)

# english filtering

def is_english(txt: str) -> bool:
    try:
        return detect(txt) == "en"
    except LangDetectException:          # empty / gibberish strings
        return False

df = df[df["review_text"].apply(is_english)].reset_index(drop=True)

# emoji / non-ASCII stripper
def normalise(t):
    t = unidecode(str(t))                # strips accents
    t = emoji.replace_emoji(t, "")       # remove emojis
    t = t.lower()
    t = re.sub(r"https?://\S+", "", t)   # URLs
    t = re.sub(r"[^\w\s" + re.escape(string.punctuation) + "]", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["clean_text"] = df["review_text"].apply(normalise)
df["token_cnt"]  = df["clean_text"].str.split().str.len()
df = df[df["token_cnt"].between(5, 512)].reset_index(drop=True)

# three-class sentiment labels
df["sentiment"] = df["stars"].apply(
        lambda x: "positive" if x >= 4 else "negative" if x <= 2 else "neutral"
)

df[["clean_text","sentiment","stars"]].to_csv("clean_reviews.csv", index=False)
print("Saved:", df.shape)
del df; gc.collect()

Saved: (3481, 5)


0

EDA

Topics

In [5]:
# load data
df = pd.read_csv("clean_reviews.csv")

# extra SW
extra_sw = ["fusiontech", "br", "amazon", "abc"]
# doc term matrix
cv = CountVectorizer(lowercase=True, stop_words=list(extra_sw) + ["english"], min_df=10, max_df=0.40)

dtm = cv.fit_transform(df["clean_text"]) # shape (n_docs, n_terms)
vocab = cv.get_feature_names_out()

# fit a 10-topic LDA model

lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=20, random_state=42, n_jobs=-1)
doc_topic = lda.fit_transform(dtm)

# attach dom topic to each review
df["topic_id"]   = doc_topic.argmax(axis=1)    # index of highest-prob topic
df["topic_prob"] = doc_topic.max(axis=1)       # its probability

df.to_csv("with_topics.csv", index=False)
print("Saved with_topics.csv  →", df.shape, "rows")

def print_topics(model, vocab, topn=10):
    for k, comp in enumerate(model.components_):
        words = vocab[np.argsort(comp)[-topn:]][::-1]
        print(f"Topic {k:2d}: {'  '.join(words)}")

print_topics(lda, vocab)


Saved with_topics.csv  → (3481, 5) rows
Topic  0: you  if  are  can  be  have  so  at  get  good
Topic  1: my  have  computer  was  as  so  when  had  use  or
Topic  2: was  had  me  they  my  after  back  have  computer  out
Topic  3: has  have  after  my  months  as  use  power  from  warranty
Topic  4: games  gaming  at  can  play  you  settings  great  run  game
Topic  5: ssd  ram  drive  an  have  my  as  has  you  one
Topic  6: very  screen  great  good  keyboard  nice  was  fast  my  quality
Topic  7: my  so  computer  great  love  far  he  good  was  bought
Topic  8: drivers  windows  issues  from  driver  after  was  had  have  update
Topic  9: you  computer  have  buy  my  was  if  now  don  they


## LLM Naming Topics based on the LDA above

Auto-named each time LDA runs

In [6]:
kw_model = KeyBERT("all-MiniLM-L6-v2")

# words we never want in the final topic label
generic_sw = {
    "gaming", "game", "laptop", "laptops", "computer",
    "fusiontech", "amazon", "pc", "device", "andromeda"
}

# filler & verb stop-words for cleanup
filler   = {"and", "with", "has", "have", "get", "the", "a", "an"}
verb_sw  = {"run", "runs", "running", "buy", "bought"}

# shortens keywords to 2-3
def polish(raw: str) -> str:
    words = [w for w in raw.lower().split() if w not in filler]
    # remove leading verbs
    while words and words[0] in verb_sw:
        words.pop(0)
    # Ensure 2 words
    if len(words) < 2:
        return ""
    # swap order for patterns
    if len(words) == 2 and words[1].endswith("ed"):
        words = words[::-1]
    return " ".join(w.title() for w in words[:3])   # max 3 words

vocab   = cv.get_feature_names_out()
labels  = {}

for k in range(lda.n_components):
    # one best-fit review for topic k
    best_idx  = doc_topic[:, k].argmax()
    best_text = df.loc[best_idx, "clean_text"]
    # KeyBERT to raw phrase
    raw_phrase = kw_model.extract_keywords(
                    best_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=list(generic_sw) + ["english"],
                    top_n=1
                 )[0][0]
    # polish + fallback
    phrase = polish(raw_phrase)
    if len(phrase.split()) < 2:
        top = [w for w in vocab[lda.components_[k].argsort()[::-1]]
               if w not in generic_sw][:2]
        phrase = " ".join(w.title() for w in top)

    labels[k] = phrase or f"Topic {k}"

print(labels)

# map into CSV + save
df["topic_id"] = df["topic_id"].astype(int).map(labels)
df.to_csv("with_topics.csv", index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{0: 'Better Touch Pad', 1: 'Battery Life', 2: 'Wrong Customer Service', 3: 'Purchased Warranty', 4: 'Issues Games', 5: 'Ssd Storage', 6: 'Keyboard Great', 7: 'Issues Lag', 8: 'Graphics Card Crashes', 9: 'Macbook Air'}


## Aggregate Topics
Produces summary .CSV for Customer Service Team

In [7]:
df = pd.read_csv("with_topics.csv")

# mapping for this file (redundant I know, stay with me)
def star_to_sentiment(x):
    if x >= 4:
        return "positive"
    elif x <= 2:
        return "negative"
    return "neutral"

df["sentiment"] = df["stars"].apply(star_to_sentiment)

# aggregate counts for each topic
summary = (
    df.groupby("topic_id")["sentiment"]
      .value_counts()
      .unstack(fill_value=0)                 # columns: negative/neutral/positive
      .rename(columns={
          "negative": "n_negative",
          "neutral" : "n_neutral",
          "positive": "n_positive"
      })
      .assign(total=lambda t: t.sum(axis=1)) # add a total column
      .reset_index()
      .sort_values("n_negative", ascending=False)  # surface pain-points first
)

# save new csv
summary.to_csv("topic_sentiment_summary.csv", index=False)

# test print
print("\n", summary.head(10))


 sentiment                topic_id  n_negative  n_neutral  n_positive  total
9          Wrong Customer Service         324         44          70    438
0                    Battery Life         156         48         257    461
7              Purchased Warranty         131         40          45    216
6                     Macbook Air         130         22         111    263
1                Better Touch Pad          96         60         358    514
2           Graphics Card Crashes          85         28         110    223
5                  Keyboard Great          28         23         360    411
3                    Issues Games          25         20         366    411
8                     Ssd Storage          18         27         269    314
4                      Issues Lag           8          0         222    230


## Sentiment Analysis Training

This next task involves finetuning BeRT for a sentiment analysis classification task on future unlabeled reviews from a wide variety of sources. It will output a negative, neutral, positive given a review.


2.1 Load and Encode

In [8]:
df = pd.read_csv("clean_reviews.csv")          # clean_text | sentiment | stars
label2id = {"negative":0, "neutral":1, "positive":2}
df["label"] = df["sentiment"].map(label2id)

# Handle Class imbalance
class_counts = df["label"].value_counts().sort_index()   # 0,1,2 order
print(class_counts)   # negative / neutral / positive

weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
weights = weights / weights.sum() * len(class_counts)    # mean≈1
print("Class weights:", weights)

label
0    1001
1     312
2    2168
Name: count, dtype: int64
Class weights: tensor([0.6424, 2.0610, 0.2966])


2.2 Tokenizer etc.

In [9]:
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def encode(batch):
    return tok(batch["clean_text"],
               truncation=True, padding="max_length", max_length=128)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Stratified 5-Fold CV with class weighted loss

In [10]:
skf     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metric_acc = load_metric("accuracy")
metric_f1w = load_metric("f1")
metric_f1m = load_metric("f1")

fold_results = []

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.cross_entropy(outputs.logits, labels,
                               weight=weights.to(model.device))
        return (loss, outputs) if return_outputs else loss

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["label"]), 1):
    print(f"\n—— Fold {fold} ——")
    train_ds = Dataset.from_pandas(df.iloc[train_idx][["clean_text","label"]]).map(encode, batched=True)
    val_ds   = Dataset.from_pandas(df.iloc[val_idx][["clean_text","label"]]).map(encode, batched=True)

    base = AutoModelForSequenceClassification.from_pretrained(
                "distilbert-base-uncased", num_labels=3)
    base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)
    model = get_peft_model(base, LoraConfig(
                r=16, lora_alpha=32, lora_dropout=0.05,
                bias="none", target_modules=["q_lin","v_lin"],
                task_type="SEQ_CLS"))

    args = TrainingArguments(
        output_dir=f"chk_fold{fold}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="no",
        fp16=True,
        report_to="none"
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(-1)
        return {
            "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
            "f1_weighted": metric_f1w.compute(predictions=preds, references=labels,
                                              average="weighted")["f1"],
            "f1_macro":    metric_f1m.compute(predictions=preds, references=labels,
                                              average="macro")["f1"]
        }

    trainer = WeightedLossTrainer(model=model, args=args,
                      train_dataset=train_ds,
                      eval_dataset=val_ds,
                      compute_metrics=compute_metrics)
    trainer.train()
    res = trainer.evaluate()
    fold_results.append(res)
    print(res)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]


—— Fold 1 ——


Map:   0%|          | 0/2784 [00:00<?, ? examples/s]

Map:   0%|          | 0/697 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.784223,0.802009,0.779202,0.582918
2,No log,0.739338,0.74462,0.767722,0.610349
3,0.820500,0.734908,0.746055,0.771056,0.616187


{'eval_loss': 0.7349084615707397, 'eval_accuracy': 0.7460545193687231, 'eval_f1_weighted': 0.771055695438054, 'eval_f1_macro': 0.6161869129619058, 'eval_runtime': 0.7855, 'eval_samples_per_second': 887.344, 'eval_steps_per_second': 56.016, 'epoch': 3.0}

—— Fold 2 ——


Map:   0%|          | 0/2785 [00:00<?, ? examples/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.780669,0.758621,0.781991,0.632948
2,No log,0.750028,0.737069,0.766436,0.613441
3,0.815300,0.743273,0.738506,0.77178,0.623187


{'eval_loss': 0.7432729601860046, 'eval_accuracy': 0.7385057471264368, 'eval_f1_weighted': 0.7717798113913983, 'eval_f1_macro': 0.6231866479683185, 'eval_runtime': 0.811, 'eval_samples_per_second': 858.209, 'eval_steps_per_second': 54.255, 'epoch': 3.0}

—— Fold 3 ——


Map:   0%|          | 0/2785 [00:00<?, ? examples/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.780073,0.765805,0.773017,0.611226
2,No log,0.755695,0.762931,0.778031,0.633919
3,0.812600,0.753953,0.758621,0.778598,0.63094


{'eval_loss': 0.7539530992507935, 'eval_accuracy': 0.7586206896551724, 'eval_f1_weighted': 0.7785980371559432, 'eval_f1_macro': 0.6309398964624163, 'eval_runtime': 0.9139, 'eval_samples_per_second': 761.577, 'eval_steps_per_second': 48.146, 'epoch': 3.0}

—— Fold 4 ——


Map:   0%|          | 0/2785 [00:00<?, ? examples/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.756142,0.770115,0.774403,0.607601
2,No log,0.716415,0.744253,0.772212,0.619027
3,0.821600,0.709155,0.744253,0.773605,0.625113


{'eval_loss': 0.7091549634933472, 'eval_accuracy': 0.7442528735632183, 'eval_f1_weighted': 0.7736050724861346, 'eval_f1_macro': 0.6251129398719842, 'eval_runtime': 0.8612, 'eval_samples_per_second': 808.207, 'eval_steps_per_second': 51.094, 'epoch': 3.0}

—— Fold 5 ——


Map:   0%|          | 0/2785 [00:00<?, ? examples/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.762832,0.768678,0.770971,0.589564
2,No log,0.737818,0.788793,0.796276,0.625549
3,0.822000,0.726196,0.771552,0.789762,0.620457


{'eval_loss': 0.7261963486671448, 'eval_accuracy': 0.771551724137931, 'eval_f1_weighted': 0.7897621081601568, 'eval_f1_macro': 0.6204570314851727, 'eval_runtime': 0.8449, 'eval_samples_per_second': 823.784, 'eval_steps_per_second': 52.078, 'epoch': 3.0}


Summary

In [11]:
cv_df = pd.DataFrame(fold_results)
print("\nMean across folds:")
print(cv_df[["eval_accuracy","eval_f1_weighted","eval_f1_macro"]].mean())


Mean across folds:
eval_accuracy       0.751797
eval_f1_weighted    0.776960
eval_f1_macro       0.623177
dtype: float64


Final Training on all data

In [12]:
class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.cross_entropy(outputs.logits, labels,
                               weight=weights.to(model.device))
        return (loss, outputs) if return_outputs else loss

full_ds = Dataset.from_pandas(df[["clean_text","label"]]).map(encode, batched=True)

base = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3)
base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)
model = get_peft_model(base, LoraConfig(
            r=16, lora_alpha=32, lora_dropout=0.05,
            bias="none", target_modules=["q_lin","v_lin"],
            task_type="SEQ_CLS"))


trainer = WeightedLossTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="sentiment_final",
        per_device_train_batch_size=16,
        num_train_epochs=3,
        fp16=True,
        save_total_limit=1,
        report_to="none"
    ),
    train_dataset=full_ds
)
trainer.train()
model.save_pretrained("sentiment_final")
tok.save_pretrained("sentiment_final")
print("Final model saved to sentiment_final/")

Map:   0%|          | 0/3481 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
500,0.8157


Final model saved to sentiment_final/


## Demo

In [21]:
from peft import PeftModel

MODEL_DIR = "sentiment_final"
BASE_CKPT = "distilbert-base-uncased"  # same backbone used during training
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"

# load tokenzier
tok = AutoTokenizer.from_pretrained(BASE_CKPT)

# Frozen base model with 3 labels
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_CKPT,
    num_labels=3
).to(DEVICE)

# LoRA adapter
model = PeftModel.from_pretrained(base_model, MODEL_DIR).to(DEVICE)
model.eval()

id2label = {0: "negative", 1: "neutral", 2: "positive"}

def predict_sent(texts):
    enc = tok(texts, padding=True, truncation=True, max_length=128,
              return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**enc).logits
    preds = logits.argmax(-1).cpu().tolist()
    return [id2label[p] for p in preds]

# Demo
sample_reviews = [
    "Battery dies after 40 minutes – extremely disappointed.",
    "Works great for everyday tasks. But then it gets too hot, not terrible though.",
    "Absolutely love the performance and build quality! 10/10 purchase.",
    "Keyboard is okay, but the fan noise gets a bit loud under load.",
    "Arrived DOA: black screen and constant beeping. Had to return it."
]

results = predict_sent(sample_reviews)
pd.DataFrame({"review_text": sample_reviews,
              "predicted_sentiment": results})

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,review_text,predicted_sentiment
0,Battery dies after 40 minutes – extremely disa...,negative
1,Works great for everyday tasks. But then it ge...,neutral
2,Absolutely love the performance and build qual...,positive
3,"Keyboard is okay, but the fan noise gets a bit...",neutral
4,Arrived DOA: black screen and constant beeping...,negative
