<a href="https://colab.research.google.com/github/jgphelan/DellBERT/blob/main/FusionTech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

FileNotFoundError: [Errno 2] No such file or directory: '-f'

## Environment

In [3]:
!pip uninstall -y tsfresh thinc gensim pyLDAvis

!pip install --no-cache-dir -q \
      "numpy==2.0.0" \
      "scipy==1.14.0" \
      "scikit-learn==1.5.0"

import numpy, scipy, sklearn, pandas as pd


import torch, platform, transformers, pandas as pd
print("PyTorch CUDA? ", torch.cuda.is_available(),
      "| GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

[0mPyTorch CUDA?  True | GPU: Tesla T4


## Data Loading, Pre-Processing, Cleaning





In [4]:
!pip install -q pandas emoji==2.11.0 unidecode langdetect
import pandas as pd, re, emoji, string, gc
from unidecode import unidecode
from langdetect import detect, LangDetectException

df = pd.read_csv("sample_data/FusionTech Online Reviews Data Set.csv")[["text", "rating"]] \
       .rename(columns={"text":"review_text", "rating":"stars"})

# basic filters
df.dropna(subset=["review_text", "stars"], inplace=True)
df.drop_duplicates(subset="review_text", inplace=True)

# english filtering

def is_english(txt: str) -> bool:
    try:
        return detect(txt) == "en"
    except LangDetectException:          # empty / gibberish strings
        return False

df = df[df["review_text"].apply(is_english)].reset_index(drop=True)

# emoji / non-ASCII stripper
def normalise(t):
    t = unidecode(str(t))                # strips accents
    t = emoji.replace_emoji(t, "")       # remove emojis
    t = t.lower()
    t = re.sub(r"https?://\S+", "", t)   # URLs
    t = re.sub(r"[^\w\s" + re.escape(string.punctuation) + "]", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t

df["clean_text"] = df["review_text"].apply(normalise)
df["token_cnt"]  = df["clean_text"].str.split().str.len()
df = df[df["token_cnt"].between(5, 512)].reset_index(drop=True)

# three-class sentiment labels
df["sentiment"] = df["stars"].apply(
        lambda x: "positive" if x >= 4 else "negative" if x <= 2 else "neutral"
)

df[["clean_text","sentiment","stars"]].to_csv("clean_reviews.csv", index=False)
print("Saved:", df.shape)
del df; gc.collect()

Saved: (3484, 5)


20

EDA

Topics

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# load data
df = pd.read_csv("clean_reviews.csv")

# extra SW
extra_sw = ["fusiontech", "br", "amazon", "abc"]
# doc term matrix
cv = CountVectorizer(lowercase=True, stop_words=list(extra_sw) + ["english"], min_df=10, max_df=0.40)

dtm = cv.fit_transform(df["clean_text"]) # shape (n_docs, n_terms)
vocab = cv.get_feature_names_out()

# fit a 10-topic LDA model

lda = LatentDirichletAllocation(n_components=10, learning_method="batch", max_iter=20, random_state=42, n_jobs=-1)
doc_topic = lda.fit_transform(dtm)

# attach dom topic to each review
df["topic_id"]   = doc_topic.argmax(axis=1)    # index of highest-prob topic
df["topic_prob"] = doc_topic.max(axis=1)       # its probability

df.to_csv("with_topics.csv", index=False)
print("Saved with_topics.csv  →", df.shape, "rows")

def print_topics(model, vocab, topn=10):
    for k, comp in enumerate(model.components_):
        words = vocab[np.argsort(comp)[-topn:]][::-1]
        print(f"Topic {k:2d}: {'  '.join(words)}")

print_topics(lda, vocab)


Saved with_topics.csv  → (3484, 5) rows
Topic  0: you  if  can  are  be  so  get  have  at  gaming
Topic  1: my  have  computer  was  as  so  had  use  when  or
Topic  2: was  had  they  me  computer  my  have  after  back  support
Topic  3: has  have  after  months  34  use  my  as  power  screen
Topic  4: games  gaming  at  can  play  you  settings  run  game  high
Topic  5: ssd  drive  ram  have  an  as  has  my  are  you
Topic  6: screen  very  great  keyboard  good  nice  was  quality  fast  my
Topic  7: my  great  so  love  computer  very  good  fast  far  he
Topic  8: drivers  was  after  driver  from  issues  windows  had  screen  wifi
Topic  9: you  have  was  my  computer  one  if  be  buy  what


## LLM Naming Topics based on the LDA above

Auto-named each time LDA runs

In [6]:
# ── one-time install (skip if already run) ────────────────────
!pip install -q keybert sentence-transformers

from keybert import KeyBERT
import numpy as np, pandas as pd, re

kw_model = KeyBERT("all-MiniLM-L6-v2")

# words we never want in the final topic label
generic_sw = {
    "gaming", "game", "laptop", "laptops", "computer",
    "fusiontech", "amazon", "pc", "device", "andromeda"
}

# filler & verb stop-words for cleanup
filler   = {"and", "with", "has", "have", "get", "the", "a", "an"}
verb_sw  = {"run", "runs", "running", "buy", "bought"}

# shortens keywords to 2-3
def polish(raw: str) -> str:
    words = [w for w in raw.lower().split() if w not in filler]
    # remove leading verbs
    while words and words[0] in verb_sw:
        words.pop(0)
    # Ensure 2 words
    if len(words) < 2:
        return ""
    # swap order for patterns
    if len(words) == 2 and words[1].endswith("ed"):
        words = words[::-1]
    return " ".join(w.title() for w in words[:3])   # max 3 words

vocab   = cv.get_feature_names_out()
labels  = {}

for k in range(lda.n_components):
    # one best-fit review for topic k
    best_idx  = doc_topic[:, k].argmax()
    best_text = df.loc[best_idx, "clean_text"]
    # KeyBERT --> raw phrase
    raw_phrase = kw_model.extract_keywords(
                    best_text,
                    keyphrase_ngram_range=(1, 3),
                    stop_words=list(generic_sw) + ["english"],
                    top_n=1
                 )[0][0]
    # polish + fallback
    phrase = polish(raw_phrase)
    if len(phrase.split()) < 2:
        top = [w for w in vocab[lda.components_[k].argsort()[::-1]]
               if w not in generic_sw][:2]
        phrase = " ".join(w.title() for w in top)

    labels[k] = phrase or f"Topic {k}"

print(labels)

# map into CSV + save
df["topic_id"] = df["topic_id"].astype(int).map(labels)
df.to_csv("with_topics.csv", index=False)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.4/41.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m112.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

{0: 'Better Touch Pad', 1: 'Battery Life', 2: 'Wrong Customer Service', 3: 'Purchased Warranty', 4: 'Powerful Graphics Card', 5: 'Ssd Storage', 6: 'Sound Is Great', 7: 'Issues Lag', 8: 'Graphics Card Crashes', 9: 'Deciding To Buy'}


## Aggregate Topics
Produces summary .CSV for Customer Service Team

In [7]:
df = pd.read_csv("with_topics.csv")

# mapping for this file (redundant I know, stay with me)
def star_to_sentiment(x):
    if x >= 4:
        return "positive"
    elif x <= 2:
        return "negative"
    return "neutral"

df["sentiment"] = df["stars"].apply(star_to_sentiment)

# aggregate counts for each topic
summary = (
    df.groupby("topic_id")["sentiment"]
      .value_counts()
      .unstack(fill_value=0)                 # columns: negative/neutral/positive
      .rename(columns={
          "negative": "n_negative",
          "neutral" : "n_neutral",
          "positive": "n_positive"
      })
      .assign(total=lambda t: t.sum(axis=1)) # add a total column
      .reset_index()
      .sort_values("n_negative", ascending=False)  # surface pain-points first
)

# save new csv
summary.to_csv("topic_sentiment_summary.csv", index=False)

# test print
print("\n", summary.head(10))


 sentiment                topic_id  n_negative  n_neutral  n_positive  total
9          Wrong Customer Service         387         44          67    498
0                    Battery Life         151         54         279    484
6              Purchased Warranty         125         43          49    217
1                Better Touch Pad          97         49         330    476
3           Graphics Card Crashes          96         25          88    209
2                 Deciding To Buy          73         25         144    242
5          Powerful Graphics Card          26         15         310    351
7                  Sound Is Great          25         29         332    386
8                     Ssd Storage          16         23         293    332
4                      Issues Lag           7          5         277    289


## Sentiment Analysis Training

This next task involves finetuning BeRT for a sentiment analysis classification task on future unlabeled reviews from a wide variety of sources. It will output a negative, neutral, positive given a review.


2.0 Env

In [8]:
!pip install -q --upgrade "transformers>=4.42.0" "accelerate>=0.29.0" \
                            "peft>=0.10.0" "datasets>=2.19.0" evaluate

import torch, pandas as pd
print("CUDA:", torch.cuda.is_available(), "| GPU:", torch.cuda.get_device_name(0))

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/365.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does n

2.1 Load and Encode

In [9]:
df = pd.read_csv("clean_reviews.csv")          # clean_text | sentiment | stars
label2id = {"negative":0, "neutral":1, "positive":2}
df["label"] = df["sentiment"].map(label2id)

# Handle Class imbalance
class_counts = df["label"].value_counts().sort_index()   # 0,1,2 order
print(class_counts)   # negative / neutral / positive

weights = 1.0 / torch.tensor(class_counts, dtype=torch.float)
weights = weights / weights.sum() * len(class_counts)    # mean≈1
print("Class weights:", weights)

label
0    1003
1     312
2    2169
Name: count, dtype: int64
Class weights: tensor([0.6414, 2.0620, 0.2966])


2.2 Tokenizer etc.

In [10]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def encode(batch):
    return tok(batch["clean_text"],
               truncation=True, padding="max_length", max_length=128)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Stratified 5-Fold CV wigth class weighted loss

In [11]:
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, Trainer,
                          TrainingArguments)
from evaluate import load as load_metric
from torch.nn import functional as F
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

skf     = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
metric_acc = load_metric("accuracy")
metric_f1w = load_metric("f1")
metric_f1m = load_metric("f1")

fold_results = []

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.cross_entropy(outputs.logits, labels,
                               weight=weights.to(model.device))
        return (loss, outputs) if return_outputs else loss

for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["label"]), 1):
    print(f"\n—— Fold {fold} ——")
    train_ds = Dataset.from_pandas(df.iloc[train_idx][["clean_text","label"]]).map(encode, batched=True)
    val_ds   = Dataset.from_pandas(df.iloc[val_idx][["clean_text","label"]]).map(encode, batched=True)

    base = AutoModelForSequenceClassification.from_pretrained(
                "distilbert-base-uncased", num_labels=3)
    base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)
    model = get_peft_model(base, LoraConfig(
                r=16, lora_alpha=32, lora_dropout=0.05,
                bias="none", target_modules=["q_lin","v_lin"],
                task_type="SEQ_CLS"))

    args = TrainingArguments(
        output_dir=f"chk_fold{fold}",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        eval_strategy="epoch",
        save_strategy="no",
        fp16=True,
        report_to="none"
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(-1)
        return {
            "accuracy": metric_acc.compute(predictions=preds, references=labels)["accuracy"],
            "f1_weighted": metric_f1w.compute(predictions=preds, references=labels,
                                              average="weighted")["f1"],
            "f1_macro":    metric_f1m.compute(predictions=preds, references=labels,
                                              average="macro")["f1"]
        }

    trainer = WeightedLossTrainer(model=model, args=args,
                      train_dataset=train_ds,
                      eval_dataset=val_ds,
                      compute_metrics=compute_metrics)
    trainer.train()
    res = trainer.evaluate()
    fold_results.append(res)
    print(res)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]


—— Fold 1 ——


Map:   0%|          | 0/2787 [00:00<?, ? examples/s]

Map:   0%|          | 0/697 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.775859,0.740316,0.754865,0.595248
2,No log,0.751992,0.76901,0.78231,0.6291
3,0.827100,0.739947,0.730273,0.761423,0.618458


{'eval_loss': 0.7399469017982483, 'eval_accuracy': 0.7302725968436155, 'eval_f1_weighted': 0.7614234416365365, 'eval_f1_macro': 0.6184582469844724, 'eval_runtime': 0.7934, 'eval_samples_per_second': 878.502, 'eval_steps_per_second': 55.458, 'epoch': 3.0}

—— Fold 2 ——


Map:   0%|          | 0/2787 [00:00<?, ? examples/s]

Map:   0%|          | 0/697 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.756354,0.784792,0.779449,0.608768
2,No log,0.724847,0.756098,0.77918,0.627296
3,0.819200,0.717857,0.757532,0.78007,0.627421


{'eval_loss': 0.7178574204444885, 'eval_accuracy': 0.757532281205165, 'eval_f1_weighted': 0.7800695821005825, 'eval_f1_macro': 0.6274205962680126, 'eval_runtime': 0.8051, 'eval_samples_per_second': 865.74, 'eval_steps_per_second': 54.652, 'epoch': 3.0}

—— Fold 3 ——


Map:   0%|          | 0/2787 [00:00<?, ? examples/s]

Map:   0%|          | 0/697 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.781451,0.76901,0.758645,0.569262
2,No log,0.743047,0.760402,0.778582,0.625732
3,0.823400,0.734718,0.758967,0.783183,0.644579


{'eval_loss': 0.7347184419631958, 'eval_accuracy': 0.7589670014347202, 'eval_f1_weighted': 0.7831833951047964, 'eval_f1_macro': 0.644578907218726, 'eval_runtime': 0.8097, 'eval_samples_per_second': 860.855, 'eval_steps_per_second': 54.344, 'epoch': 3.0}

—— Fold 4 ——


Map:   0%|          | 0/2787 [00:00<?, ? examples/s]

Map:   0%|          | 0/697 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.759333,0.737446,0.76999,0.617661
2,No log,0.729102,0.746055,0.766965,0.620621
3,0.825100,0.7109,0.751793,0.779164,0.627407


{'eval_loss': 0.7108997702598572, 'eval_accuracy': 0.7517934002869441, 'eval_f1_weighted': 0.7791642322090486, 'eval_f1_macro': 0.6274068882985291, 'eval_runtime': 0.8125, 'eval_samples_per_second': 857.856, 'eval_steps_per_second': 54.154, 'epoch': 3.0}

—— Fold 5 ——


Map:   0%|          | 0/2788 [00:00<?, ? examples/s]

Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted,F1 Macro
1,No log,0.77547,0.748563,0.764668,0.595391
2,No log,0.758891,0.731322,0.753169,0.580582
3,0.817400,0.748137,0.74569,0.772871,0.61171


{'eval_loss': 0.7481369972229004, 'eval_accuracy': 0.7456896551724138, 'eval_f1_weighted': 0.7728711244703248, 'eval_f1_macro': 0.6117102078454736, 'eval_runtime': 0.8358, 'eval_samples_per_second': 832.74, 'eval_steps_per_second': 52.644, 'epoch': 3.0}


Summary

In [12]:
cv_df = pd.DataFrame(fold_results)
print("\nMean across folds:")
print(cv_df[["eval_accuracy","eval_f1_weighted","eval_f1_macro"]].mean())


Mean across folds:
eval_accuracy       0.748851
eval_f1_weighted    0.775342
eval_f1_macro       0.625915
dtype: float64


Final Training on all data

In [13]:
from transformers import Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import torch
from torch.nn import functional as F

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss = F.cross_entropy(outputs.logits, labels,
                               weight=weights.to(model.device))
        return (loss, outputs) if return_outputs else loss

full_ds = Dataset.from_pandas(df[["clean_text","label"]]).map(encode, batched=True)

base = AutoModelForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3)
base = prepare_model_for_kbit_training(base, use_gradient_checkpointing=True)
model = get_peft_model(base, LoraConfig(
            r=16, lora_alpha=32, lora_dropout=0.05,
            bias="none", target_modules=["q_lin","v_lin"],
            task_type="SEQ_CLS"))


trainer = WeightedLossTrainer(
    model=model,
    args=TrainingArguments(
        output_dir="sentiment_final",
        per_device_train_batch_size=16,
        num_train_epochs=3,
        fp16=True,
        save_total_limit=1,
        report_to="none"
    ),
    train_dataset=full_ds
)
trainer.train()
model.save_pretrained("sentiment_final")
tok.save_pretrained("sentiment_final")
print("Final model saved to sentiment_final/")

Map:   0%|          | 0/3484 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
500,0.8119


Final model saved to sentiment_final/


## Demo

In [15]:
import torch, pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

MODEL_DIR = "sentiment_final"
BASE_CKPT = "distilbert-base-uncased"  # same backbone used during training
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"

# load tokenzier
tok = AutoTokenizer.from_pretrained(BASE_CKPT)

# Frozen base model with 3 labels
base_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_CKPT,
    num_labels=3
).to(DEVICE)

# LoRA adapter
model = PeftModel.from_pretrained(base_model, MODEL_DIR).to(DEVICE)
model.eval()

id2label = {0: "negative", 1: "neutral", 2: "positive"}

def predict_sent(texts):
    enc = tok(texts, padding=True, truncation=True, max_length=128,
              return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        logits = model(**enc).logits
    preds = logits.argmax(-1).cpu().tolist()
    return [id2label[p] for p in preds]

# Demo
sample_reviews = [
    "Battery dies after 40 minutes – extremely disappointed.",
    "Works fine for everyday tasks. Nothing special, nothing terrible.",
    "Absolutely love the performance and build quality! 10/10 purchase.",
    "Keyboard is okay, but the fan noise gets a bit loud under load.",
    "Arrived DOA: black screen and constant beeping. Had to return it."
]

results = predict_sent(sample_reviews)
pd.DataFrame({"review_text": sample_reviews,
              "predicted_sentiment": results})


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,review_text,predicted_sentiment
0,Battery dies after 40 minutes – extremely disa...,negative
1,Works fine for everyday tasks. Nothing special...,positive
2,Absolutely love the performance and build qual...,positive
3,"Keyboard is okay, but the fan noise gets a bit...",neutral
4,Arrived DOA: black screen and constant beeping...,negative
