In [10]:
import sys
import os
from pathlib import Path

project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

In [2]:
import torch
from torch.utils.data import Dataset

print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "None")

CUDA available: True
GPU name: NVIDIA GeForce RTX 4060 Laptop GPU


In [3]:
from src.data_loader import load_data
from transformers import DistilBertTokenizerFast

X_train, X_test, y_train, y_test = load_data()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

def tokenize(texts):
    return tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=256
    )

train_enc = tokenize(X_train.tolist())
test_enc = tokenize(X_test.tolist())


In [5]:
class BERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_ds = BERTDataset(train_enc, y_train.tolist())
test_ds = BERTDataset(test_enc, y_test.tolist())


In [6]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [8]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

training_args = TrainingArguments(
    output_dir="./bert_results",

    # âœ… legacy-compatible evaluation
    do_train=True,
    do_eval=True,
    eval_steps=500,
    save_steps=500,

    # training params
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=2,
    weight_decay=0.01,

    # performance
    fp16=True,
    logging_steps=100,

    save_total_limit=1,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics
)

trainer.train()


metrics = trainer.evaluate()
bert_acc = metrics["eval_accuracy"]
print(f"BERT Test Accuracy: {bert_acc:.4f}")

Step,Training Loss
100,0.4593
200,0.3215
300,0.3044
400,0.2939
500,0.2969
600,0.2709
700,0.2827
800,0.2871
900,0.2713
1000,0.2629


BERT Test Accuracy: 0.9244


In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

model.eval()
preds, labels = [], []

with torch.no_grad():
    for batch in test_ds:
        inputs = {k: v.unsqueeze(0).to(device) for k, v in batch.items() if k != "labels"}
        label = batch["labels"].item()

        outputs = model(**inputs)
        pred = torch.argmax(outputs.logits, dim=1).item()

        preds.append(pred)
        labels.append(label)

acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds)

print(f"DistilBERT Accuracy: {acc:.4f}")
print(f"DistilBERT F1: {f1:.4f}")


In [11]:
model_path = os.path.join(project_root, "models", "final_distilbert_model")
tokenizer.save_pretrained(model_path)

('C:\\jack\\study\\e2eprojects\\movie_sentiment_analysis\\models\\final_distilbert_model\\tokenizer_config.json',
 'C:\\jack\\study\\e2eprojects\\movie_sentiment_analysis\\models\\final_distilbert_model\\special_tokens_map.json',
 'C:\\jack\\study\\e2eprojects\\movie_sentiment_analysis\\models\\final_distilbert_model\\vocab.txt',
 'C:\\jack\\study\\e2eprojects\\movie_sentiment_analysis\\models\\final_distilbert_model\\added_tokens.json',
 'C:\\jack\\study\\e2eprojects\\movie_sentiment_analysis\\models\\final_distilbert_model\\tokenizer.json')