In [None]:
!pip install evaluate

In [2]:
import evaluate
import os
import wandb
from datasets import load_dataset, concatenate_datasets
import torch
import math

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    GPT2ForSequenceClassification,
    Trainer, 
    TrainingArguments
)

2025-05-05 20:04:27.675539: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746475468.147716      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746475468.290876      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Preprocessing data

### Load dataset

In [3]:
seed = 42

dataset = load_dataset("imdb")

train_dataset = dataset["train"]
positive = train_dataset.filter(lambda x: x["label"] == 1)
negative = train_dataset.filter(lambda x: x["label"] == 0)

subset_pos = positive.shuffle(seed=seed).select(range(6250))
subset_neg = negative.shuffle(seed=seed).select(range(6250))

balanced_subset = concatenate_datasets([subset_pos, subset_neg]).shuffle(seed=seed)
dataset["train"] = balanced_subset

dataset

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12500
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [4]:
dataset["train"][0]

{'text': 'From around the time Europe began fighting World War II, until the war\'s end, Hollywood (with significant prodding from the government) made tons of movies which were designed to try and get young men to enlist in the Army, by making the life of a serviceman appear "cool." This is by far the sloppiest, implying that the life of a soldier is devoid of work, you get the best food, and you get to lie around all day listening to Ann Miller on the radio. I am far too young to have participated in WWII, but I think that there was more to it than that. There is the barest cat\'s whisker of a plot, and a bunch of musical numbers featuring some of the day\'s leading acts.<br /><br />I think that by 1943, even the most naive of civvies knew that there was more going on overseas than the wacky hijinks portrayed in this movie. While I am sure that it was meant to be viewed as escapist entertainment, I can\'t help but wonder if the family and loved ones of men fighting in the war, were a

### Tokenizer

In [5]:
model_name = "linhlinhle997/gpt2-small-c4-pretrained"

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.60M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/692 [00:00<?, ?B/s]

In [6]:
def tokenize(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)

tokenized_ds = dataset.map(tokenize, batched=True)
tokenized_ds = tokenized_ds.remove_columns(["text"])
tokenized_ds.set_format("torch")

tokenized_ds

Map:   0%|          | 0/12500 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 12500
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
})

## Load config

In [7]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {v: k for k, v in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2, id2label=id2label, label2id=label2id
)
model.config.pad_token_id = tokenizer.pad_token_id

config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/180M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at linhlinhle997/gpt2-small-c4-pretrained and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 512)
    (wpe): Embedding(512, 512)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=1536, nx=512)
          (c_proj): Conv1D(nf=512, nx=512)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=2048, nx=512)
          (c_proj): Conv1D(nf=512, nx=2048)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=512, out_features=2, bias=False)
)

## Training

In [9]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return accuracy.compute(predictions=preds, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
project_name = "gpt2-c4-imdb"
run_name = "finetuning-gpt2-small-imdb-v1"
model_name = "gpt2-small-imdb-finetuning"

os.environ["WANDB_API_KEY"] = ""
os.environ["WANDB_PROJECT"] = project_name

In [12]:
training_args = TrainingArguments(
    output_dir=model_name,
    run_name=run_name,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    save_total_limit=1,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mlinhlinh-le997[0m ([33mlinhlinh-le997-prime-labo[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.




Epoch,Training Loss,Validation Loss,Accuracy
1,0.5232,0.493288,0.7774
2,0.3271,0.445221,0.82648
3,0.1825,0.598824,0.78908
4,0.1145,0.565549,0.81508
5,0.0712,1.641583,0.7924
6,0.1061,2.280513,0.81292
7,0.0987,2.93557,0.81048
8,0.055,3.172876,0.80256
9,0.0323,3.190399,0.80332
10,0.0146,3.099252,0.80768




TrainOutput(global_step=1960, training_loss=0.1525110095131154, metrics={'train_runtime': 1739.2616, 'train_samples_per_second': 71.87, 'train_steps_per_second': 1.127, 'total_flos': 3631939584000000.0, 'train_loss': 0.1525110095131154, 'epoch': 10.0})

In [13]:
tokenizer.save_pretrained(model_name)
model.save_pretrained(model_name)

## Inference

In [15]:
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
text = "This movie was absolutely terrible and boring."

inputs = tokenizer(text, return_tensors="pt").to(model.device)

model.eval()
with torch.no_grad():
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1).item()

result = "positive" if pred == 1 else "negative"
result

'negative'