RUN THIS NOTEBOOK TO FINE-TUNE BERT

In [5]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset

In [None]:
# Load CSV (Sentiment140). The file has no header row; only column 0
# (sentiment code) and column 5 (tweet text) are needed, so just those are read.
df = pd.read_csv(
    "dataset/training.1600000.processed.noemoticon.csv",
    encoding='latin-1',
    header=None,
    usecols=[0, 5],
)
df.columns = ["label", "text"]

# Map: 0 = negative, 2 = neutral, 4 = positive
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below is not a chained assignment (no SettingWithCopyWarning).
df = df[df['label'].isin([0, 2, 4])].copy()
df['label'] = df['label'].map({0: 0, 2: 1, 4: 2})
# NOTE(review): confirm that code 2 (neutral) actually occurs in this file;
# if it does not, class id 1 of the 3-way classifier never receives examples.

# Optional: Reduce size for speed
# Clamp the sample size so a smaller input file does not raise ValueError.
df = df.sample(n=min(50000, len(df)), random_state=42).reset_index(drop=True)

df.head()

Unnamed: 0,label,text
0,0,@chrishasboobs AHHH I HOPE YOUR OK!!!
1,0,"@misstoriblack cool , i have no tweet apps fo..."
2,0,@TiannaChaos i know just family drama. its la...
3,0,School email won't open and I have geography ...
4,0,upper airways problem


In [7]:
# Train-validation split (90% / 10%).
# The split is seeded (same seed as the sampling step) so that re-running the
# notebook yields the same validation set and comparable eval losses.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.1,
    random_state=42,
)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text: sequences longer than 128 tokens are truncated, shorter ones
# are padded to the longest sequence within each call.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
class TweetDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping that provides 'input_ids' and 'attention_mask',
            each indexable by sample position.
        labels: Sequence of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` as a dict.

        The dict carries 'input_ids', 'attention_mask' and 'labels',
        each converted with ``torch.tensor``.
        """
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its labels in a TweetDataset
train_dataset = TweetDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TweetDataset(encodings=val_encodings, labels=val_labels)


In [9]:
# Load pre-trained BERT for classification.
# The class names follow the label mapping used when the data was prepared
# (0 = negative, 1 = neutral, 2 = positive) and are stored in the model config
# so that the saved model reports readable labels.
id2label = {0: "negative", 1: "neutral", 2: "positive"}
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
    id2label=id2label,
    label2id={name: idx for idx, name in id2label.items()},
)

# Define training args
training_args = TrainingArguments(
    output_dir="./results",                 # Where to save checkpoints
    fp16=torch.cuda.is_available(),         # Mixed precision only when a CUDA GPU is present
    num_train_epochs=2,                     # Number of epochs
    per_device_train_batch_size=16,         # Training batch size
    per_device_eval_batch_size=32,          # Eval batch size
    eval_strategy="epoch",                  # Evaluate every epoch
    save_strategy="epoch",                  # Save every epoch (must match eval_strategy for best-model reload)
    logging_dir="./logs",                   # Directory for logs
    logging_steps=10,                       # Log every 10 steps
    logging_strategy="steps",               # Ensure logging is step-based
    report_to="none",                       # Prevent WandB logging (optional)
    load_best_model_at_end=True,            # Reload best model at the end
    metric_for_best_model="eval_loss",      # Use eval loss to determine best
    greater_is_better=False                 # Lower eval loss is better
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Bundle the model, its training configuration and both dataset splits,
# then run fine-tuning; the training summary is shown as the cell output.
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=val_dataset)

trainer.train()


Epoch,Training Loss,Validation Loss
1,0.433,0.37632
2,0.2841,0.436147


TrainOutput(global_step=5626, training_loss=0.3310575339237312, metrics={'train_runtime': 744.7, 'train_samples_per_second': 120.854, 'train_steps_per_second': 7.555, 'total_flos': 5920051898880000.0, 'train_loss': 0.3310575339237312, 'epoch': 2.0})

In [11]:
# Write the tokenizer files and the model weights into one directory
save_path = "model/bert_sentiment_model"
tokenizer.save_pretrained(save_path)
model.save_pretrained(save_path)

print(f"Model saved to {save_path}")


Model saved to model/bert_sentiment_model
