# Fine-Tuning DistilBERT
The goal is to fine-tune DistilBERT to predict sentiment on the Sentiment140 Twitter dataset.

## About Dataset
#### Context
This is the Sentiment140 dataset. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 4 = positive) and can be used to detect sentiment.

#### Content
It contains the following 6 fields:

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

ids: the id of the tweet (2087)

date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

flag: The query (lyx). If there is no query, then this value is NO_QUERY.

user: the user that tweeted (robotickilldozr)

text: the text of the tweet (Lyx is cool)

#### Acknowledgements
The official link regarding the dataset with resources about how it was generated is here
The official paper detailing the approach is here

#### Citation: 
Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(2009), p.12.



In [12]:
# Sanity check: show the column names of both splits and one raw training row.
train_columns = train_dataset.column_names
test_columns = test_dataset.column_names
first_train_example = train_dataset[0]

print("Train dataset columns:", train_columns)
print("Test dataset columns:", test_columns)
print("First train example:", first_train_example)

Train dataset columns: ['labels', 'input_ids', 'attention_mask']
Test dataset columns: ['labels', 'input_ids', 'attention_mask']
First train example: {'labels': 1, 'input_ids': [101, 2074, 2513, 2013, 7873, 2777, 1012, 2986, 2396, 2265, 1010, 13366, 16294, 4221, 2135, 1037, 3459, 2846, 1997, 2147, 1010, 4169, 1999, 3327, 2001, 6581, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [14]:
from datasets import Dataset, Features, ClassLabel, Value
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

# ---------------------------------------------------------------------------
# 1. Load and preprocess data
# ---------------------------------------------------------------------------
# Column names are supplied explicitly through `names=`.
path = "c:\\Users\\Alex Chung\\Documents\\the_Lab\\Portfolio\\ml_engineering\\data\\sentiment140\\"
file = "training.1600000.processed.noemoticon.csv"
csv_columns = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(f"{path}{file}", encoding="ISO-8859-1", names=csv_columns)

# Keep only label + text, draw a reproducible 10,000-row sample, and remap
# the raw polarity codes (0 / 4) to class ids (0 / 1).
df = (
    df[["target", "text"]]
    .sample(10000, random_state=42)
    .assign(target=lambda frame: frame["target"].map({0: 0, 4: 1}))
    .reset_index(drop=True)
)

# ---------------------------------------------------------------------------
# 2. Build the Hugging Face dataset
# ---------------------------------------------------------------------------
# `target` is declared as a ClassLabel so the split below can stratify on it.
label_feature = ClassLabel(names=["negative", "positive"])
text_feature = Value(dtype="string")
features = Features({"target": label_feature, "text": text_feature})
dataset = Dataset.from_pandas(df, features=features)

# Class balance of the sampled dataframe.
print("Original Label Distribution:")
print(df["target"].value_counts(normalize=True))

# ---------------------------------------------------------------------------
# 3. Tokenize
# ---------------------------------------------------------------------------
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of tweets with truncation and without padding.

    Padding is left to the data collator created further down.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding=False)


# The raw text column is dropped once it has been tokenized.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Rename the label column from "target" to "labels".
tokenized_dataset = tokenized_dataset.rename_column("target", "labels")

print("Tokenized dataset columns:", tokenized_dataset.column_names)

# ---------------------------------------------------------------------------
# 4. Stratified 80/20 split
# ---------------------------------------------------------------------------
train_test = tokenized_dataset.train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column="labels",
)
train_dataset, test_dataset = train_test["train"], train_test["test"]

# Report sizes, columns and per-split label proportions.
print(f"\nTrain size: {len(train_dataset)}, Test size: {len(test_dataset)}")
print("Train dataset columns:", train_dataset.column_names)
print("Test dataset columns:", test_dataset.column_names)
train_dist = pd.Series(train_dataset["labels"]).value_counts(normalize=True)
test_dist = pd.Series(test_dataset["labels"]).value_counts(normalize=True)
print("Train Label Distribution:")
print(train_dist)
print("Test Label Distribution:")
print(test_dist)

# One tokenized training row for visual inspection.
print("\nFirst Train Example:")
print(train_dataset[0])

# ---------------------------------------------------------------------------
# 5. Trainer setup
# ---------------------------------------------------------------------------


def compute_accuracy(eval_pred):
    """Return the fraction of argmax predictions that equal the label ids."""
    predicted_classes = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": (predicted_classes == eval_pred.label_ids).mean()}


model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
# Sequences were tokenized without padding, so the collator pads each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
training_args = TrainingArguments(
    output_dir="C:/Users/Alex Chung/Documents/ml_engineering_clean/results",
    logging_dir="C:/Users/Alex Chung/Documents/ml_engineering_clean/logs",
    logging_steps=100,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",  # Changed from eval_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    compute_metrics=compute_accuracy,
)

# ---------------------------------------------------------------------------
# 6. Train
# ---------------------------------------------------------------------------
trainer.train()

Original Label Distribution:
target
0    0.5004
1    0.4996
Name: proportion, dtype: float64




Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenized dataset columns: ['labels', 'input_ids', 'attention_mask']

Train size: 8000, Test size: 2000
Train dataset columns: ['labels', 'input_ids', 'attention_mask']
Test dataset columns: ['labels', 'input_ids', 'attention_mask']
Train Label Distribution:
0    0.500375
1    0.499625
Name: proportion, dtype: float64
Test Label Distribution:
0    0.5005
1    0.4995
Name: proportion, dtype: float64

First Train Example:
{'labels': 1, 'input_ids': [101, 2074, 2513, 2013, 7873, 2777, 1012, 2986, 2396, 2265, 1010, 13366, 16294, 4221, 2135, 1037, 3459, 2846, 1997, 2147, 1010, 4169, 1999, 3327, 2001, 6581, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4269,0.444462,0.793
2,0.2867,0.489853,0.8075
3,0.129,0.739199,0.8025


TrainOutput(global_step=1500, training_loss=0.30671591504414875, metrics={'train_runtime': 4676.5072, 'train_samples_per_second': 5.132, 'train_steps_per_second': 0.321, 'total_flos': 267277814425728.0, 'train_loss': 0.30671591504414875, 'epoch': 3.0})

In [15]:
# Score the trainer's model on its evaluation split, print the metrics dict,
# then write the model to the final output directory.
final_model_dir = "C:/Users/Alex Chung/Documents/ml_engineering_clean/final_model"

eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

trainer.save_model(final_model_dir)

Evaluation Results: {'eval_loss': 0.4444619417190552, 'eval_accuracy': 0.793, 'eval_runtime': 114.8718, 'eval_samples_per_second': 17.411, 'eval_steps_per_second': 1.088, 'epoch': 3.0}


## 1. Loading and Inspecting Data

In [None]:
# Read the full Sentiment140 CSV; column names are passed explicitly via `names=`.
path = "c:\\Users\\Alex Chung\\Documents\\the_Lab\\Portfolio\\ml_engineering\\data\\sentiment140\\"
csv_name = "training.1600000.processed.noemoticon.csv"
csv_columns = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(path + csv_name, encoding="ISO-8859-1", names=csv_columns)

In [None]:
# Preview the first rows of the loaded dataframe (displayed as the cell output).
df.head()

In [None]:
# Print pandas' structural summary of the dataframe via DataFrame.info().
df.info()

In [None]:
# Count how many rows carry each raw target value.
df["target"].value_counts()

In [None]:
# Show the first two tweet texts as a plain Python list.
df["text"].to_list()[:2]

## 2. Preprocessing

In [None]:
# Build the working subset in one chain:
#   1. keep only the label and text columns,
#   2. draw a reproducible 10,000-row sample (subset for speed),
#   3. remap the raw labels 0 / 4 to 0 / 1,
#   4. reset the index to avoid __index_level_0__.
df = (
    df[["target", "text"]]
    .sample(10000, random_state=42)
    .assign(target=lambda frame: frame["target"].map({0: 0, 4: 1}))
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of df as it stands after the preprocessing step.
df.head()

In [None]:
# Describe the schema explicitly: `target` is a two-class label
# (0 = negative, 1 = positive) and `text` is a plain string.
label_feature = ClassLabel(names=["negative", "positive"])
text_feature = Value("string")
features = Features({"target": label_feature, "text": text_feature})

# Convert the pandas dataframe into a Hugging Face Dataset using that schema.
dataset = Dataset.from_pandas(df, features=features)

In [None]:
# Tokenize every tweet, with truncation and "max_length" padding.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(examples):
    """Tokenize a batch of tweets with truncation and max-length padding."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


tokenized_dataset = dataset.map(tokenize_function, batched=True)

### Checking the tokenized dataset

In [None]:
# 1. Dataset overview: the dataset summary, then its column names.
print("Dataset Info:", tokenized_dataset, sep="\n")
column_names = tokenized_dataset.column_names
print("Columns:", column_names)

In [None]:
# 2. Single example: fetch row 0 once, then print it under a heading.
first_example = tokenized_dataset[0]
print("First Example:")
print(first_example)

In [None]:
# 3. Multiple examples as table: first five rows converted to pandas,
#    shown with a fixed column order.
preview_columns = ['text', 'target', 'input_ids', 'attention_mask']
print("First 5 Examples:")
first_five = tokenized_dataset.select(range(5))
df_tokenized = first_five.to_pandas()
df_tokenized[preview_columns]

In [None]:
# 4. Decode tokens: turn row 0's ids back into text (special tokens skipped)
#    and print it next to the original tweet.
print("Decoded Example:")
sample = tokenized_dataset[0]
original_text = sample['text']
token_ids = sample['input_ids']
decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
print(f"Original: {original_text}")
print(f"Decoded: {decoded_text}")

In [None]:
# 5. Verify lengths: collect the token count of every row and check that
#    each one equals 512.
lengths = list(map(len, tokenized_dataset['input_ids']))
all_are_512 = all(length == 512 for length in lengths)
print(f"\nAll lengths 512? {all_are_512}")

In [None]:
# 6. Check labels: the distinct values present in the `target` column.
label_column = tokenized_dataset['target']
unique_labels = {*label_column}
print(f"Labels: {unique_labels}")

In [None]:
# 7. Inspect attention mask for `sample` (set in an earlier cell):
#    the mask sum is reported as the number of non-padding tokens.
attention_mask = sample['attention_mask']
input_ids = sample['input_ids']
token_count = sum(attention_mask)
first_ten_ids = input_ids[:10]
first_ten_mask = attention_mask[:10]

print("Attention Mask Example:")
print(f"Non-padding tokens: {token_count}")
print(f"First 10 input_ids: {first_ten_ids}")
print(f"First 10 attention_mask: {first_ten_mask}")

### Splitting the tokenized dataset into stratified train and test sets

In [None]:
# Stratified train/test split: 80/20, seeded, stratified on the `target` column.
split_options = {"test_size": 0.2, "seed": 42, "stratify_by_column": "target"}
train_test = tokenized_dataset.train_test_split(**split_options)
train_dataset, test_dataset = train_test["train"], train_test["test"]

# Verify sizes
n_train, n_test = len(train_dataset), len(test_dataset)
print(f"Train size: {n_train}, Test size: {n_test}")

In [None]:
# Verify split balance: print split sizes, then the normalized label
# frequencies of each split.
print(f"\nTrain size: {len(train_dataset)}, Test size: {len(test_dataset)}")
train_dist = pd.Series(train_dataset["target"]).value_counts(normalize=True)
test_dist = pd.Series(test_dataset["target"]).value_counts(normalize=True)
for heading, distribution in (
    ("Train Label Distribution:", train_dist),
    ("Test Label Distribution:", test_dist),
):
    print(heading)
    print(distribution)

In [None]:
# Verify sequence lengths: every row in both splits is checked against 512.
train_lengths = list(map(len, train_dataset['input_ids']))
test_lengths = list(map(len, test_dataset['input_ids']))
train_all_512 = all(length == 512 for length in train_lengths)
test_all_512 = all(length == 512 for length in test_lengths)
print(f"\nTrain lengths 512? {train_all_512}")
print(f"Test lengths 512? {test_all_512}")

In [None]:
# Bare expression: only displays the `Dataset` object as the cell output;
# nothing is assigned or modified.
Dataset

In [None]:
# Split the dataframe 80/20 into train and test sets, stratified on `target`
# and seeded for reproducibility.
# NOTE(review): `train_test_split` is not imported in any visible cell —
# presumably sklearn.model_selection.train_test_split; confirm it is imported
# before this cell runs.
train, test = train_test_split(df, test_size=0.2, random_state=42, stratify=df['target'])

# Report both shapes. The second shape belongs to the test split (the
# original message labelled it "train" a second time).
print(f"New stratefied dataframe shapes: train is {train.shape}, test is {test.shape}")

In [None]:
# Per-class row counts in the training dataframe.
print("Train target counts:")
train["target"].value_counts()

In [None]:
# Per-class row counts in the test dataframe.
print("Test target counts:")
test["target"].value_counts()

## 3. Model Training

In [None]:
from datasets import Dataset, Features, ClassLabel, Value
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments

path = "c:\\Users\\Alex Chung\\Documents\\the_Lab\\Portfolio\\ml_engineering\\data\\sentiment140\\"

# Load and preprocess data
# Column names are passed explicitly via `names=`.
df = pd.read_csv(path+"training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", names=["target", "id", "date", "flag", "user", "text"])
df = df[["target", "text"]].sample(10000, random_state=42)  # Reproducible 10,000-row subset
df["target"] = df["target"].map({0: 0, 4: 1})  # Raw labels 0 / 4 -> class ids 0 / 1
df = df.reset_index(drop=True)  # Avoid __index_level_0__

# Define dataset features
# `target` is a ClassLabel so the split below can stratify on it.
features = Features({
    "target": ClassLabel(names=["negative", "positive"]),
    "text": Value(dtype="string")
})
dataset = Dataset.from_pandas(df, features=features)

# Check original distribution
print("Original Label Distribution:")
print(df["target"].value_counts(normalize=True))

# Tokenize
# padding="max_length" pads every sequence to a fixed length; the check
# further down expects that length to be 512.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_function(examples):
    """Tokenize a batch of tweets with truncation and max-length padding."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Rename target to labels.
# Trainer drops dataset columns the model's forward() does not accept, so a
# column called "target" would be removed and the model would receive no
# labels, hence no loss. "labels" is the name the model expects.
tokenized_dataset = tokenized_dataset.rename_column("target", "labels")

# Stratified split
train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column="labels")
train_dataset = train_test["train"]
test_dataset = train_test["test"]

# Verify split
print(f"\nTrain size: {len(train_dataset)}, Test size: {len(test_dataset)}")
train_dist = pd.Series(train_dataset["labels"]).value_counts(normalize=True)
test_dist = pd.Series(test_dataset["labels"]).value_counts(normalize=True)
print("Train Label Distribution:")
print(train_dist)
print("Test Label Distribution:")
print(test_dist)

# Inspect tokenized dataset
# The "text" column is still present here, so the decoded ids can be
# compared against the original tweet.
print("\nFirst Train Example:")
print(train_dataset[0])
sample = train_dataset[0]
decoded_text = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
print(f"Original: {sample['text']}")
print(f"Decoded: {decoded_text}")
train_lengths = [len(sample['input_ids']) for sample in train_dataset]
print(f"Train lengths 512? {all(length == 512 for length in train_lengths)}")

# Set up Trainer
# Evaluation and checkpointing both happen once per epoch, and the best
# checkpoint is reloaded when training finishes.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=100,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # Accuracy = share of argmax predictions that match the label ids.
    compute_metrics=lambda eval_pred: {
        "accuracy": (eval_pred.predictions.argmax(axis=1) == eval_pred.label_ids).mean()
    }
)

# Train the model
trainer.train()

In [None]:
from transformers import DistilBertForSequenceClassification
# Smoke test: instantiate the two-label classification model and confirm
# that the call returned.
checkpoint_name = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)
print("Model loaded successfully")

In [None]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

# Minimal training run without a metrics callback. It relies on
# `train_dataset` and `test_dataset` created by an earlier cell.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Evaluate and checkpoint once per epoch; reload the best checkpoint at the end.
training_config = {
    "output_dir": "./results",
    "num_train_epochs": 3,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "load_best_model_at_end": True,
}
training_args = TrainingArguments(**training_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

In [None]:
from transformers import pipeline

In [None]:
# Build a sentiment-analysis pipeline.
# NOTE(review): no `model=` is passed, so this presumably loads the library's
# default checkpoint rather than the fine-tuned model from this notebook — confirm.
classifier = pipeline("sentiment-analysis")

In [None]:
# Run the pipeline on one hand-written sentence; the result is shown as the cell output.
classifier("I've been waiting for a HuggingFace course my whole life.")

In [None]:
from transformers import pipeline
# Same demo as a single cell: build the sentiment-analysis pipeline, then
# classify one hand-written sentence.
classifier = pipeline("sentiment-analysis")
classifier("I've been waiting for a HuggingFace course my whole life.")

In [None]:
from transformers import pipeline

# Build the sentiment-analysis pipeline and classify the first two tweet
# texts from `df` in one call.
classifier = pipeline("sentiment-analysis")
classifier(df.text.to_list()[:2])

## 4. Evaluation

## 1. Loading Data