In [1]:
# !huggingface-cli login

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import Dataset
from transformers import TrainingArguments, Trainer
import optuna 


## Tokenizaiton is the practice of breaking sentences into individual words while considering punctuations within the sentence. It is able to break long words into prefix, roots, suffix, and etc. to maximize the ability for computer to recognize rare words by analyzing its Morphological meaning

In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
text = "Hello, world! Isn't it amazing how language works? Consider this: Mr. O'Connor—who famously said, 'Language, in all its messy glory, defines our reality!'—often challenged conventional grammar rules. In fact, when you read: 'Wait—what? Really?!' you notice that punctuation, quotes, dashes, and exclamation points all serve specific roles. (Yes, even parentheses add context.) Tokenization helps break down such complex sentences into individual, manageable tokens for further processing."
tokens = tokenizer.tokenize(text)
print(tokens)

['hello', ',', 'world', '!', 'isn', "'", 't', 'it', 'amazing', 'how', 'language', 'works', '?', 'consider', 'this', ':', 'mr', '.', 'o', "'", 'connor', '—', 'who', 'famously', 'said', ',', "'", 'language', ',', 'in', 'all', 'its', 'messy', 'glory', ',', 'defines', 'our', 'reality', '!', "'", '—', 'often', 'challenged', 'conventional', 'grammar', 'rules', '.', 'in', 'fact', ',', 'when', 'you', 'read', ':', "'", 'wait', '—', 'what', '?', 'really', '?', '!', "'", 'you', 'notice', 'that', 'pun', '##ct', '##uation', ',', 'quotes', ',', 'dash', '##es', ',', 'and', 'ex', '##cl', '##ama', '##tion', 'points', 'all', 'serve', 'specific', 'roles', '.', '(', 'yes', ',', 'even', 'parentheses', 'add', 'context', '.', ')', 'token', '##ization', 'helps', 'break', 'down', 'such', 'complex', 'sentences', 'into', 'individual', ',', 'manage', '##able', 'token', '##s', 'for', 'further', 'processing', '.']


# Load Corpus Data

In [4]:
splits = {'train': 'data/train-00000-of-00001-ad33ea7d240dcb80.parquet', 'validation': 'data/validation-00000-of-00001-a108f2216fa73659.parquet', 'test': 'data/test-00000-of-00001-9696555e053ff5e2.parquet'}
corpus_train = pd.read_parquet("hf://datasets/Shayanvsf/US_Airline_Sentiment/" + splits["train"])
corpus_val = pd.read_parquet("hf://datasets/Shayanvsf/US_Airline_Sentiment/" + splits["validation"])
corpus_test = pd.read_parquet("hf://datasets/Shayanvsf/US_Airline_Sentiment/" + splits["test"])

airline_sentiment:
This column likely represents a normalized sentiment score for each tweet. For example, a value closer to 1 might indicate positive sentiment, while a value closer to 0 might indicate negative sentiment. In some cases, it could also be interpreted as the probability that the tweet is positive (or negative), depending on how the model is designed.

airline_sentiment_confidence:
This column indicates the model's confidence in its sentiment prediction. A value near 1 means the model is very sure about the sentiment (whether positive or negative) it has assigned, while a value closer to 0 means the prediction is less certain.

negative_reason_confidence:
This column is likely relevant only when a tweet is classified as negative. It measures how confident the model is in identifying a specific reason for the negative sentiment (such as delays, poor service, etc.). Again, a value closer to 1 indicates high confidence that the reason it selected accurately explains the negativity expressed in the tweet.

In [5]:
# convert all text into lower case characters 
corpus_train['text'] = corpus_train['text'].apply(lambda x: x.lower())
corpus_val['text'] = corpus_val['text'].apply(lambda x: x.lower())
corpus_test['text'] = corpus_test['text'].apply(lambda x: x.lower())

In [6]:
corpus_train

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason_confidence,text
0,0,1.0000,1.0000,@united thanks for causing us to miss our conn...
1,1,1.0000,,@united thank you. any help is appreciated.
2,0,0.7179,0.7179,@usairways i ask for reimbursement maybe miles...
3,0,1.0000,0.6558,@united annnnddddd i'm going to lose my first ...
4,0,1.0000,0.6654,"@americanair 4285. apparently we’re told, staf..."
...,...,...,...,...
8073,0,1.0000,1.0000,@americanair my ex-boyfriend picks up my calls...
8074,0,1.0000,1.0000,@united i send an email about my bad experienc...
8075,0,1.0000,0.6448,@virginamerica you are failing your customers ...
8076,0,1.0000,0.6649,@americanair your rubbish at social media! in ...


In [7]:
# load pre-trained model
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# tokenize training text
corpus_train['text'] = corpus_train['text'].apply(lambda x: tokenizer(x, truncation=True))

In [8]:
# impute missing values in negativereason_confidence variable
corpus_train['negativereason_confidence'] = corpus_train['negativereason_confidence'].fillna(0)
corpus_val['negativereason_confidence'] = corpus_val['negativereason_confidence'].fillna(0)
corpus_test['negativereason_confidence'] = corpus_test['negativereason_confidence'].fillna(0)

In [9]:
corpus_train

Unnamed: 0,airline_sentiment,airline_sentiment_confidence,negativereason_confidence,text
0,0,1.0000,1.0000,"[input_ids, attention_mask]"
1,1,1.0000,0.0000,"[input_ids, attention_mask]"
2,0,0.7179,0.7179,"[input_ids, attention_mask]"
3,0,1.0000,0.6558,"[input_ids, attention_mask]"
4,0,1.0000,0.6654,"[input_ids, attention_mask]"
...,...,...,...,...
8073,0,1.0000,1.0000,"[input_ids, attention_mask]"
8074,0,1.0000,1.0000,"[input_ids, attention_mask]"
8075,0,1.0000,0.6448,"[input_ids, attention_mask]"
8076,0,1.0000,0.6649,"[input_ids, attention_mask]"


In [10]:
# Load pre-trained model
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Sentiment Analysis using pre-trained Bert-based model

In [11]:
# tokenize testing text
test_input = tokenizer(list(corpus_test['text']), padding=True, truncation=True, return_tensors="pt")

with torch.no_grad():
    outputs = model(**test_input)
    logits = outputs.logits  # shape: (batch_size, num_labels)

# Convert logits to probabilities
probabilities = torch.softmax(logits, dim=-1)
# Assume index 0 is negative and 1 is positive
predicted_labels = torch.argmax(probabilities, dim=-1).numpy()
# Confidence can be taken as the probability of the predicted label
predicted_confidences = probabilities.max(dim=-1).values.numpy()
# true label from corpus_test
test_true = corpus_test['airline_sentiment']

initial_result = pd.DataFrame({
    'test_pred': predicted_labels,
    'test_true': corpus_test['airline_sentiment'],
    'predicted_confidences': predicted_confidences
})

print(f'accuracy score: {accuracy_score(test_true, predicted_labels)}')
print(f'precision score: {precision_score(test_true, predicted_labels)}')
print(f'recall score: {recall_score(test_true, predicted_labels)}')
print(f'f1 score: {f1_score(test_true, predicted_labels)}')


initial_result

accuracy score: 0.7991341991341991
precision score: 0.0
recall score: 0.0
f1 score: 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,test_pred,test_true,predicted_confidences
0,0,0,0.525317
1,0,0,0.517752
2,0,0,0.532130
3,0,0,0.530058
4,0,0,0.519232
...,...,...,...
1150,0,1,0.518085
1151,0,1,0.518094
1152,0,0,0.531278
1153,0,0,0.532974


label
1 means good
0 means bad


# Fine-tune the Bert-based LLM 

## Pre-process the training and validating corpus

In [12]:
# Create new columns for input_ids and attention_mask for training data
corpus_train["input_ids"] = corpus_train["text"].apply(lambda x: x["input_ids"])
corpus_train["attention_mask"] = corpus_train["text"].apply(lambda x: x["attention_mask"])
corpus_train = corpus_train.rename(columns={"airline_sentiment": "labels"})

train_dataset = Dataset.from_pandas(corpus_train[['input_ids','attention_mask','labels']])
train_dataset

# Create new columns for input_ids and attention_mask for validation data
corpus_val['text'] = corpus_val['text'].apply(lambda x: tokenizer(x, truncation=True))

corpus_val["input_ids"] = corpus_val["text"].apply(lambda x: x["input_ids"])
corpus_val["attention_mask"] = corpus_val["text"].apply(lambda x: x["attention_mask"])
corpus_val = corpus_val.rename(columns={"airline_sentiment": "labels"})

eval_dataset = Dataset.from_pandas(corpus_val[['input_ids','attention_mask','labels']])
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2308
})

In [17]:
def model_init(trial):
    return AutoModelForSequenceClassification.from_pretrained(model_checkpoint)

In [13]:
# training parameters
training_args = TrainingArguments(
    output_dir="./results",             
    eval_strategy="epoch",       
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=4,                
    weight_decay=0.02,
    logging_steps=50,
    save_total_limit=2,             
)

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions),
        "recall": recall_score(labels, predictions),
        "f1": f1_score(labels, predictions)
    }


In [19]:
trainer = Trainer(
    model=None,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    model_init = model_init
)

# Define the hyperparameter search space using an Optuna trial
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5,log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.1, step=0.02),
    }

best_run = trainer.hyperparameter_search(
    hp_space=hp_space,
    backend="optuna",
    n_trials=10,            # Adjust number of trials as needed
    direction="maximize"    # Here we maximize the evaluation accuracy
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-03 17:20:34,730] A new study created in memory with name: no-name-cc274afd-61ea-49f1-bebb-0b139cc28d81
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2011,0.160711,0.940641,0.887892,0.819876,0.85253
2,0.1164,0.189629,0.944541,0.858586,0.879917,0.869121
3,0.0422,0.280112,0.941941,0.924574,0.786749,0.850112
4,0.0132,0.26008,0.947574,0.878661,0.869565,0.874089


[I 2025-04-03 17:31:16,766] Trial 0 finished with value: 3.569889452217631 and parameters: {'learning_rate': 2.4460807789600356e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'weight_decay': 0.04}. Best is trial 0 with value: 3.569889452217631.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.19,0.158662,0.938042,0.877778,0.817805,0.846731
2,0.1513,0.172734,0.933276,0.906173,0.759834,0.826577
3,0.0976,0.166044,0.943241,0.891111,0.830228,0.859593
4,0.1077,0.170679,0.945841,0.88913,0.846791,0.867444


[I 2025-04-03 17:40:12,164] Trial 1 finished with value: 3.54920620626166 and parameters: {'learning_rate': 1.009878008407478e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.08}. Best is trial 0 with value: 3.569889452217631.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1884,0.172021,0.937175,0.891204,0.797101,0.84153
2,0.127,0.192146,0.944107,0.893333,0.832298,0.861736
3,0.0475,0.208719,0.946707,0.881356,0.861284,0.871204


[I 2025-04-03 17:48:36,473] Trial 2 finished with value: 3.560550870296642 and parameters: {'learning_rate': 1.954061681564862e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.06}. Best is trial 0 with value: 3.569889452217631.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1854,0.148546,0.941508,0.876623,0.838509,0.857143
2,0.1183,0.153738,0.948007,0.88535,0.863354,0.874214
3,0.0616,0.174107,0.948007,0.895425,0.850932,0.872611


[I 2025-04-03 17:55:31,816] Trial 3 finished with value: 3.566974910997106 and parameters: {'learning_rate': 2.099142871148337e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.08}. Best is trial 0 with value: 3.569889452217631.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1897,0.164156,0.940208,0.900232,0.803313,0.849015
2,0.1147,0.198737,0.943241,0.846457,0.890269,0.86781
3,0.0273,0.264436,0.944541,0.904328,0.821946,0.861171
4,0.0039,0.270019,0.949307,0.887712,0.867495,0.877487


[I 2025-04-03 19:36:17,196] Trial 4 finished with value: 3.582000358516894 and parameters: {'learning_rate': 3.193442013911449e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 16, 'weight_decay': 0.1}. Best is trial 4 with value: 3.582000358516894.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1846,0.149433,0.939341,0.864119,0.84265,0.853249
2,0.1282,0.155086,0.942808,0.875803,0.846791,0.861053
3,0.0677,0.204242,0.941941,0.918465,0.792961,0.851111


[I 2025-04-03 19:43:03,921] Trial 5 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1953,0.161547,0.943674,0.886214,0.838509,0.861702
2,0.1136,0.187438,0.947574,0.873967,0.875776,0.874871
3,0.0417,0.212914,0.949307,0.886076,0.869565,0.877743


[I 2025-04-03 19:51:26,627] Trial 6 finished with value: 3.5826908725656437 and parameters: {'learning_rate': 2.3253091959105992e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.02}. Best is trial 6 with value: 3.5826908725656437.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.195,0.164064,0.942808,0.871036,0.853002,0.861925
2,0.1116,0.196816,0.946707,0.873444,0.871636,0.872539
3,0.0374,0.236653,0.944541,0.88172,0.848861,0.864979


[I 2025-04-03 20:00:06,004] Trial 7 finished with value: 3.540101344607952 and parameters: {'learning_rate': 2.9047684378642023e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.04}. Best is trial 6 with value: 3.5826908725656437.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1836,0.163962,0.940641,0.894977,0.811594,0.851249


[I 2025-04-03 21:41:54,380] Trial 8 pruned. 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2011,0.160644,0.940208,0.930175,0.772257,0.843891


[I 2025-04-03 21:44:44,810] Trial 9 pruned. 


In [20]:
print("Best hyperparameters:", best_run.hyperparameters)

Best hyperparameters: {'learning_rate': 2.3253091959105992e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.02}


In [21]:
# update parameters with the best parameters 
trainer.args.learning_rate = best_run.hyperparameters["learning_rate"]
trainer.args.num_train_epochs = best_run.hyperparameters["num_train_epochs"]
trainer.args.per_device_train_batch_size = best_run.hyperparameters["per_device_train_batch_size"]
trainer.args.weight_decay = best_run.hyperparameters["weight_decay"]

# re-train using best parameters
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.1952,0.161653,0.944541,0.89011,0.838509,0.863539
2,0.1136,0.187128,0.947574,0.873967,0.875776,0.874871
3,0.0417,0.212917,0.949307,0.887712,0.867495,0.877487


TrainOutput(global_step=1515, training_loss=0.13110034450052596, metrics={'train_runtime': 480.7381, 'train_samples_per_second': 50.41, 'train_steps_per_second': 3.151, 'total_flos': 280740848715696.0, 'train_loss': 0.13110034450052596, 'epoch': 3.0})

In [22]:
# Create new columns for input_ids and attention_mask for testing data
corpus_test['text'] = corpus_test['text'].apply(lambda x: tokenizer(x, truncation=True))

corpus_test["input_ids"] = corpus_test["text"].apply(lambda x: x["input_ids"])
corpus_test["attention_mask"] = corpus_test["text"].apply(lambda x: x["attention_mask"])
corpus_test = corpus_test.rename(columns={"airline_sentiment": "labels"})

test_dataset = Dataset.from_pandas(corpus_test[['input_ids','attention_mask']])
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 1155
})

In [23]:
# Get predictions as a numpy array
predictions_np = trainer.predict(test_dataset).predictions

# Convert the numpy array to a torch tensor and then apply softmax
predictions_tensor = torch.tensor(predictions_np)
probabilities = torch.softmax(predictions_tensor, dim=-1)
predicted_labels = torch.argmax(probabilities, dim=-1).numpy()
predicted_confidences = probabilities.max(dim=-1).values.numpy()

final_result = pd.DataFrame({
    'test_pred': predicted_labels,
    'test_true': corpus_test['labels'],
    'predicted_confidences': predicted_confidences
})

print(f'accuracy score: {accuracy_score(test_true, predicted_labels)}')
print(f'precision score: {precision_score(test_true, predicted_labels)}')
print(f'recall score: {recall_score(test_true, predicted_labels)}')
print(f'f1 score: {f1_score(test_true, predicted_labels)}')

final_result

accuracy score: 0.9506493506493506
precision score: 0.8854625550660793
recall score: 0.8663793103448276
f1 score: 0.8758169934640523


Unnamed: 0,test_pred,test_true,predicted_confidences
0,0,0,0.998865
1,0,0,0.998514
2,0,0,0.999071
3,0,0,0.998521
4,0,0,0.999032
...,...,...,...
1150,1,1,0.996202
1151,1,1,0.992641
1152,0,0,0.999115
1153,0,0,0.999262
