<a href="https://colab.research.google.com/github/khodozzz/portfolio/blob/main/3_Twitter_Sentiment_Analysis_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets --quiet

# Loading

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training CSV with explicitly supplied column names, skipping
# malformed lines, then discard every row that has a missing value.
train_df = (
    pd.read_csv(
        'twitter_training.csv',
        names=["id", "entity", "sentiment", "text"],
        on_bad_lines='skip',
    )
    .dropna()
)
train_df

Unnamed: 0,id,entity,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
# Read the held-out CSV the same way as the training file: explicit column
# names, malformed lines skipped, rows with missing values removed.
test_df = (
    pd.read_csv(
        'twitter_validation.csv',
        names=["id", "entity", "sentiment", "text"],
        on_bad_lines='skip',
    )
    .dropna()
)
test_df

Unnamed: 0,id,entity,sentiment,text
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...
...,...,...,...,...
995,4891,GrandTheftAuto(GTA),Irrelevant,⭐️ Toronto is the arts and culture capital of ...
996,4359,CS-GO,Irrelevant,tHIS IS ACTUALLY A GOOD MOVE TOT BRING MORE VI...
997,2652,Borderlands,Positive,Today sucked so it’s time to drink wine n play...
998,8069,Microsoft,Positive,Bought a fraction of Microsoft today. Small wins.


In [None]:
# Label vocabulary in order of first appearance in the training data, plus
# lookup tables in both directions (class id <-> sentiment name).
labels = train_df['sentiment'].unique().tolist()
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Encode the sentiment strings as integer class ids using the mapping built
# from the training data. `.assign` returns new frames instead of mutating the
# ones displayed by earlier cells.
train_df = train_df.assign(label=train_df['sentiment'].map(label2id))
test_df = test_df.assign(label=test_df['sentiment'].map(label2id))

# Series.map yields NaN for any value that is missing from the dict, which
# would silently turn the label column into floats; fail fast instead.
unmapped = test_df.loc[test_df['label'].isna(), 'sentiment'].unique()
assert len(unmapped) == 0, f"Test sentiments missing from training labels: {list(unmapped)}"

In [None]:
# Hold out 20% of the training data for validation. Rows that share an `id`
# appear to be paraphrases of a single tweet (see the train_df preview), so the
# split is drawn over ids rather than rows; a row-level split would place
# near-duplicates of validation tweets in the training set.
unique_ids = train_df['id'].drop_duplicates()
train_ids = unique_ids.sample(frac=0.8, random_state=42)
in_train = train_df['id'].isin(train_ids)
train = train_df[in_train]
valid = train_df[~in_train]

# preserve_index=False stops the pandas index from being stored as an extra
# `__index_level_0__` column, so all three datasets expose the same features.
train_ds = Dataset.from_pandas(train, preserve_index=False)
valid_ds = Dataset.from_pandas(valid, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

train_ds, valid_ds, test_ds

(Dataset({
     features: ['id', 'entity', 'sentiment', 'text', 'label', '__index_level_0__'],
     num_rows: 59197
 }),
 Dataset({
     features: ['id', 'entity', 'sentiment', 'text', 'label', '__index_level_0__'],
     num_rows: 14799
 }),
 Dataset({
     features: ['id', 'entity', 'sentiment', 'text', 'label'],
     num_rows: 1000
 }))

# Preprocess

In [None]:
# Install the pinned transformers release (4.27) used by the cells below.
!pip install transformers==4.27 --quiet
# Upgrade accelerate to the newest available release (version is not pinned).
!pip install --upgrade accelerate --quiet

In [None]:
from datasets import load_metric

from transformers import (AutoTokenizer, 
                          DataCollatorWithPadding,
                          AutoModelForSequenceClassification, 
                          TrainingArguments, 
                          Trainer)

In [None]:
# Checkpoint identifier from which both the tokenizer and the classifier are loaded.
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Accuracy metric object, shared by the Trainer callback and the final test scoring.
accuracy = load_metric('accuracy')

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    `eval_pred` unpacks into the model's raw per-class scores and the
    reference label ids. The highest-scoring class of each row is taken as
    the prediction and compared with the references via the `accuracy` metric.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
def preprocess_function(examples):
    """Tokenize the `text` field of a batch, truncating to at most 128 tokens."""
    texts = examples["text"]
    return tokenizer(texts, truncation=True, max_length=128)

# Tokenize each split in batched mode (train, then valid, then test); the
# tokenized datasets replace the untokenized ones under the same names.
train_ds, valid_ds, test_ds = [
    split.map(preprocess_function, batched=True)
    for split in (train_ds, valid_ds, test_ds)
]

Map:   0%|          | 0/59197 [00:00<?, ? examples/s]

Map:   0%|          | 0/14799 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Collator passed to the Trainer; tokenization above applies no padding, so
# padding of each batch is left to this collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# The checkpoint was pretrained with a 3-label head while this dataset has
# 4 labels, so size mismatches are tolerated and a new, randomly initialised
# classification head is created.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([4, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([4]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration. Evaluation and checkpointing both happen once per
# epoch so that the best checkpoint can be restored when training ends.
training_args = TrainingArguments(
    output_dir="twitter-sentiment-detector",
    # optimisation
    num_train_epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # do not report to any logging integration
    report_to="none",
)

# Wire the model, data splits, tokenizer, collator and metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=valid_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

# Prediction

In [None]:
# Run the fine-tuned model on the test split, take the highest-scoring class
# of each row as the prediction, and report accuracy against the true label ids.
predictions = trainer.predict(test_ds)
preds = predictions.predictions.argmax(axis=-1)
accuracy.compute(references=predictions.label_ids, predictions=preds)

{'accuracy': 0.956}