## In this notebook, we will Fine Tune a **RoBERTa** Transformer for Sentiment Analysis

 - BOUSSOU Walid
 - HINDA Abdeljebar
 - ASTIGHFAR Ismail
 - KAIS Zakaria
 - AYAD Mounir


#### Importing Python Libraries and Setting Up the Environment

In this step, we will import the necessary libraries and modules to execute our script. The libraries include:
* Pandas
* PyTorch
* PyTorch Utils for Dataset and Dataloader
* Transformers
* RoBERTa Model and Tokenizer


In [1]:
import pandas as pd
import numpy as np
import torch
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaModel, RobertaTokenizer

import logging
logging.basicConfig(level=logging.ERROR)



#### Configuring the Device for GPU Usage:


In [2]:
# Pick the compute device once: 'cuda' when a CUDA GPU is available, otherwise 'cpu'.
# The training and validation loops move every batch to this device.
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

#### Loading the Annotated Data from the 'test.csv' File:


In [12]:
# Read the annotated export into `df`. Its lowercase 'sentiment' column holds
# the raw annotation text that determine_sentiment() turns into integer labels.
import pandas as pd
import json
df = pd.read_csv("test.csv")

In [65]:
def determine_sentiment(annotation_data):
    """Map one raw annotation value to an integer class code.

    Codes:
        1 = "low risk", 2 = "medium risk", 3 = "high risk", 4 = "method",
        5 = no information / not interpretable (the default),
        6 = "supportive", 7 = "suicide supportive".

    Args:
        annotation_data: A cell from the annotation column. Either plain text
            or a JSON object string with a "choices" entry (a list of labels
            or a single label string). NaN/None and non-text values are
            accepted and treated as "no information".

    Returns:
        int: The class code. When several labels are present, the first match
        in the order high, medium, low, method, supportive, suicide supportive
        wins. Any value that cannot be interpreted yields 5; the function
        never raises for malformed input.
    """
    default_code = 5

    # NaN, None and any other non-text cell carry no label.
    if not isinstance(annotation_data, str):
        return default_code

    # An explicit "no enough information" anywhere in the text overrides
    # whatever else the annotation contains.
    if "no enough information" in annotation_data.lower():
        return default_code

    try:
        parsed = json.loads(annotation_data)
    except ValueError:  # json.JSONDecodeError is a ValueError
        return default_code

    # Valid JSON that is not an object with a "choices" entry (a bare number,
    # a list, an object with other keys) has no label either.
    if not isinstance(parsed, dict) or "choices" not in parsed:
        return default_code

    choices = parsed["choices"]
    if isinstance(choices, str):
        # A single label stored as a bare string: wrap it so the checks below
        # are exact matches. A substring test would let "supportive" shadow
        # "suicide supportive".
        choices = [choices]
    elif not isinstance(choices, (list, dict)):
        # null / number / bool: nothing to search in.
        return default_code

    # Ordered by precedence: the first label found decides the code.
    label_codes = (
        ("high risk", 3),
        ("medium risk", 2),
        ("low risk", 1),
        ("method", 4),
        ("supportive", 6),
        ("suicide supportive", 7),
    )
    for label, code in label_codes:
        if label in choices:
            return code

    return default_code

# Derive the integer label column 'Sentiment' (capital S) from the raw
# annotation text in 'sentiment' (lowercase); the raw column is kept as is.
df['Sentiment'] = df['sentiment'].apply(determine_sentiment)

# Print the first rows of both columns side by side to spot-check the mapping.
print(df[['Sentiment', 'sentiment']].head())

   Sentiment                                          sentiment
0          3  {"choices":["1 wish to be death?","2 thoughts ...
1          5                              no enough information
2          5                              no enough information
3          5                              no enough information
4          5                              no enough information


In [66]:
# Write the labelled frame to disk without the index column; a later cell
# reloads this file as the modelling data.
output_file = 'updated_sentiment_data.csv'
df.to_csv(output_file, index=False)

# If you're using Jupyter Notebook, you can download the file using the following:
# (the FileLink object is the cell's last expression, so it is displayed as the output)
from IPython.display import FileLink
FileLink(output_file)

In [67]:
# Reload the labelled data written above.
# NOTE: the name `train` is rebound to the training function by the later
# `def train(epoch)` cell, so cells that use this DataFrame must run before it.
train = pd.read_csv("updated_sentiment_data.csv")

In [68]:
# (rows, columns) of the reloaded frame.
train.shape

(1200, 21)

In [69]:
# Preview the first rows, including the derived 'Sentiment' column.
train.head()

Unnamed: 0,Author,Content,Date,Linked Messages,Message ID,ThreadTitle,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 6,...,Unnamed: 8,Unnamed: 9,annotation_id,annotator,created_at,id,lead_time,sentiment,updated_at,Sentiment
0,EscapeFromLife,Is getting married before CTB a good idea? Per...,"Jul 8, 2024 at 6:12 PM",,2589889,Marriage before CTB,,,,,...,,,7,1,2024-10-02T03:21:09.448199Z,1,6.880.799.999.999.990,"{""choices"":[""1 wish to be death?"",""2 thoughts ...",2024-10-02T03:21:09.448199Z,3
1,Traveller12724,"Bro you must be trolling, I mean who in their ...","Jul 8, 2024 at 10:05 PM",2589889.0,2590106,Marriage before CTB,,,,,...,,,8,1,2024-10-02T03:21:25.669006Z,2,10.03,no enough information,2024-10-02T03:21:25.669006Z,5
2,Myforevercharlie,I think putting someone through that is fuckin...,"Jul 8, 2024 at 10:22 PM",,2590118,Marriage before CTB,,,,,...,,,9,1,2024-10-02T03:26:36.227363Z,3,304.751,no enough information,2024-10-02T03:26:36.227363Z,5
3,rozeske,There are better ways for recovery that don't ...,"Jul 9, 2024 at 1:50 AM",,2590218,Marriage before CTB,,,,,...,,,10,1,2024-10-02T03:26:55.136421Z,4,13.271,no enough information,2024-10-02T03:26:55.136421Z,5
4,who doesn't matter,"Oh no, never that. My first rule has been to n...","Jul 9, 2024 at 2:19 AM",,2590231,Marriage before CTB,,,,,...,,,11,1,2024-10-02T03:27:13.489896Z,5,12.969,no enough information,2024-10-02T03:27:13.489896Z,5


In [70]:
# Distinct label codes actually present; the classifier head must have one output per code.
train['Sentiment'].unique()

array([3, 5, 2, 1, 4])

In [71]:
# Summary statistics of the numeric columns.
train.describe()

Unnamed: 0,Message ID,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 9,annotation_id,annotator,id,Sentiment
count,1200.0,0.0,0.0,0.0,0.0,1200.0,1200.0,1200.0,1200.0
mean,2544981.0,,,,,639.440833,1.0,633.324167,4.339167
std,237638.4,,,,,366.801044,0.0,366.619022,1.267072
min,798025.0,,,,,7.0,1.0,1.0,1.0
25%,2581996.0,,,,,322.75,1.0,316.75,5.0
50%,2598496.0,,,,,646.5,1.0,640.5,5.0
75%,2599328.0,,,,,948.25,1.0,942.25,5.0
max,2664962.0,,,,,1282.0,1.0,1275.0,5.0


#### We will keep the two columns 'Content' (input text) and 'Sentiment' (target):


In [72]:
# Keep only the message text ('Content') and the integer label ('Sentiment').
new_df = train[['Content', 'Sentiment']]
new_df

Unnamed: 0,Content,Sentiment
0,Is getting married before CTB a good idea? Per...,3
1,"Bro you must be trolling, I mean who in their ...",5
2,I think putting someone through that is fuckin...,5
3,There are better ways for recovery that don't ...,5
4,"Oh no, never that. My first rule has been to n...",5
...,...,...
1195,Sorry I'll respond to replies once I'm in a sl...,3
1196,I think my CTB date will my late aug/early sep...,3
1197,I've started getting rid of my excess belongin...,3
1198,Why is it that whenever I make a serious attem...,3


#### Defining Key Variables to be Used Later in Training and Validation:


In [73]:
# Token length every example is padded/truncated to (passed as `max_len` to SentimentData).
MAX_LEN = 512

# Batch size used by the training DataLoader.
TRAIN_BATCH_SIZE = 64

# Batch size used by the validation DataLoader.
VALID_BATCH_SIZE = 32

# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 1e-5

# Tokenizer matching the 'roberta-base' checkpoint loaded by the model.
# NOTE(review): truncation is requested again on every encode_plus call in
# SentimentData.__getitem__; confirm that `truncation=True` and
# `do_lower_case=True` have any effect when passed to from_pretrained here.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)



#### Preparing the Dataset and Dataloader :
This class is defined to accept the Dataframe as input and generate tokenized output that is used by the Roberta model for training.


In [74]:
class SentimentData(Dataset):
    """Dataset that tokenizes one row of a DataFrame per item.

    Args:
        dataframe: Frame with a 'Content' column (text) and a 'Sentiment'
            column (numeric label). Any index is accepted; rows are addressed
            by position.
        tokenizer: Object exposing `encode_plus(...)` that returns a mapping
            with 'input_ids', 'attention_mask' and 'token_type_ids'.
        max_len: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Content
        self.targets = self.data.Sentiment
        self.max_len = max_len

    def __len__(self):
        """Number of rows (examples) in the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded example at position `index`.

        Returns:
            dict with long tensors 'ids', 'mask', 'token_type_ids' and a float
            scalar tensor 'targets' holding the row's label as stored in the
            frame (no offset is applied here).
        """
        # Positional lookup: a plain `series[index]` is label-based and would
        # raise KeyError (or fetch the wrong row) whenever the frame's index
        # is not exactly 0..n-1, e.g. after sampling without reset_index.
        text = str(self.text.iloc[index])
        # Collapse every run of whitespace (including newlines) to one space.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            # Kept as float for compatibility; the training and validation
            # loops cast targets to long before computing the loss.
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float)
        }

#### Fraction of Data Used for Training and Validation :


In [75]:
# Fraction of rows that goes to the training split.
train_size = 0.8
# Random sample with a fixed seed so the split is repeatable.
train_data=new_df.sample(frac=train_size,random_state=200)
# Everything not sampled becomes the validation split; both frames are
# re-indexed from 0 after the split.
val_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

# Report the sizes of the full frame and of each split.
print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("VALIDATION Dataset: {}".format(val_data.shape))

# Wrap each split in the tokenizing Dataset consumed by the DataLoaders.
training_set = SentimentData(train_data, tokenizer, MAX_LEN)
validation_set = SentimentData(val_data, tokenizer, MAX_LEN)


FULL Dataset: (1200, 2)
TRAIN Dataset: (960, 2)
VALIDATION Dataset: (240, 2)


#### Configuring Training and Validation Parameters with Creation of Corresponding Data Loaders


In [50]:

# Keyword arguments for the training DataLoader: batch size from the config
# cell, reshuffled batches, 4 worker processes.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4
                }

# Same settings for validation, with its own batch size.
# NOTE(review): shuffling is also enabled here; the validation loop only
# accumulates totals, so batch order should not matter — confirm it is intended.
val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 4
                }

# Loaders iterated by train() and valid().
training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **val_params)

### Creating the Neural Network for Fine Tuning

In [76]:
class RobertaClass(torch.nn.Module):
    """RoBERTa encoder with a two-layer classification head.

    The head takes the hidden state of the first token, applies
    Linear(768, 768) -> ReLU -> Dropout -> Linear(768, num_classes) and
    returns raw logits (no softmax).

    Args:
        num_classes: Number of output logits. Defaults to 5, the value that
            was previously hard-coded.
        dropout: Dropout probability in the head. Defaults to 0.3, the value
            that was previously hard-coded.
    """

    def __init__(self, num_classes=5, dropout=0.3):
        super().__init__()
        # Attribute names are unchanged so saved state_dicts still load.
        self.l1 = RobertaModel.from_pretrained("roberta-base")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(dropout)
        self.classifier = torch.nn.Linear(768, num_classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits of shape (batch, num_classes) for a tokenized batch."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        # First element of the encoder output: per-token hidden states.
        hidden_state = output_1[0]

        # Use the first token's vector as the sequence representation.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)

        # Functional ReLU: avoids building a new ReLU module on every call.
        pooler = torch.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [77]:
# Build the classifier and move its parameters to the selected device.
# `model.to(device)` is the cell's last expression, so the module is echoed as output.
model = RobertaClass()
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaClass(
  (l1): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

#### Loss Function and Optimizer :


In [78]:
# Multi-class loss applied to the model's raw logits and integer class targets.
loss_function = torch.nn.CrossEntropyLoss()
# Adam over all model parameters (encoder and head) with a small weight decay.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE,weight_decay=1e-5)

In [79]:
def calculate_accuracy(preds, targets):
    """Count the positions where `preds` equals `targets`.

    Despite the name, this returns the number of correct predictions as a
    Python number, not a ratio; callers divide by the example count.
    """
    hit_mask = preds == targets
    hit_count = hit_mask.sum()
    return hit_count.item()

#### Training Function for RoBERTa Sentiment Analysis Model
Here we define a training function that trains the model on the training dataset


In [83]:
def train(epoch):
    """Run one training epoch over `training_loader` and print the metrics.

    Uses the notebook-level `model`, `training_loader`, `loss_function`,
    `optimizer` and `device`.

    Args:
        epoch: Epoch number; only used in the printed summary.

    Returns:
        None. Loss and accuracy are printed, not returned.

    Raises:
        ValueError: If a batch contains a label outside 1..num_classes, where
            num_classes is the width of the model's output.
    """
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    model.train()

    # Wrapping the loader (not the enumerate object) lets tqdm read its
    # length and show real progress.
    for step, data in enumerate(tqdm(training_loader)):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

        # Labels are stored 1-based; CrossEntropyLoss expects 0-based class
        # indices. Shift every batch unconditionally: deciding per batch from
        # its maximum label would leave batches without the top label
        # unshifted, mapping the same class to different indices.
        targets = data['targets'].to(device, dtype=torch.long) - 1

        outputs = model(ids, mask, token_type_ids)

        num_classes = outputs.size(1)
        if targets.min() < 0 or targets.max() >= num_classes:
            raise ValueError(
                f"Labels must lie in 1..{num_classes}, got values "
                f"{(targets + 1).unique().tolist()}"
            )

        loss = loss_function(outputs, targets)
        tr_loss += loss.item()

        # Predicted class = index of the largest logit.
        _, big_idx = torch.max(outputs.detach(), dim=1)
        n_correct += calculate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)

        # Running averages at the first step and every 5000 steps after.
        if step % 5000 == 0:
            loss_step = tr_loss / nb_tr_steps
            accu_step = (n_correct * 100) / nb_tr_examples
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100) / nb_tr_examples}')
    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [84]:
# Number of full passes over the training loader.
EPOCHS = 5
# Each call trains for one epoch and prints that epoch's loss and accuracy.
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]

Valores únicos en targets: tensor([1, 2, 3, 5])
Training Loss per 5000 steps: 1.6456586122512817
Training Accuracy per 5000 steps: 3.125


1it [00:37, 37.20s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


2it [01:09, 34.57s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


3it [01:43, 34.04s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


4it [02:16, 33.67s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


5it [02:49, 33.33s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


6it [03:22, 33.50s/it]

Valores únicos en targets: tensor([1, 2, 3, 4, 5])


7it [03:56, 33.55s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


8it [04:30, 33.62s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


9it [05:03, 33.59s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


10it [05:37, 33.46s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


11it [06:10, 33.54s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


12it [06:43, 33.31s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


13it [07:16, 33.22s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


14it [07:50, 33.38s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


15it [08:24, 33.65s/it]

The Total Accuracy for Epoch 0: 50.9375
Training Loss Epoch: 1.5077102502187094
Training Accuracy Epoch: 50.9375



0it [00:00, ?it/s]

Valores únicos en targets: tensor([1, 2, 3, 5])
Training Loss per 5000 steps: 1.2731609344482422
Training Accuracy per 5000 steps: 76.5625


1it [00:45, 45.22s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


2it [01:19, 38.65s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


3it [01:53, 36.42s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


4it [02:26, 35.26s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


5it [03:00, 34.68s/it]

Valores únicos en targets: tensor([1, 3, 5])


6it [03:33, 34.06s/it]

Valores únicos en targets: tensor([1, 3, 5])


7it [04:06, 33.79s/it]

Valores únicos en targets: tensor([1, 2, 3, 4, 5])


8it [04:38, 33.43s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


9it [05:12, 33.37s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


10it [05:45, 33.38s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


11it [06:19, 33.42s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


12it [06:53, 33.63s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


13it [07:26, 33.54s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


14it [08:00, 33.55s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


15it [08:34, 34.31s/it]

The Total Accuracy for Epoch 1: 75.9375
Training Loss Epoch: 1.017862566312154
Training Accuracy Epoch: 75.9375



0it [00:00, ?it/s]

Valores únicos en targets: tensor([1, 2, 3, 4, 5])
Training Loss per 5000 steps: 0.8453186750411987
Training Accuracy per 5000 steps: 78.125


1it [00:45, 45.48s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


2it [01:20, 39.45s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


3it [01:54, 37.01s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


4it [02:27, 35.48s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


5it [03:01, 34.82s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


6it [03:35, 34.45s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


7it [04:09, 34.36s/it]

Valores únicos en targets: tensor([1, 3, 5])


8it [04:42, 33.94s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


9it [05:16, 33.82s/it]

Valores únicos en targets: tensor([1, 3, 5])


10it [05:49, 33.72s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


11it [06:23, 33.75s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


12it [06:57, 33.78s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


13it [07:30, 33.65s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


14it [08:04, 33.73s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


15it [08:39, 34.65s/it]

The Total Accuracy for Epoch 2: 75.9375
Training Loss Epoch: 0.8081863522529602
Training Accuracy Epoch: 75.9375



0it [00:00, ?it/s]

Valores únicos en targets: tensor([1, 2, 3, 5])
Training Loss per 5000 steps: 0.6837695837020874
Training Accuracy per 5000 steps: 78.125


1it [00:45, 45.72s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


2it [01:20, 39.16s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


3it [01:53, 36.47s/it]

Valores únicos en targets: tensor([2, 3, 5])


4it [02:27, 35.42s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


5it [03:01, 34.94s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


6it [03:34, 34.46s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


7it [04:07, 33.97s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


8it [04:41, 33.80s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


9it [05:14, 33.61s/it]

Valores únicos en targets: tensor([1, 2, 3, 4, 5])


10it [05:48, 33.61s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


11it [06:21, 33.48s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


12it [06:55, 33.55s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


13it [07:28, 33.59s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


14it [08:02, 33.56s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


15it [08:36, 34.45s/it]

The Total Accuracy for Epoch 3: 75.9375
Training Loss Epoch: 0.7228970209757487
Training Accuracy Epoch: 75.9375



0it [00:00, ?it/s]

Valores únicos en targets: tensor([1, 2, 3, 5])
Training Loss per 5000 steps: 0.7032690644264221
Training Accuracy per 5000 steps: 73.4375


1it [00:45, 45.71s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


2it [01:20, 39.53s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


3it [01:54, 36.96s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


4it [02:28, 35.65s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


5it [03:02, 34.95s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


6it [03:35, 34.48s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


7it [04:09, 34.30s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


8it [04:43, 34.25s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


9it [05:18, 34.26s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


10it [05:52, 34.18s/it]

Valores únicos en targets: tensor([1, 2, 3, 4, 5])


11it [06:26, 34.17s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


12it [06:59, 33.98s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


13it [07:33, 33.97s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


14it [08:07, 33.91s/it]

Valores únicos en targets: tensor([1, 2, 3, 5])


15it [08:42, 34.85s/it]

The Total Accuracy for Epoch 4: 77.70833333333333
Training Loss Epoch: 0.6512927611668905
Training Accuracy Epoch: 77.70833333333333





#### Validation Function for RoBERTa Sentiment Analysis Model
During the validation stage, we pass unseen data (the validation dataset) to the model. This step measures how well the model performs on data it was not trained on.

In [87]:
def valid(model, validation_loader):
    """Evaluate `model` on `validation_loader` without updating weights.

    Uses the notebook-level `loss_function` and `device`.

    Args:
        model: Module called as `model(ids, mask, token_type_ids)` that
            returns logits of shape (batch, num_classes).
        validation_loader: Iterable of batches with 'ids', 'mask',
            'token_type_ids' and 1-based 'targets'.

    Returns:
        float: Accuracy over all validation examples, in percent.

    Raises:
        ValueError: If a batch contains a label outside 1..num_classes.
    """
    model.eval()

    n_correct = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0

    with torch.no_grad():
        # Wrapping the loader lets tqdm read its length and show progress.
        for step, data in enumerate(tqdm(validation_loader)):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)

            # Labels are stored 1-based; shift every batch to 0-based class
            # indices. A shift that depends on the batch's maximum label
            # would score batches without the top label against the wrong
            # classes.
            targets = data['targets'].to(device, dtype=torch.long) - 1

            # No squeeze(): it would drop the batch dimension of a
            # one-example batch and break the dim=1 reduction below.
            outputs = model(ids, mask, token_type_ids)

            num_classes = outputs.size(1)
            if targets.min() < 0 or targets.max() >= num_classes:
                raise ValueError(
                    f"Labels must lie in 1..{num_classes}, got values "
                    f"{(targets + 1).unique().tolist()}"
                )

            loss = loss_function(outputs, targets)
            tr_loss += loss.item()

            # Predicted class = index of the largest logit.
            _, big_idx = torch.max(outputs, dim=1)
            n_correct += calculate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples += targets.size(0)

            # Running averages at the first step and every 5000 steps after
            # (the message now states the interval actually used).
            if step % 5000 == 0:
                loss_step = tr_loss / nb_tr_steps
                accu_step = (n_correct * 100) / nb_tr_examples
                print(f"Validation Loss per 5000 steps: {loss_step}")
                print(f"Validation Accuracy per 5000 steps: {accu_step}")

    epoch_loss = tr_loss / nb_tr_steps
    epoch_accu = (n_correct * 100) / nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [88]:
# Score the trained model on the held-out validation loader. The message
# below says "test data", but the loader passed here is the validation split
# built from val_data.
acc = valid(model, validation_loader)
print("Accuracy on test data = %0.2f%%" % acc)

1it [00:07,  7.30s/it]

Validation Loss per 100 steps: 0.52094966173172
Validation Accuracy per 100 steps: 78.125


8it [00:53,  6.73s/it]

Validation Loss Epoch: 0.5795761868357658
Validation Accuracy Epoch: 78.75
Accuracy on test data = 78.75%





## Save the model

In [89]:
import torch

# Assuming 'model' is your trained model
# Only the parameters (state_dict) are written; loading them back requires
# constructing RobertaClass first and then calling load_state_dict.
torch.save(model.state_dict(), 'model.pth')

In [90]:
# Colab-only helper module; this import is presumably unavailable outside
# Google Colab — TODO confirm before running elsewhere.
from google.colab import files

# Hand the saved weights file to the browser for download.
files.download('model.pth')  # this notebook only writes 'model.pth'

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>