## Using Roberta saved model for prediction on dev data

In [1]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
import pandas as pd
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


1. Loadung the saved model and tokenizer

In [2]:
# Load the saved model and tokenizer
model_path = "/Users/juliamf/Desktop/CMS-CLS/winter_semester24:25/LLMs/project/SemEval/roberta_emotion_model/roberta_emotion_model_frequency_weights"
tokenizer = RobertaTokenizer.from_pretrained(model_path)
model = RobertaForSequenceClassification.from_pretrained(model_path)

# Move model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

2. Prepare the new dataset

In [3]:
# Load the new dataset
new_data_path = "/Users/juliamf/Desktop/CMS-CLS/winter_semester24:25/LLMs/project/public_data_dev/track_a/dev/eng.csv"
english_df = pd.read_csv(new_data_path)

# Extract the text and id column
texts = english_df["text"].tolist()
ids = english_df["id"]

3. Tokenize the new text

In [4]:
# Tokenize the new dataset
def preprocess_data(texts, tokenizer, max_length=128):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt"
    )
    return encodings

encodings = preprocess_data(english_df["text"].tolist(), tokenizer)

In [5]:
print(encodings)

{'input_ids': tensor([[    0,   673, 13034,  ...,     1,     1,     1],
        [    0,  2409,    38,  ...,     1,     1,     1],
        [    0,   243, 41031,  ...,     1,     1,     1],
        ...,
        [    0,  2527,   939,  ...,     1,     1,     1],
        [    0, 22491,    57,  ...,     1,     1,     1],
        [    0,   133,  2214,  ...,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


4. Perform the predictions - running the model on the new data to get predictions and applying a sigmoid activation function to convert logits to probabilities, and then threshold them to predict labels 

In [6]:
# Predict on new dataset
model.eval()
with torch.no_grad():
    input_ids = encodings["input_ids"].to(device)
    attention_mask = encodings["attention_mask"].to(device)

    outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    probabilities = torch.sigmoid(logits)  # Convert logits to probabilities
    predictions = (probabilities > 0.5).int()  # Apply threshold

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(
    predictions.cpu().numpy(),
    columns=["Anger", "Fear", "Joy", "Sadness", "Surprise"]
)

5. Mapping the predictions to labels

In [7]:
# Add the 'id' column from the original dataset
predictions_df["id"] = ids.values

# Rearrange columns: 'id' first, then emotion columns
final_predictions = predictions_df[["id", "Anger", "Fear", "Joy", "Sadness", "Surprise"]]

6. Saving the predictions as csv format

In [9]:
# Save to CSV or display the formatted predictions
final_predictions.to_csv("/Users/juliamf/Desktop/CMS-CLS/winter_semester24:25/LLMs/project/SemEval/roberta_emotion_model/prediction_results/frequency_weight/track_a/pred_eng.csv", index=False)
print(final_predictions)

                        id  Anger  Fear  Joy  Sadness  Surprise
0    eng_dev_track_a_00001      0     1    0        0         1
1    eng_dev_track_a_00002      1     0    0        0         0
2    eng_dev_track_a_00003      1     1    0        0         0
3    eng_dev_track_a_00004      0     1    0        0         0
4    eng_dev_track_a_00005      1     1    0        0         1
..                     ...    ...   ...  ...      ...       ...
111  eng_dev_track_a_00112      0     0    1        0         0
112  eng_dev_track_a_00113      0     1    0        1         0
113  eng_dev_track_a_00114      0     0    0        0         0
114  eng_dev_track_a_00115      0     0    0        0         1
115  eng_dev_track_a_00116      0     0    0        0         0

[116 rows x 6 columns]
