In [None]:
import pandas as pd
import re, string
import numpy as np

In [None]:
import random, torch

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer handed unchanged to Python's ``random`` module, NumPy's
            legacy global generator and PyTorch (CPU, plus all CUDA devices
            when CUDA is available).

    Returns:
        None.
    """
    # The three CPU-side generators are seeded in the same order every time.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    # Without CUDA there is nothing left to seed.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed(42)

In [None]:
import torch
# Use the first CUDA device when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

In [None]:
# CSV locations of the train / validation / labelled-test splits inside the
# "dataset-cs221-preprocessed" Kaggle input directory.
path_train, path_val, path_test = (
    "/kaggle/input/dataset-cs221-preprocessed/final_train_en.csv",
    "/kaggle/input/dataset-cs221-preprocessed/final_val_en.csv",
    "/kaggle/input/dataset-cs221-preprocessed/final_test_en_labeled.csv",
)

# Read dataset

In [None]:
import string,re
# Translation table that deletes every ASCII punctuation character except "_".
# Built once at definition time instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation.replace("_", ""))


def preprocessing_text(text):
    """Normalise one raw text before tokenization.

    Removes ASCII punctuation (underscores are kept), collapses every run of
    whitespace into a single space and trims both ends. Applying the function
    to its own output returns the same string.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents an
            empty CSV cell) are treated as an empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned string, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)
    without_punct = text.translate(_PUNCT_TABLE)
    # The final strip also covers leading/trailing whitespace of the raw input.
    return re.sub('\\s+', ' ', without_punct).strip()

In [None]:
# def preprocessing_text(text):
    
#     return text

In [None]:
import pandas as pd

def convert_label(text):
    """Translate a hope-category name into its integer class id.

    Args:
        text: Category name as stored in the dataset's ``multiclass`` column.

    Returns:
        0 for "Not Hope", 1 for "Generalized Hope", 2 for "Unrealistic Hope",
        3 for "Realistic Hope". Any other value is reported on stdout and
        mapped to 0.
    """
    known_labels = (
        ("Not Hope", 0),
        ("Generalized Hope", 1),
        ("Unrealistic Hope", 2),
        ("Realistic Hope", 3),
    )
    for name, class_id in known_labels:
        if text == name:
            return class_id
    # Unrecognised value: print it and fall back to class 0.
    print("Error: ", text)
    return 0
    

def read_and_preprocessing(path_data):
    """Load one dataset split from CSV and return model-ready Python lists.

    The CSV is read with pandas and must provide the columns ``multiclass``,
    ``text`` and ``id``.

    Args:
        path_data: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A tuple ``(texts, labels, ids)`` of three lists: the ``text`` column
        after ``preprocessing_text``, the ``multiclass`` column after
        ``convert_label``, and the ``id`` column unchanged.
    """
    frame = pd.read_csv(path_data)
    # Labels are converted first, matching the column access order callers rely on.
    encoded_labels = frame["multiclass"].apply(convert_label)
    cleaned_texts = frame["text"].apply(preprocessing_text).tolist()
    return cleaned_texts, encoded_labels.tolist(), frame["id"].tolist()

# Load the training and validation splits, then print "<n_texts> <n_labels>"
# for each of them as a quick size check.
train_texts, train_labels, train_ids = read_and_preprocessing(path_train)
valid_texts, valid_labels, valid_ids = read_and_preprocessing(path_val)
for split_texts, split_labels in ((train_texts, train_labels), (valid_texts, valid_labels)):
    print(len(split_texts), len(split_labels))

In [None]:
# Pair every training text with its label and preview the first rows as a table.
train_rows = [(text, label) for text, label in zip(train_texts, train_labels)]
df_train = pd.DataFrame(train_rows, columns=['x_data', 'y_output'])
df_train.head()

# Build Filter Classifier

In [None]:
### import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hugging Face checkpoint that is fine-tuned below; the commented line is an
# alternative base checkpoint kept for experiments.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment-multilingual"
# model_name = "FacebookAI/xlm-roberta-base"

# Load the checkpoint with a 4-output classification head, one output per class
# id produced by convert_label. ignore_mismatched_sizes=True lets loading
# continue when a checkpoint tensor has a different shape than the model
# expects (presumably the pretrained classification head here - TODO confirm).
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
    ignore_mismatched_sizes=True,
)

# ignore_mismatched_sizes is a model-weight loading option and means nothing to
# a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

In [None]:
# Token budget per example; make_prediction reuses the same value at inference.
max_length = 512

# Tokenize the whole training split in a single call, truncating to
# `max_length` tokens and padding.
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
)

In [None]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping from field name to a per-example sequence; each
            value is indexed with the example position.
        labels: Sequence of labels; its length defines the dataset size.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus the label under the
        key ``'labels'``.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training texts and their integer labels; this object is
# later passed to the Trainer as its train_dataset.
train_dataset = CustomDataset(train_encodings, train_labels)

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration. No evaluation dataset is configured, so training
# runs for the full number of epochs without intermediate validation.
training_args = TrainingArguments(
    output_dir='./results',          # where checkpoints are written
    num_train_epochs=10,             # total number of training epochs
    learning_rate=2e-5,
    per_device_train_batch_size=32,  # batch size per device during training
    warmup_steps=100,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # log every 100 optimisation steps
    save_total_limit=1,              # cap on the number of checkpoints kept in output_dir
    report_to="tensorboard",         # send logs to TensorBoard only
)

trainer = Trainer(
    model=bert_model,                # the instantiated Transformers model to be trained
    args=training_args,              # training arguments, defined above
    train_dataset=train_dataset,     # training dataset
)
trainer.train()
# SECURITY: an API token that had been pasted here as a comment was removed.
# Revoke that token, and supply any key a logging backend needs through an
# environment variable or the platform's secret store - never in the notebook.

# Inference (on the validation dataset)

In [None]:
def make_prediction(review,tokenizer,trainer):
    """Predict the class id of a single text.

    Args:
        review: Raw text; it is cleaned with ``preprocessing_text`` first.
        tokenizer: Callable tokenizer applied to a one-element list, using the
            notebook-level ``max_length``.
        trainer: Object whose ``predict(dataset)`` result is indexed at ``[0]``
            to obtain per-class scores (presumably of shape (1, num_classes)
            - TODO confirm).

    Returns:
        The index of the highest score for the text, as a Python int.
    """
    cleaned = preprocessing_text(review)
    encoded = tokenizer([cleaned], truncation=True, max_length = max_length, padding=True)
    # The dataset class requires a label per example; 0 is a placeholder.
    single_example = CustomDataset(encoded, [0])
    scores = trainer.predict(single_example)[0]
    best_per_row = np.argmax(scores, axis=1).flatten().tolist()
    return best_per_row[0]

In [None]:
import numpy as np
# Predict one class id per validation text, then show the first ten predictions
# next to the first ten gold labels.
y_pred = [make_prediction(review, tokenizer, trainer) for review in valid_texts]
print(y_pred[:10])
print(valid_labels[:10])

In [None]:
from sklearn.metrics import *

# Validation scores: macro-averaged then weighted-averaged precision, recall
# and F1, followed by accuracy; every value is rounded to 4 decimals.
for tag, scorer, averaging in (
    ("M_Pr: ", precision_score, 'macro'),
    ("M_Re: ", recall_score, 'macro'),
    ("M_F1: ", f1_score, 'macro'),
    ("W_Pr: ", precision_score, 'weighted'),
    ("W_Re: ", recall_score, 'weighted'),
    ("W_F1: ", f1_score, 'weighted'),
):
    print(tag, round(scorer(valid_labels, y_pred, average=averaging), 4))

print("acc:", round(accuracy_score(valid_labels, y_pred), 4))

# Evaluation on the test dataset

In [None]:
# Load the labelled test split with the same preprocessing as the other splits.
test_texts, test_labels, test_ids = read_and_preprocessing(path_test)

# Collect test predictions in y_pred_test (not y_pred), so the validation
# predictions stay untouched and the metrics below get one prediction per label.
y_pred_test = [make_prediction(review, tokenizer, trainer) for review in test_texts]
assert len(y_pred_test) == len(test_labels), "one prediction per test label expected"

print(y_pred_test[:10])
print(test_labels[:10])

In [None]:
from sklearn.metrics import *

# Test scores: macro-averaged then weighted-averaged precision, recall and F1,
# followed by accuracy; every value is rounded to 4 decimals.
for tag, scorer, averaging in (
    ("M_Pr: ", precision_score, 'macro'),
    ("M_Re: ", recall_score, 'macro'),
    ("M_F1: ", f1_score, 'macro'),
    ("W_Pr: ", precision_score, 'weighted'),
    ("W_Re: ", recall_score, 'weighted'),
    ("W_F1: ", f1_score, 'weighted'),
):
    print(tag, round(scorer(test_labels, y_pred_test, average=averaging), 4))

print("acc:", round(accuracy_score(test_labels, y_pred_test), 4))