In [None]:
!pip install tweet-preprocessor

In [6]:
import pandas as pd
import re, string
import numpy as np

In [7]:
import random, torch

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Seed value handed to every random number generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    # The CUDA generators are only seeded when a GPU is reported as available.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(42)

In [8]:
import torch
# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
device

'cuda:0'

In [9]:
# CSV files for the three splits of the Spanish dataset.
# The train and validation files are read by `read_and_preprocessing`; the test
# file is read by `read_and_preprocessing_for_test`.
path_train = "/kaggle/input/hope-task-2-spanish/train_spanish.csv"
path_val = "/kaggle/input/hope-task-2-spanish/val_spanish.csv"
path_test = "/kaggle/input/hope-task-2-spanish/test_spanish.csv"

# Read dataset

In [10]:
import string,re
def preprocessing_text(text):
    """Clean a single raw text before it is tokenized.

    Steps, in order:
      1. Run tweet-preprocessor's ``clean`` on the text.
      2. Strip leading and trailing whitespace.
      3. Delete every ASCII punctuation character except the underscore.
      4. Collapse each run of whitespace into one space and strip again.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Imported locally because no cell of the notebook binds the name `p`;
    # without this, a fresh "Restart & Run All" fails with NameError on the
    # first call. `preprocessor` is the module installed by the
    # tweet-preprocessor package in the first cell.
    import preprocessor as p

    # All of string.punctuation is removed, except "_" which is kept.
    punctuation_to_drop = string.punctuation.replace("_", "")

    text = p.clean(text).strip()
    text = text.translate(str.maketrans('', '', punctuation_to_drop))
    return re.sub('\\s+', ' ', text).strip()

## Preprocessing


In [12]:
import pandas as pd

def convert_label(text):
    """Map a label string to a binary class id.

    Args:
        text: The label value to convert.

    Returns:
        1 when `text` equals "Hope", 0 for any other value.
    """
    return 1 if text == "Hope" else 0

def read_and_preprocessing(path_data):
    """Load a labelled CSV split and return cleaned texts, binary labels and ids.

    Args:
        path_data: Path of a CSV file with "text", "binary" and "id" columns.

    Returns:
        A tuple ``(x_input, y_output, ids)`` of three lists: the texts after
        `preprocessing_text`, the labels after `convert_label`, and the values
        of the "id" column.
    """
    frame = pd.read_csv(path_data)
    labels = frame["binary"].apply(convert_label).tolist()
    texts = frame["text"].apply(preprocessing_text).tolist()
    return texts, labels, frame["id"].tolist()

# Load and clean the training and validation splits.
train_texts, train_labels, train_ids = read_and_preprocessing(path_train)
valid_texts, valid_labels, valid_ids = read_and_preprocessing(path_val)

# Show the number of texts next to the number of labels for each split.
print(f"{len(train_texts)} {len(train_labels)}")
print(f"{len(valid_texts)} {len(valid_labels)}")

6903 6903
1150 1150


In [13]:
# Put each cleaned training text next to its label for a quick visual check.
df_train = pd.DataFrame({"x_data": train_texts, "y_output": train_labels})
df_train.head()

Unnamed: 0,x_data,y_output
0,Mientras me persigno y le rezo a la Virgen del...,0
1,No Yo ya no estoy para esperar Ni para rogar n...,0
2,Creo que estamos ante el mejor episodio de est...,1
3,Bueno ojalá q cuando llegue este fin de semana...,1
4,USER USER Marcos de boludo no tiene un pelo pe...,0


# Build Filter Classifier

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Checkpoint to fine-tune; change this name to try a different pretrained model.
model_name = "microsoft/deberta-v3-base" #Try different models here 

# The labels produced by `convert_label` are 0/1, so the classification head is
# requested with two outputs explicitly. `ignore_mismatched_sizes=True` lets a
# checkpoint whose saved head has a different shape load anyway.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    ignore_mismatched_sizes=True,
)
# `ignore_mismatched_sizes` is a model-loading option and has no meaning for a
# tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=512)

In [15]:
# Upper bound on the tokenized length; also read by `make_prediction`.
max_length = 512

# Tokenize every training text in one call, truncating to `max_length`.
train_encodings = tokenizer(
    train_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
)

In [16]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    Attributes:
        encodings: Mapping from field name to a per-example indexable sequence.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized training texts and their labels for the Trainer.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)

In [17]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration. No evaluation dataset is passed to the Trainer
# below, so this run only reports the training loss.
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,             # total number of training epochs
    learning_rate=2e-5,              # initial learning rate for the optimizer
    per_device_train_batch_size=32,  # batch size per device during training
    warmup_steps=100,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,               # log the training loss every 100 steps
    save_total_limit = 1,            # cap on the number of checkpoints kept
    report_to="tensorboard"          # logging integration to report to
)

trainer = Trainer(
    model=bert_model,                    # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
)
trainer.train()
# SECURITY: what appears to be an API key was pasted here as a comment and has
# been removed. Treat that key as leaked: revoke/rotate it, and supply secrets
# through environment variables or the platform's secret store, never in the
# notebook source.

2024-04-16 17:32:38.350182: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-16 17:32:38.350277: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-16 17:32:38.488668: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
100,0.6351
200,0.4929
300,0.3878
400,0.3876
500,0.3229
600,0.297
700,0.2434
800,0.2302
900,0.1995
1000,0.1443


TrainOutput(global_step=2160, training_loss=0.19965193867683412, metrics={'train_runtime': 906.2221, 'train_samples_per_second': 76.173, 'train_steps_per_second': 2.384, 'total_flos': 4327796582974800.0, 'train_loss': 0.19965193867683412, 'epoch': 10.0})

# Inference

In [18]:
def make_prediction(review, tokenizer, trainer):
    """Predict the class id for a single text.

    The text is cleaned with `preprocessing_text`, tokenized as a batch of one
    (truncated to the module-level `max_length`), wrapped in a `CustomDataset`
    with a placeholder label of 0, and passed to `trainer.predict`.

    Args:
        review: The text to classify.
        tokenizer: Tokenizer called on the cleaned text.
        trainer: Object whose `predict` method is run on the one-item dataset.

    Returns:
        The argmax over axis 1 of the first element of the `predict` result,
        for the single input row, as a Python scalar.
    """
    cleaned = preprocessing_text(review)
    encodings = tokenizer([cleaned], truncation=True, max_length=max_length, padding=True)
    # The label is required by CustomDataset but is only a placeholder here.
    single_item_dataset = CustomDataset(encodings, [0])
    scores = trainer.predict(single_item_dataset)[0]
    return np.argmax(scores, axis=1).flatten().tolist()[0]

In [None]:
# Predict the validation texts one at a time.
# NOTE(review): `valid_texts` was already cleaned by `read_and_preprocessing`,
# and `make_prediction` runs `preprocessing_text` on it again - confirm that the
# second pass is intended.
y_pred = [make_prediction(review, tokenizer, trainer) for review in valid_texts]

# Compare the first few predictions with the gold labels.
print(y_pred[:10])
print(valid_labels[:10])

In [20]:
# Import only the metrics that are used instead of `import *`, so it is clear
# where each name comes from and nothing else is pulled into the namespace.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Macro-averaged precision / recall / F1 on the validation split.
print("M_Pr: ", round(precision_score(valid_labels, y_pred, average='macro'),4))
print("M_Re: ", round(recall_score(valid_labels, y_pred, average='macro'),4))
print("M_F1: ", round(f1_score(valid_labels, y_pred, average='macro'),4))

# The same three metrics with weighted averaging.
print("W_Pr: ", round(precision_score(valid_labels, y_pred, average='weighted'),4))
print("W_Re: ", round(recall_score(valid_labels, y_pred, average='weighted'),4))
print("W_F1: ", round(f1_score(valid_labels, y_pred, average='weighted'),4))

print("acc:", round(accuracy_score(valid_labels, y_pred), 4))

M_Pr:  0.791
M_Re:  0.8099
M_F1:  0.7988
W_Pr:  0.8316
W_Re:  0.8235
W_F1:  0.8262
acc: 0.8235


In [21]:
# # Calculate precision, recall, and F1-score for different averaging methods
# M_Pr = round(precision_score(valid_labels, y_pred, average='macro'), 4)
# M_Re = round(recall_score(valid_labels, y_pred, average='macro'), 4)
# M_F1 = round(f1_score(valid_labels, y_pred, average='macro'), 4)

# W_Pr = round(precision_score(valid_labels, y_pred, average='weighted'), 4)
# W_Re = round(recall_score(valid_labels, y_pred, average='weighted'), 4)
# W_F1 = round(f1_score(valid_labels, y_pred, average='weighted'), 4)

# acc = round(accuracy_score(valid_labels, y_pred), 4)

# # Create a string with the formatted output
# output_string = f"M_Pr: {M_Pr}\n" \
#                 f"M_Re: {M_Re}\n" \
#                 f"M_F1: {M_F1}\n" \
#                 f"W_Pr: {W_Pr}\n" \
#                 f"W_Re: {W_Re}\n" \
#                 f"W_F1: {W_F1}\n" \
#                 f"acc: {acc}\n"

# # Save the output to a text file named "scores.txt"
# file_path = "my_scores.txt"
# with open(file_path, "w") as f:
#     f.write(output_string)

# Submission for the validation dataset

In [22]:
# def convert2category(y_pred):
#     y_label = []
#     for y in y_pred:
#         if y == 0:
#               y_label.append("Not Hope")
#         else:
#               y_label.append("Hope")
#     return y_label

In [23]:
# name_sub = "predictions.csv"
# y_pred_label = convert2category(y_pred)
# df_sub = pd.DataFrame(list(zip(valid_ids, y_pred_label)),
#                columns =['id', 'category'])
# df_sub.to_csv(name_sub)
# df_sub.head()

# Submission for the test dataset

In [24]:
def read_and_preprocessing_for_test(path_data):
    """Load an unlabelled CSV split and return its cleaned texts and ids.

    Args:
        path_data: Path of a CSV file with "text" and "id" columns.

    Returns:
        A tuple ``(x_input, ids)`` of two lists: the texts after
        `preprocessing_text` and the values of the "id" column.
    """
    frame = pd.read_csv(path_data)
    cleaned_texts = frame["text"].apply(preprocessing_text).tolist()
    return cleaned_texts, frame["id"].tolist()

# Load and clean the test split, then show how many texts it contains.
test_texts, test_ids = read_and_preprocessing_for_test(path_test)
print(f"{len(test_texts)}")

# `make_prediction` is already defined in the "Inference" section above with an
# identical body, so it is not redefined here: a second copy only shadows the
# first and lets the two definitions drift apart.

1152


In [None]:
# Predict the test texts one at a time.
# NOTE(review): `test_texts` was already cleaned by
# `read_and_preprocessing_for_test`, and `make_prediction` cleans it again -
# confirm that the second pass is intended.
y_pred_test = [make_prediction(review, tokenizer, trainer) for review in test_texts]

In [26]:
def convert2category(y_pred):
    """Turn predicted class ids into category names.

    Args:
        y_pred: Iterable of predicted class ids.

    Returns:
        A list with "Not Hope" for every id equal to 0 and "Hope" for every
        other id, in the same order as the input.
    """
    return ["Not Hope" if class_id == 0 else "Hope" for class_id in y_pred]

In [27]:
# Build the submission table from the test ids and predicted categories, then
# write it to disk.
name_sub = "predictions.csv"
y_pred_label = convert2category(y_pred_test)
df_sub = pd.DataFrame({"id": test_ids, "category": y_pred_label})
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra first column - confirm the submission format expects it.
df_sub.to_csv(name_sub)
df_sub.head()

Unnamed: 0,id,category
0,3020,Not Hope
1,7123,Not Hope
2,5378,Not Hope
3,8396,Not Hope
4,7885,Hope
