In [1]:
!nvidia-smi

Thu Nov  7 14:27:26 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla P100-PCIE-16GB           Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P0             27W /  250W |       0MiB /  16384MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                     

In [2]:
!pip install transformers



In [3]:
import torch

import torch.nn as nn

import pandas as pd

import numpy as np

import shutil

import sys   

import os
import gc
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings
from tqdm import tqdm
# Ẩn tất cả các cảnh báo
warnings.filterwarnings("ignore")


In [4]:
# Tiền xử lý dữ liệu
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'\W+', ' ', sentence)
    words = sentence.split()
    words = [word for word in words if word not in stop_words]
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load train file

In [5]:
train_path = r'/kaggle/input/sem-eval-2025/public_data/train/track_a/eng.csv'

print(train_path)
# Đảm bảo rằng tqdm có thể hoạt động với pandas
tqdm.pandas()

train_df = pd.read_csv(train_path)
# for idx in tqdm(range(len(train_df['text']))):
#     train_df['text'].iloc[idx]=preprocess(train_df['text'].iloc[idx])

train_df['text']=train_df['text'].apply(preprocess)

train_df.head()

print(train_df.shape)

print(train_df.columns)

/kaggle/input/sem-eval-2025/public_data/train/track_a/eng.csv
(2768, 7)
Index(['id', 'text', 'Anger', 'Fear', 'Joy', 'Sadness', 'Surprise'], dtype='object')


In [6]:
target_list = ['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']

In [7]:
# hyperparameters



MAX_LEN = 128



TRAIN_BATCH_SIZE = 32



VALID_BATCH_SIZE = 32



EPOCHS = 30



LEARNING_RATE = 1e-06

In [8]:
from transformers import BertTokenizer, BertModel,AutoTokenizer, AutoModelForSequenceClassification

In [9]:
derberta_tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')

#bertweet_tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

In [10]:
import torch



class CustomDataset(torch.utils.data.Dataset):

    def __init__(self, df, tokenizer, max_len):

        self.tokenizer = tokenizer

        self.df = df

        self.texts = df['text']  # Lấy cột văn bản

        self.targets = df[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']].values  # Lấy các cột nhãn

        self.max_len = max_len



    def __len__(self):

        return len(self.texts)



    def __getitem__(self, index):

        text = str(self.texts.iloc[index])

        text = " ".join(text.split()) 



        inputs = self.tokenizer.encode_plus(

            text,

            None,

            add_special_tokens=True,

            max_length=self.max_len,

            padding='max_length',

            return_token_type_ids=True,

            truncation=True,

            return_attention_mask=True,

            return_tensors='pt'

        )



        return {

            'input_ids': inputs['input_ids'].flatten(),

            'attention_mask': inputs['attention_mask'].flatten(),

            'token_type_ids': inputs["token_type_ids"].flatten(),

            'targets': torch.FloatTensor(self.targets[index])  # Chuyển đổi nhãn thành tensor

        }


In [11]:
train_size = 0.8



from sklearn.model_selection import train_test_split



train_df, val_df = train_test_split(train_df, test_size=1-train_size, random_state=42)



# Bây giờ bạn có thể kiểm tra val_df

print(train_df.head())

                           id  \
2124  eng_train_track_a_02125   
2716  eng_train_track_a_02717   
2232  eng_train_track_a_02233   
261   eng_train_track_a_00262   
2059  eng_train_track_a_02060   

                                                   text  Anger  Fear  Joy  \
2124  need ice calf often tri stay except occasion s...      0     1    0   
2716                              rememb scare shitless      0     1    0   
2232           began feel familiar feel heart rip chest      0     1    0   
261                                        ill pay back      0     0    1   
2059                                settl dri eye sleev      0     0    0   

      Sadness  Surprise  
2124        0         0  
2716        0         0  
2232        1         0  
261         0         0  
2059        1         0  


In [12]:
# ensemble = EnsembleModelForSequenceClassification.from_multiple_pretrained(
#     "bert-base-uncased", "distilroberta-base", "xlnet-base-cased"
# )
train_dataset = CustomDataset(train_df, derberta_tokenizer, MAX_LEN)



valid_dataset = CustomDataset(val_df, derberta_tokenizer, MAX_LEN)

In [13]:
# train_data_loader['input_ids']

val_df.head()

Unnamed: 0,id,text,Anger,Fear,Joy,Sadness,Surprise
1378,eng_train_track_a_01379,smoke weed alon tendenc becom hungri,0,0,1,0,0
839,eng_train_track_a_00840,noth fine grey tan sand far eye could see,0,0,0,1,0
2164,eng_train_track_a_02165,even drive back money hotel,0,1,0,1,0
2619,eng_train_track_a_02620,never freak happen enough small got use,0,1,0,0,0
927,eng_train_track_a_00928,damag done scar broken collar bone,0,1,0,1,0


In [14]:
train_data_loader = torch.utils.data.DataLoader(train_dataset, 



    batch_size=TRAIN_BATCH_SIZE,



    shuffle=True,



    num_workers=0



)







val_data_loader = torch.utils.data.DataLoader(valid_dataset, 



    batch_size=VALID_BATCH_SIZE,



    shuffle=False,



    num_workers=0



)

In [15]:
device1 = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
#device2 = torch.device('cuda:1') if torch.cuda.is_available() else torch.device('cpu')

In [16]:
def load_ckp(checkpoint_fpath, model, optimizer):



    """



    checkpoint_path: path to save checkpoint



    model: model that we want to load checkpoint parameters into       



    optimizer: optimizer we defined in previous training



    """



    # load check point



    checkpoint = torch.load(checkpoint_fpath)



    # initialize state_dict from checkpoint to model



    model.load_state_dict(checkpoint['state_dict'])



    # initialize optimizer from checkpoint to optimizer



    optimizer.load_state_dict(checkpoint['optimizer'])



    # initialize valid_loss_min from checkpoint to valid_loss_min



    valid_loss_min = checkpoint['valid_loss_min']



    # return model, optimizer, epoch value, min validation loss 



    return model, optimizer, checkpoint['epoch'], valid_loss_min







def save_ckp(state, is_best, checkpoint_path, best_model_path):



    """



    state: checkpoint we want to save



    is_best: is this the best checkpoint; min validation loss



    checkpoint_path: path to save checkpoint



    best_model_path: path to save best model



    """



    f_path = checkpoint_path



    # save checkpoint data to the path given, checkpoint_path



    torch.save(state, f_path)



    # if it is a best model, min validation loss



    if is_best:



        best_fpath = best_model_path



        # copy that checkpoint file to best path given, best_model_path



        shutil.copyfile(f_path, best_fpath)

In [17]:
!pip install setfit

Collecting setfit
  Downloading setfit-1.1.0-py3-none-any.whl.metadata (12 kB)
Collecting sentence-transformers>=3 (from sentence-transformers[train]>=3->setfit)
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting evaluate>=0.3.0 (from setfit)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading setfit-1.1.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.2/75.2 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers, evaluate, setfit
Successfully installed evaluate

In [18]:
from transformers import AutoModel
from setfit import SetFitModel


class DEBERTClass(nn.Module):

    def __init__(self):
        super(DEBERTClass, self).__init__()

        # Load the pretrained BERT model without a classification head
        self.bert_model = AutoModel.from_pretrained('microsoft/deberta-v3-base')
        #self.bert_model = SetFitModel.from_pretrained("HelgeKn/SemEval-multi-label-v2")

        # Define a custom classifier with multiple linear layers
        self.classifier = nn.Sequential(
            nn.Linear(768, 5),  # First linear layer
            nn.Dropout(0.5),      # Dropout for regularization
        )

    def forward(self, input_ids, attn_mask, token_type_ids):
        # Get the last hidden state from the BERT model
        bert_output = self.bert_model(input_ids=input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
        
        # Take the output of the [CLS] token (first token) for classification
        cls_output = bert_output.last_hidden_state[:, 0, :]

        # Pass through the custom classifier to get logits
        logits = self.classifier(cls_output)

        return logits



# Instantiate the model and move it to the device

derberta_model = DEBERTClass()

derberta_model.to(device1)


  self.comm = Comm(**args)


pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

DEBERTClass(
  (bert_model): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-11): 12 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
             

In [19]:
class BERTWEETClass(nn.Module):

    def __init__(self):
        super(BERTWEETClass, self).__init__()

        # Load the pretrained BERT model without a classification head
        self.bert_model = AutoModel.from_pretrained('vinai/bertweet-base')
        #self.bert_model = SetFitModel.from_pretrained("HelgeKn/SemEval-multi-label-v2")

        # Define a custom classifier with multiple linear layers
        self.classifier = nn.Sequential(
            nn.Linear(768, 5),  # First linear layer
            nn.Dropout(0.5),      # Dropout for regularization
        )

    def forward(self, input_ids, attn_mask, token_type_ids):
        # Get the last hidden state from the BERT model
        bert_output = self.bert_model(input_ids=input_ids, attention_mask=attn_mask, token_type_ids=token_type_ids)
        
        # Take the output of the [CLS] token (first token) for classification
        cls_output = bert_output.last_hidden_state[:, 0, :]

        # Pass through the custom classifier to get logits
        logits = self.classifier(cls_output)

        return logits





In [20]:
def loss_fn(outputs, targets):



    return torch.nn.BCEWithLogitsLoss()(outputs, targets)







de_optimizer = torch.optim.Adam(params =  derberta_model.parameters(), lr=LEARNING_RATE)


In [21]:
val_targets=[]



val_outputs=[]

In [22]:
def train_model(n_epochs, training_loader, validation_loader, model, 



                optimizer, checkpoint_path, best_model_path):

  # initialize tracker for minimum validation loss

  valid_loss_min = np.Inf

  for epoch in range(1, n_epochs+1):

    train_loss = 0

    valid_loss = 0
    min_loss=np.Inf
    early_stopping=0
    model.train()

    print('############# Epoch {}: Training Start   #############'.format(epoch))

    for batch_idx, data in enumerate(training_loader):

        #print('yyy epoch', batch_idx)

        ids = data['input_ids'].to(device1, dtype = torch.long)

        mask = data['attention_mask'].to(device1, dtype = torch.long)

        token_type_ids = data['token_type_ids'].to(device1, dtype = torch.long)

        targets = data['targets'].to(device1, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)



        optimizer.zero_grad()

        loss = loss_fn(outputs, targets)
        if loss<min_loss:
            min_loss=loss
            early_stopping=0
        else:
            early_stopping+=1
            if early_stopping>200:
                print('Stopping due to early stopping\n')
                break



        if batch_idx%2000==0:

            print(f'Epoch: {epoch}, Training Loss:  {loss.item()}')



        



        optimizer.zero_grad()



        loss.backward()



        optimizer.step()



        #print('before loss data in training', loss.item(), train_loss)



        train_loss = train_loss + ((1 / (batch_idx + 1)) * (loss.item() - train_loss))



        #print('after loss data in training', loss.item(), train_loss)



    



    print('############# Epoch {}: Training End     #############'.format(epoch))



    



    print('############# Epoch {}: Validation Start   #############'.format(epoch))

    ######################    



    # validate the model #



    ######################



 



    model.eval()



   



    with torch.no_grad():



      for batch_idx, data in enumerate(validation_loader, 0):



            ids = data['input_ids'].to(device1, dtype = torch.long)



            mask = data['attention_mask'].to(device1, dtype = torch.long)



            token_type_ids = data['token_type_ids'].to(device1, dtype = torch.long)



            targets = data['targets'].to(device1, dtype = torch.float)



            outputs = model(ids, mask, token_type_ids)







            loss = loss_fn(outputs, targets)



            valid_loss = valid_loss + ((1 / (batch_idx + 1)) * (loss.item() - valid_loss))



            val_targets.extend(targets.cpu().detach().numpy().tolist())



            val_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())







      print('############# Epoch {}: Validation End     #############'.format(epoch))



      # calculate average losses



      #print('before cal avg train loss', train_loss)



      train_loss = train_loss/len(training_loader)



      valid_loss = valid_loss/len(validation_loader)



      # print training/validation statistics 



      print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(



            epoch, 



            train_loss,



            valid_loss



            ))



      



      # create checkpoint variable and add important data



      checkpoint = {



            'epoch': epoch + 1,



            'valid_loss_min': valid_loss,



            'state_dict': model.state_dict(),



            'optimizer': optimizer.state_dict()



      }

        # save checkpoint



      save_ckp(checkpoint, False, checkpoint_path, best_model_path)



        



      ## TODO: save the model if validation loss has decreased



      if valid_loss <= valid_loss_min:



        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(valid_loss_min,valid_loss))



        # save checkpoint as best model



        save_ckp(checkpoint, True, checkpoint_path, best_model_path)


        valid_loss_min = valid_loss







    print('############# Epoch {}  Done   #############\n'.format(epoch))







  return model

In [23]:
de_ckpt_path = "/kaggle/working/de_curr_ckpt"



de_best_model_path = "/kaggle/working/deberta_best_model.pt"
#be_best_model_path = "/kaggle/working/bertweet_best_model.pt"

# TRAIN DEBERTA

In [24]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, derberta_model, de_optimizer, de_ckpt_path, de_best_model_path)

############# Epoch 1: Training Start   #############
Epoch: 1, Training Loss:  0.7786495089530945
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.010627 	Average Validation Loss: 0.039734
Validation loss decreased (inf --> 0.039734).  Saving model ...
############# Epoch 1  Done   #############

############# Epoch 2: Training Start   #############
Epoch: 2, Training Loss:  0.7113916873931885
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.009812 	Average Validation Loss: 0.035314
Validation loss decreased (0.039734 --> 0.035314).  Saving model ...
############# Epoch 2  Done   #############

############# Epoch 3: Training Start   #############
Epoch: 3, Training Loss:  0.640214622

In [25]:
import torch

import numpy as np

import pandas as pd



# Assuming you have the CustomDataset and other necessary parts defined elsewhere

test_path = '/kaggle/input/sem-eval-2025/public_data/dev/track_a/eng_a.csv'

test_df = pd.read_csv(test_path)
# for idx in tqdm(range(len(test_df['text']))):
#     test_df['text'].iloc[idx]=preprocess(test_df['text'].iloc[idx])
test_df['text']=test_df['text'].apply(preprocess)


test_dataset = CustomDataset(test_df, derberta_tokenizer, MAX_LEN)

test_data_loader = torch.utils.data.DataLoader(

    test_dataset,

    batch_size=VALID_BATCH_SIZE,

    shuffle=False,

    num_workers=0

)



# Load the trained model

model = DEBERTClass()  # Replace with your actual model class

checkpoint_path = "/kaggle/working/deberta_best_model.pt"  # Replace with your actual checkpoint path

model, optimizer, start_epoch, valid_loss_min = load_ckp(checkpoint_path, model, de_optimizer)

In [26]:
# Define inference function

def inference(testing_loader, model):

    model.eval()

    final_predictions = []

    

    with torch.no_grad():

        for _, data in enumerate(testing_loader, 0):

            # Move data to the GPU

            ids = data['input_ids'].to(device1, dtype=torch.long)

            mask = data['attention_mask'].to(device1, dtype=torch.long)

            token_type_ids = data['token_type_ids'].to(device1, dtype=torch.long)

            

            # Ensure model is also on the same device (in this case, GPU)

            model.to(device1)

            

            # Get the model predictions

            outputs = model(input_ids=ids, attn_mask=mask, token_type_ids=token_type_ids)

            

            # Apply sigmoid to get probabilities

            probs = torch.sigmoid(outputs).cpu().detach().numpy()  # Ensure outputs are moved to CPU for numpy compatibility

            

            # Apply threshold to convert probabilities to binary predictions (0 or 1)

            preds = (probs > 0.5).astype(int)

            

            # Store predictions

            final_predictions.extend(preds)

    

    return np.array(final_predictions)





# Perform inference on the test data

predictions1 = inference(test_data_loader, model)



# Update the test_df with the predictions

test_df[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']] = predictions1



# Save the updated DataFrame to a CSV file
test_df=test_df.drop(columns=['text'])
test_df.to_csv('pred_eng_a_deberta.csv', index=False)

print("Inference results saved to pred_eng_a_deberta.csv")

Inference results saved to pred_eng_a_deberta.csv


In [27]:
del model, optimizer, start_epoch, valid_loss_min,trained_model
gc.collect()

12

# TRAIN BERTWEET

In [28]:
# # Instantiate the model and move it to the device

# bertweet_model = BERTWEETClass()

# bertweet_model.to(device1)
# be_optimizer = torch.optim.Adam(params =  bertweet_model.parameters(), lr=LEARNING_RATE)

In [29]:
# val_targets=[]



# val_outputs=[]

In [30]:
# be_ckpt_path = "/kaggle/working/be_curr_ckpt"
# be_best_model_path = "/kaggle/working/bertweet_best_model.pt"

In [31]:
# trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, bertweet_model, be_optimizer, be_ckpt_path, be_best_model_path)

In [32]:


# # Assuming you have the CustomDataset and other necessary parts defined elsewhere

# test_path = '/kaggle/input/sem-eval-2025/public_data/dev/track_a/eng_a.csv'

# test_df = pd.read_csv(test_path)

# test_dataset = CustomDataset(test_df, bertweet_tokenizer, MAX_LEN)

# test_data_loader = torch.utils.data.DataLoader(

#     test_dataset,

#     batch_size=VALID_BATCH_SIZE,

#     shuffle=False,

#     num_workers=0

# )



# # Load the trained model

# model = BERTWEETClass()  # Replace with your actual model class

# checkpoint_path = '/kaggle/working/bertweet_best_model.pt'  # Replace with your actual checkpoint path

# model, optimizer, start_epoch, valid_loss_min = load_ckp(checkpoint_path, model, be_optimizer)

In [33]:
# # Define inference function

# def inference(testing_loader, model):

#     model.eval()

#     final_predictions = []

    

#     with torch.no_grad():

#         for _, data in enumerate(testing_loader, 0):

#             # Move data to the GPU

#             ids = data['input_ids'].to(device1, dtype=torch.long)

#             mask = data['attention_mask'].to(device1, dtype=torch.long)

#             token_type_ids = data['token_type_ids'].to(device1, dtype=torch.long)

            

#             # Ensure model is also on the same device (in this case, GPU)

#             model.to(device1)

            

#             # Get the model predictions

#             outputs = model(input_ids=ids, attn_mask=mask, token_type_ids=token_type_ids)

            

#             # Apply sigmoid to get probabilities

#             probs = torch.sigmoid(outputs).cpu().detach().numpy()  # Ensure outputs are moved to CPU for numpy compatibility

            

#             # Apply threshold to convert probabilities to binary predictions (0 or 1)

#             preds = (probs > 0.5).astype(int)

            

#             # Store predictions

#             final_predictions.extend(preds)

    

#     return np.array(final_predictions)





# # Perform inference on the test data

# predictions2 = inference(test_data_loader, model)

# predictions=(predictions1+predictions2)/2

# # Update the test_df with the predictions

# test_df[['Anger', 'Fear', 'Joy', 'Sadness', 'Surprise']] = predictions



# # Save the updated DataFrame to a CSV file
# test_df=test_df.drop(columns=['text'])
# # test_df.to_csv('pred_eng_a.csv', index=False)

# # print("Inference results saved to pred_eng_a.csv")