In [1]:
import pandas as pd
import torch
from torch import nn
import numpy as np
from transformers import AutoTokenizer,AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
import random
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data = pd.read_csv('dataset/train.tsv', sep='\t')
val_data = pd.read_csv('dataset/val.tsv', sep='\t')
test_data = pd.read_csv('dataset/test.tsv', sep='\t')
train_data = pd.concat([train_data, val_data], axis=0)
train_data

Unnamed: 0,id,source,utterance,classes
0,356,6,Start by reading the preliminary information,OTHER
1,357,6,"""Hello Ms. Klein, I am responsible for you in ...",OTHER
2,358,6,Since yesterday afternoon?,OTHER
3,359,6,"Hmm, but the shortness of breath only came whi...",OTHER
4,360,6,Have you ever laid down during the day when yo...,OTHER
...,...,...,...,...
402,1526,23,Is this the right career for you? Do you have ...,SF
403,1527,23,Your nightmares aren't about your work either?...,OTHER
404,1528,23,But that started at some point? So it's been m...,OTHER
405,1529,23,Are there any illnesses in your family?,SF


In [3]:
num_classes = 6 


model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tag_list=["AM","MS","OTHER","PH","SF","SR"]
tag_to_idx={}
for i in range(len(tag_list)):
    tag_to_idx[tag_list[i]]=i
    
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        utterance = str(self.data.iloc[idx]['utterance'])
        labels = self.data.iloc[idx]['classes'] 
        labels = self.label_encoded(labels)
        encoding = self.tokenizer.encode_plus(
            utterance,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float32)
        }
    def label_encoded(self,labels):
        encode_label = np.zeros(6,dtype=int)

        list_label = labels.split(",")
        for i in list_label:
            idx = tag_to_idx[i]
            encode_label[idx] = 1
        # encode_label = torch.LongTensor(encode_label)
        return encode_label 

# 設定一些超參數
max_len = 256
batch_size = 8

# 建立訓練、驗證和測試數據集
train_dataset = CustomDataset(train_data, tokenizer, max_len)
val_dataset = CustomDataset(val_data, tokenizer, max_len)


# 使用DataLoader加載數據
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)





In [4]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
Device name: NVIDIA GeForce RTX 4090


In [5]:
from transformers import  get_linear_schedule_with_warmup
from torch.optim import AdamW 
def init_model(model,epochs = 4):
    num_classes = 6 
    model.to(device)
    # optimizer = AdamW(model.parameters(),lr=5e-5,eps=1e-8)
    optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=500,num_training_steps=total_steps)

    return model , optimizer , scheduler

In [6]:
loss_fn = nn.BCEWithLogitsLoss()
def evalute(dataloader,model):
    model.eval()
    y_pred = []
    y_target = []
    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            logits = model(b_input_ids, b_attn_mask)
            y_pred.extend(torch.sigmoid(logits.logits).cpu().detach().numpy().tolist())         
            y_target.extend(b_labels.cpu().detach().numpy().tolist())
    y_preds = (np.array(y_pred)>0.5).astype(int)
    marco_f1= f1_score(y_target,y_preds,average='macro')
    # print("marco f1 score : ",marco_f1)
    return marco_f1
def train(model,train_dataloader,val_dataloader,optimizer,scheduler,path,epochs,evaluation):
    print("Start training...\n")
    max_score = 0
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'F1 score':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch_counts +=1

            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            outputs = model(b_input_ids, attention_mask=b_attn_mask, labels=b_labels)
            logits = outputs.logits
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()    
            model.zero_grad()
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        avg_train_loss = total_loss / len(train_dataloader)
        
        print("-"*70)
        if evaluation == True:
            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            score = evalute(val_dataloader,model)
            if score > max_score:
                # path = "w_weight.pth"
                torch.save(model.state_dict(),path)
                # print('save model')
                max_score = score
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {'-':^10} | {score:^9.6f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")
    print("best score: ",max_score)



In [7]:
model_list = []
for i in range(5):
    epochs = 30

    model = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=num_classes)
    bert_classifier, optimizer, scheduler = init_model(model,epochs=epochs)
    path = f"w_weight{i}.pth"
    train(bert_classifier, train_dataloader, val_dataloader,optimizer,scheduler ,path,epochs=epochs, evaluation=True)
    model_list.append(model)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | F1 score  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.763474   |     -      |     -     |   4.51   
   1    |   40    |   0.708146   |     -      |     -     |   4.08   
   1    |   60    |   0.563215   |     -      |     -     |   4.08   
   1    |   80    |   0.449957   |     -      |     -     |   4.07   
   1    |   100   |   0.402523   |     -      |     -     |   4.07   
   1    |   120   |   0.404912   |     -      |     -     |   4.08   
   1    |   140   |   0.392533   |     -      |     -     |   4.08   
   1    |   160   |   0.379983   |     -      |     -     |   4.08   
   1    |   180   |   0.364242   |     -      |     -     |   4.08   
   1    |   200   |   0.349820   |     -      |     -     |   4.09   
   1    |   220   |   0.370013   |     -      |     -     |   4.08   
   1    |   231   |   0.381131   |     -      |     -     |   2.19   


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | F1 score  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.709685   |     -      |     -     |   4.22   
   1    |   40    |   0.652686   |     -      |     -     |   4.06   
   1    |   60    |   0.488382   |     -      |     -     |   4.06   
   1    |   80    |   0.428442   |     -      |     -     |   4.07   
   1    |   100   |   0.391628   |     -      |     -     |   4.07   
   1    |   120   |   0.368286   |     -      |     -     |   4.07   
   1    |   140   |   0.351563   |     -      |     -     |   4.07   
   1    |   160   |   0.379839   |     -      |     -     |   4.06   
   1    |   180   |   0.353398   |     -      |     -     |   4.07   
   1    |   200   |   0.362540   |     -      |     -     |   4.07   
   1    |   220   |   0.336318   |     -      |     -     |   4.06   
   1    |   231   |   0.313325   |     -      |     -     |   2.17   


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | F1 score  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.652002   |     -      |     -     |   4.21   
   1    |   40    |   0.593110   |     -      |     -     |   4.07   
   1    |   60    |   0.477774   |     -      |     -     |   4.07   
   1    |   80    |   0.383189   |     -      |     -     |   4.07   
   1    |   100   |   0.393653   |     -      |     -     |   4.07   
   1    |   120   |   0.372386   |     -      |     -     |   4.06   
   1    |   140   |   0.357692   |     -      |     -     |   4.07   
   1    |   160   |   0.380396   |     -      |     -     |   4.06   
   1    |   180   |   0.362771   |     -      |     -     |   4.07   
   1    |   200   |   0.355239   |     -      |     -     |   4.07   
   1    |   220   |   0.376050   |     -      |     -     |   4.07   
   1    |   231   |   0.319801   |     -      |     -     |   2.17   


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | F1 score  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.677252   |     -      |     -     |   4.20   
   1    |   40    |   0.633619   |     -      |     -     |   4.07   
   1    |   60    |   0.505932   |     -      |     -     |   4.07   
   1    |   80    |   0.437085   |     -      |     -     |   4.08   
   1    |   100   |   0.387434   |     -      |     -     |   4.14   
   1    |   120   |   0.367115   |     -      |     -     |   4.07   
   1    |   140   |   0.378621   |     -      |     -     |   4.07   
   1    |   160   |   0.385137   |     -      |     -     |   4.08   
   1    |   180   |   0.385779   |     -      |     -     |   4.06   
   1    |   200   |   0.352034   |     -      |     -     |   4.07   
   1    |   220   |   0.381421   |     -      |     -     |   4.07   
   1    |   231   |   0.346924   |     -      |     -     |   2.17   


Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | F1 score  |  Elapsed 
----------------------------------------------------------------------
   1    |   20    |   0.678093   |     -      |     -     |   4.26   
   1    |   40    |   0.627473   |     -      |     -     |   4.06   
   1    |   60    |   0.511726   |     -      |     -     |   4.07   
   1    |   80    |   0.413163   |     -      |     -     |   4.08   
   1    |   100   |   0.413339   |     -      |     -     |   4.07   
   1    |   120   |   0.358356   |     -      |     -     |   4.07   
   1    |   140   |   0.324625   |     -      |     -     |   4.07   
   1    |   160   |   0.371440   |     -      |     -     |   4.06   
   1    |   180   |   0.368297   |     -      |     -     |   4.06   
   1    |   200   |   0.331010   |     -      |     -     |   4.07   
   1    |   220   |   0.369087   |     -      |     -     |   4.06   
   1    |   231   |   0.364512   |     -      |     -     |   2.17   


In [8]:
def essem_evalute(dataloader,model_list):
    model0 = model_list[0]
    model1 = model_list[1]
    model2 = model_list[2]
    model3 = model_list[3]
    model4 = model_list[4]
    model0.eval()
    model1.eval()
    model2.eval()
    model3.eval()
    model4.eval()
    y_pred = []
    y_target = []
    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            logits0 = model0(b_input_ids, b_attn_mask)
            logits1 = model1(b_input_ids, b_attn_mask)
            logits2 = model2(b_input_ids, b_attn_mask)
            logits3 = model3(b_input_ids, b_attn_mask)
            logits4 = model4(b_input_ids, b_attn_mask)
            tmp = torch.sigmoid(logits0.logits)+torch.sigmoid(logits1.logits)+torch.sigmoid(logits2.logits)+torch.sigmoid(logits3.logits)+torch.sigmoid(logits4.logits)
            tmp = tmp / 5
            y_pred.extend(tmp.cpu().detach().numpy().tolist())         
            y_target.extend(b_labels.cpu().detach().numpy().tolist())
    y_preds = (np.array(y_pred)>0.5).astype(int)
    marco_f1= f1_score(y_target,y_preds,average='macro')
    # print("marco f1 score : ",marco_f1)
    return marco_f1
print(essem_evalute(val_dataloader,model_list))

0.9851981644803415


In [9]:
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ids = self.data.iloc[idx]['id']
        utterance = str(self.data.iloc[idx]['utterance'])
        encoding = self.tokenizer.encode_plus(
            utterance,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'id' : ids,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
test_dataset = TestDataset(test_data, tokenizer, max_len)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [10]:
prdict_model_list=[]
for i in range(5):
    path = f'./w_weight{i}.pth'
    bert_classifier = AutoModelForSequenceClassification.from_pretrained(model_name,num_labels=num_classes).to(device)
    bert_classifier.load_state_dict(torch.load(path))
    prdict_model_list.append(bert_classifier)
    bert_classifier.eval()
y_pred = []
ids = []
model0 = prdict_model_list[0]
model1 = prdict_model_list[1]
model2 = prdict_model_list[2]
model3 = prdict_model_list[3]
model4 = prdict_model_list[4]
with torch.no_grad():
    for batch  in test_dataloader:
        id_0 = batch['id'].cpu().item()
        ids.append(id_0)
        b_input_ids = batch['input_ids'].to(device)
        b_attn_mask = batch['attention_mask'].to(device)
        logits0 = model0(b_input_ids, b_attn_mask)
        logits1 = model1(b_input_ids, b_attn_mask)
        logits2 = model2(b_input_ids, b_attn_mask)
        logits3 = model3(b_input_ids, b_attn_mask)
        logits4 = model4(b_input_ids, b_attn_mask)
        tmp = torch.sigmoid(logits0.logits)+torch.sigmoid(logits1.logits)+torch.sigmoid(logits2.logits)+torch.sigmoid(logits3.logits)+torch.sigmoid(logits4.logits)
        tmp = tmp / 5
        y_pred.extend(tmp.cpu().detach().numpy().tolist())  

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
y_preds = (np.array(y_pred) > 0.5).astype(int)
y_preds

array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1]])

In [12]:
df = pd.DataFrame(y_preds, columns=tag_list)
df_id =pd.DataFrame(ids, columns=["id"])
merged_df = pd.concat([df_id, df], axis=1)
merged_df

Unnamed: 0,id,AM,MS,OTHER,PH,SF,SR
0,1384,0,0,1,0,0,0
1,1385,0,0,1,0,0,0
2,1386,0,0,1,0,0,0
3,1387,0,0,1,0,0,0
4,1388,0,1,0,0,0,0
...,...,...,...,...,...,...,...
622,1843,0,0,0,0,1,0
623,1844,0,0,0,0,1,0
624,1845,0,0,0,0,0,1
625,1846,0,0,0,0,0,1


In [13]:
data_rows = merged_df.to_dict(orient='records')

In [14]:
import csv
with open("submission.csv", 'w', newline='') as csvfile:
    fieldnames=["id","AM","MS","OTHER","PH","SF","SR"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data_rows)