In [105]:
import pandas as pd
import torch
from torch import nn
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification,BertModel
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
import random
import time

In [106]:
train_data = pd.read_csv('dataset/train.tsv', sep='\t')
val_data = pd.read_csv('dataset/val.tsv', sep='\t')
test_data = pd.read_csv('dataset/test.tsv', sep='\t')

print(train_data.head())

    id  source                                          utterance classes
0  356       6       Start by reading the preliminary information   OTHER
1  357       6  "Hello Ms. Klein, I am responsible for you in ...   OTHER
2  358       6                         Since yesterday afternoon?   OTHER
3  359       6  Hmm, but the shortness of breath only came whi...   OTHER
4  360       6  Have you ever laid down during the day when yo...   OTHER


In [107]:
num_classes = 6 


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)
tag_list=["AM","MS","OTHER","PH","SF","SR"]
tag_to_idx={}
for i in range(len(tag_list)):
    tag_to_idx[tag_list[i]]=i
    
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        utterance = str(self.data.iloc[idx]['utterance'])
        labels = self.data.iloc[idx]['classes'] 
        labels = self.label_encoded(labels)
        encoding = self.tokenizer.encode_plus(
            utterance,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.float32)
        }
    def label_encoded(self,labels):
        encode_label = np.zeros(6,dtype=int)

        list_label = labels.split(",")
        for i in list_label:
            idx = tag_to_idx[i]
            encode_label[idx] = 1
        # encode_label = torch.LongTensor(encode_label)
        return encode_label 

# 設定一些超參數
max_len = 128
batch_size = 32

# 建立訓練、驗證和測試數據集
train_dataset = CustomDataset(train_data, tokenizer, max_len)
val_dataset = CustomDataset(val_data, tokenizer, max_len)


# 使用DataLoader加載數據
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)



In [108]:
import torch

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
    
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 2 GPU(s) available.
Device name: NVIDIA GeForce RTX 4090


In [109]:
class Bert_classifier(nn.Module):
    def __init__(self,num_classes,freeze_bert=False) :
        super(Bert_classifier,self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased',return_dict=False)

        self.dropout = nn.Dropout(0.3)
        self.hidden = nn.Linear(768,300)
        # self.hidden2 = nn.Linear(486,300)
        self.out = nn.Linear(300,num_classes)

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
    
    def forward(self,input_ids,attention_mask):
        sequence_output , pooled_output = self.bert(input_ids = input_ids,attention_mask = attention_mask)
        pooled_output =  self.dropout(pooled_output)
        hidden = self.hidden(pooled_output)
        # hidden = self.hidden2(hidden)
        logits = self.out(hidden)

        return logits



In [110]:
from transformers import AdamW, get_linear_schedule_with_warmup

def init_model(epochs = 4):
    num_classes = 6 
    model = Bert_classifier(num_classes)
    model.to(device)
    # optimizer = AdamW(model.parameters(),lr=5e-5,eps=1e-8)
    optimizer = AdamW(model.parameters(),lr=5e-5,eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,num_warmup_steps=0,num_training_steps=total_steps)

    return model , optimizer , scheduler

In [111]:
loss_fn = nn.BCEWithLogitsLoss()
def evalute(dataloader,model):
    model.eval()
    y_pred = []
    y_target = []
    with torch.no_grad():
        for step, batch in enumerate(dataloader):
            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            logits = model(b_input_ids, b_attn_mask)
            y_pred.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())         
            y_target.extend(b_labels.cpu().detach().numpy().tolist())
    y_preds = (np.array(y_pred)>0.5).astype(int)
    marco_f1= f1_score(y_target,y_preds,average='macro')
    # print("marco f1 score : ",marco_f1)
    return marco_f1
def train(model,train_dataloader,val_dataloader,optimizer,scheduler,epochs,evaluation):
    print("Start training...\n")
    for epoch_i in range(epochs):

        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'F1 score':^9} | {'Elapsed':^9}")
        print("-"*70)

        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0
        for step, batch in enumerate(train_dataloader):
            model.train()
            batch_counts +=1

            b_input_ids = batch['input_ids'].to(device)
            b_attn_mask = batch['attention_mask'].to(device)
            b_labels = batch['labels'].to(device)
            model.zero_grad()
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            if (step % 20 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                time_elapsed = time.time() - t0_batch
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        avg_train_loss = total_loss / len(train_dataloader)

        print("-"*70)
        if evaluation == True:
    

            # Print performance over the entire training data
            time_elapsed = time.time() - t0_epoch
            score = evalute(val_dataloader,model)
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {'-':^10} | {score:^9.6f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")
    
    print("Training complete!")



In [112]:
epochs = 15
bert_classifier, optimizer, scheduler = init_model(epochs=epochs)
train(bert_classifier, train_dataloader, val_dataloader,optimizer,scheduler ,epochs=epochs, evaluation=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Start training...

 Epoch  |  Batch  |  Train Loss  |  Val Loss  | F1 score  |  Elapsed 
----------------------------------------------------------------------


TypeError: dropout(): argument 'input' (position 1) must be Tensor, not tuple

In [None]:
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        ids = self.data.iloc[idx]['id']
        utterance = str(self.data.iloc[idx]['utterance'])
        encoding = self.tokenizer.encode_plus(
            utterance,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'id' : ids,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
test_dataset = TestDataset(test_data, tokenizer, max_len)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [None]:
bert_classifier.eval()
y_pred = []
ids = []
with torch.no_grad():
    for batch  in test_dataloader:
        id_0 = batch['id'].cpu().item()
        ids.append(id_0)
        b_input_ids = batch['input_ids'].to(device)
        b_attn_mask = batch['attention_mask'].to(device)
        logits = bert_classifier(b_input_ids, b_attn_mask)
        y_pred.extend(torch.sigmoid(logits).cpu().detach().numpy().tolist())

In [None]:
y_preds = (np.array(y_pred) > 0.5).astype(int)
y_preds

array([[0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       ...,
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0]])

In [None]:
df = pd.DataFrame(y_preds, columns=tag_list)
df_id =pd.DataFrame(ids, columns=["id"])
merged_df = pd.concat([df_id, df], axis=1)
merged_df

Unnamed: 0,id,AM,MS,OTHER,PH,SF,SR
0,1384,0,0,1,0,0,0
1,1385,0,0,1,0,0,0
2,1386,0,0,1,0,0,0
3,1387,0,0,1,0,0,0
4,1388,0,1,0,0,0,0
...,...,...,...,...,...,...,...
622,1843,0,0,1,0,0,0
623,1844,0,0,0,0,1,0
624,1845,0,0,1,0,0,0
625,1846,0,0,1,0,0,0


In [None]:
data_rows = merged_df.to_dict(orient='records')

In [None]:
import csv
with open("submission.csv", 'w', newline='') as csvfile:
    fieldnames=["id","AM","MS","OTHER","PH","SF","SR"]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data_rows)