# Loading DataSet

In [103]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [104]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast

# specify GPU
device = torch.device("cuda")
torch.cuda.is_available()

True

In [105]:
device

device(type='cuda')

In [106]:
from datetime import datetime
parameters = {
    "num_class": 8,
    "time": str(datetime.now()).replace(" ", "_"),
    "seed": 1111,
    # Hyperparameters
    "model_name": 'BERT',
    "config": 'bert-base-uncased',
    "learning_rate": 1e-4,
    "epochs": 3,
    "max_len": 512,
    "batch_size": 16,
    "dropout": 0.1,
    "activation": 'Prelu',
    "hidden_dim": 384,
}

In [107]:
import pandas as pd
folder_name = 'dm-2024-isa-5810-lab-2-homework'
data_identification = pd.read_csv(folder_name + '/data_identification.csv')
emotion = pd.read_csv(folder_name + '/emotion.csv')
sample_submission = pd.read_csv(folder_name + '/sampleSubmission.csv')

print(data_identification)
print(data_identification.shape)
print(f"{'='*40}")
print(emotion)
print(emotion.shape)
print(f"{'='*40}")
print(sample_submission)
print(f"{'='*40}")

df_twitter = pd.read_json(folder_name + '/tweets_DM.json', lines=True)
train_ids = data_identification[data_identification['identification'] == 'train']['tweet_id'].tolist()
test_ids = data_identification[data_identification['identification'] == 'test']['tweet_id'].tolist()

print("Show ids of train and test\n")
print(len(train_ids))
print(len(test_ids))
print(len(train_ids) + len(test_ids))

df_twitter_expanded = pd.json_normalize(df_twitter['_source'])

print("After expand the tweet_id, tweet_hashtag...\n")
df_twitter['tweet_id'] = df_twitter_expanded['tweet.tweet_id']
df_twitter['text'] = df_twitter_expanded['tweet.text']
df_twitter['hash_tags'] = df_twitter_expanded['tweet.hashtags']

df_twitter_train = df_twitter[df_twitter['tweet_id'].isin(train_ids)]
df_twitter_test = df_twitter[df_twitter['tweet_id'].isin(test_ids)]

print("After saperate train and test:\n")
print(df_twitter_train.shape)
print(df_twitter_test.shape)

df_twitter_train = pd.merge(df_twitter_train, emotion, on='tweet_id', how='left')

         tweet_id identification
0        0x28cc61           test
1        0x29e452          train
2        0x2b3819          train
3        0x2db41f           test
4        0x2a2acc          train
...           ...            ...
1867530  0x227e25          train
1867531  0x293813          train
1867532  0x1e1a7e          train
1867533  0x2156a5          train
1867534  0x2bb9d2          train

[1867535 rows x 2 columns]
(1867535, 2)
         tweet_id       emotion
0        0x3140b1       sadness
1        0x368b73       disgust
2        0x296183  anticipation
3        0x2bd6e1           joy
4        0x2ee1dd  anticipation
...           ...           ...
1455558  0x38dba0           joy
1455559  0x300ea2           joy
1455560  0x360b99          fear
1455561  0x22eecf           joy
1455562  0x2fb282  anticipation

[1455563 rows x 2 columns]
(1455563, 2)
              id   emotion
0       0x2c7743  surprise
1       0x2c1eed  surprise
2       0x2826ea  surprise
3       0x356d9a  surprise
4  

### Use label encoder to turn word label into number

In [108]:
from sklearn.preprocessing import LabelEncoder

emotion_label = df_twitter_train['emotion']
label_encoder = LabelEncoder()
label_encoder.fit(emotion_label)
print('check label: ', label_encoder.classes_)

emotion_label = label_encoder.fit_transform(df_twitter_train['emotion'])

# 檢查轉換結果
print("Encoded Labels:", emotion_label[:4])
print("Mapping:", dict(zip(label_encoder.classes_, range(len(label_encoder.classes_)))))

df_twitter_train['label'] = emotion_label
df_twitter_train.head()

check label:  ['anger' 'anticipation' 'disgust' 'fear' 'joy' 'sadness' 'surprise'
 'trust']
Encoded Labels: [1 5 3 4]
Mapping: {'anger': 0, 'anticipation': 1, 'disgust': 2, 'fear': 3, 'joy': 4, 'sadness': 5, 'surprise': 6, 'trust': 7}


Unnamed: 0,_score,_index,_source,_crawldate,_type,tweet_id,text,hash_tags,emotion,label
0,391,hashtag_tweets,"{'tweet': {'hashtags': ['Snapchat'], 'tweet_id...",2015-05-23 11:42:47,tweets,0x376b20,"People who post ""add me on #Snapchat"" must be ...",[Snapchat],anticipation,1
1,433,hashtag_tweets,"{'tweet': {'hashtags': ['freepress', 'TrumpLeg...",2016-01-28 04:52:09,tweets,0x2d5350,"@brianklaas As we see, Trump is dangerous to #...","[freepress, TrumpLegacy, CNN]",sadness,5
2,376,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x1cd5...",2016-01-24 23:53:05,tweets,0x1cd5b0,Now ISSA is stalking Tasha 😂😂😂 <LH>,[],fear,3
3,120,hashtag_tweets,"{'tweet': {'hashtags': ['authentic', 'LaughOut...",2015-06-11 04:44:05,tweets,0x1d755c,@RISKshow @TheKevinAllison Thx for the BEST TI...,"[authentic, LaughOutLoud]",joy,4
4,1021,hashtag_tweets,"{'tweet': {'hashtags': [], 'tweet_id': '0x2c91...",2015-08-18 02:30:07,tweets,0x2c91a8,Still waiting on those supplies Liscus. <LH>,[],anticipation,1


In [109]:
train_data, val_data = train_test_split(df_twitter_train, random_state=42, test_size=0.2)

print(train_data.shape)
print(val_data.shape)

(1164450, 10)
(291113, 10)


In [110]:
train_data = pd.DataFrame({"text": train_data['text'], "label": train_data['label']})
val_data = pd.DataFrame({"text": val_data['text'], "label": val_data['label']})
print(train_data.head())
print(val_data.head())

                                                     text  label
834097  For those of you who have followed me, thanks ...      4
355739  When you have to take a day off from work in o...      5
625638  Wherever you are; be all there. That’s how <LH...      4
678647  @DesiMountaineer  Would be nice for all the PS...      4
441397  69 The moments in your life are only once #Lif...      7
                                                      text  label
970345   Been a #week now #since I <LH> my #Mom. I #mis...      6
1145883  Follow our Librarian, Ms. Bird 🐦 for more info...      4
468264   Wonder if the guys who skate in Foxboro over t...      4
949718   @vanillablack1 Bloody <LH> puts it mildly, wil...      4
982592   Beat the Dolphins next week and we are back to...      4


In [120]:
from transformers import AutoTokenizer
import transformers

config_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(config_name)

In [122]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
import torch
import torch.nn.functional as Fun

# Using Dataset to build DataLoader
class CustomDataset(Dataset):
    def __init__(self, mode, df, specify, args):
        assert mode in ["train", "val", "test"]  # 一般會切三份
        self.mode = mode
        self.df = df
        self.specify = specify # specify column of data (the column U use for predict)
        if self.mode != 'test':
          self.label = df['label']
        self.tokenizer = AutoTokenizer.from_pretrained(args["config"])
        self.max_len = args["max_len"]
        self.num_class = args["num_class"]
        
    def __len__(self):
        return len(self.df)
    
    # transform text to its number
    def tokenize(self,input_text):
        inputs = self.tokenizer.encode_plus(
            input_text,
            max_length = self.max_len,
            truncation = True,
            padding = 'max_length'
        )
        ids = inputs['input_ids'] # (512)
        mask = inputs['attention_mask'] # (512)
        token_type_ids = inputs["token_type_ids"] # (512)
        
        return ids, mask, token_type_ids

    # get single data
    def __getitem__(self, index):
        
        sentence = str(self.df[self.specify][index])
        ids, mask, token_type_ids = self.tokenize(sentence)

        if self.mode == "test":
            return torch.tensor(ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long), \
                torch.tensor(token_type_ids, dtype=torch.long)
        else:
            return torch.tensor(ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long), \
                torch.tensor(token_type_ids, dtype=torch.long), torch.tensor(self.label.iloc[index], dtype=torch.long)

In [123]:
import transformers
import pandas as pd

# load training data
# 你可以先 sample 部分資料去跑模型，有助於快速調整模型架構，畢竟資料愈多跑愈久
train_df = train_data.sample(4000, random_state=parameters['seed']).reset_index(drop=True)
train_dataset = CustomDataset('train', train_df, 'text', parameters)
train_loader = DataLoader(train_dataset, batch_size=parameters['batch_size'], shuffle=True)

# load validation data
val_df = val_data.sample(500, random_state=parameters['seed']).reset_index(drop=True)
val_dataset = CustomDataset('val', val_df, 'text', parameters)
val_loader = DataLoader(val_dataset, batch_size=parameters['batch_size'], shuffle=True)

In [124]:
import torch.nn as nn
import copy

# define different activation function
def get_activation(activation):
    if activation == 'Prelu':
        return nn.PReLU()
    elif activation == 'relu':
        return nn.ReLU()
    elif activation == 'sigmoid':
        return nn.Sigmoid()
    elif activation == 'gelu':
        return nn.GELU()
    elif activation == 'LeakyReLU':
        return nn.LeakyReLU()
    else:
        return nn.Tanh()
# Dense Layer
# It is composed of linear, dropout, and activation layers.
class Dense(nn.Module):
    def __init__(self, input_dim, output_dim, dropout_rate, activation='tanh'):
        super(Dense, self).__init__()
        self.hidden_layer = nn.Linear(input_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = get_activation(activation) # default tanh
        nn.init.xavier_uniform_(self.hidden_layer.weight) # you also can change the initialize method
    def forward(self, inputs):
        logits = self.hidden_layer(inputs)
        logits = self.dropout(logits)
        logits = self.activation(logits)
        return logits
# multi-layers
def _get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
# Hidden Layers
# It means there are many dense layers with the same dimension
class HiddenLayers(nn.Module):
    def __init__(self, dense_layer, num_layers):
        super(HiddenLayers, self).__init__()
        self.hidden_layers = _get_clones(dense_layer, num_layers)
    def forward(self, output):
        for layer in self.hidden_layers:
            output = layer(output)
        return output

In [125]:
from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel
import torch.nn as nn

# BERT Model
class BertClassifier(BertPreTrainedModel):
    def __init__(self, config, args):
        super(BertClassifier, self).__init__(config)
        self.bert = BertModel(config)
        self.num_labels = args["num_class"]
        self.dense = Dense(config.hidden_size, args["hidden_dim"], args["dropout"], args["activation"])
        self.classifier = Dense(args["hidden_dim"], self.num_labels, args["dropout"], args["activation"])
        self.init_weights()
    # forward function, data in the model will do this
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None,
                head_mask=None, inputs_embeds=None, labels=None, output_attentions=None,
                output_hidden_states=None, return_dict=None):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        # bert output
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )
        '''
        outputs.keys() -> odict_keys(['last_hidden_state', 'pooler_output'])
        outs.last_hidden_state.shape -> torch.Size([batch_size, 512, 768])
        outs.pooler_output.shape -> torch.Size([batch_size, 768])
        '''
        # get its [CLS] logits
        pooled_output = outputs[1] # (batch_size, 768)
        # add dense layer
        pooled_output = self.dense(pooled_output) # (batch_size, 384)
        # add linear classifier
        logits = self.classifier(pooled_output) # (batch_size, 2)
        return logits

In [126]:
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score# get predict result

def get_pred(logits):
    y_pred = torch.argmax(logits, dim = 1)
    return y_pred

# calculate confusion metrics
def cal_metrics(pred, ans, method):
    '''
    Parameter
    ---------
    pred: [list], predict class
    ans: [list], true class
    method: 'micro', 'weighted', 'macro'. # 如果有多分類的話計算上會有差別
    ---------
    '''
    if pred.get_device() != 'cpu':
        pred = pred.detach().cpu().numpy()
    if ans.get_device() != 'cpu':
        ans = ans.detach().cpu().numpy()
    # 將 zero_division 設為 0，表示當所有預測皆錯誤時，將結果視為 0 
    rec = recall_score(pred, ans, average=method, zero_division=0)
    f1 = f1_score(pred, ans, average=method, zero_division=0)
    prec = precision_score(pred, ans, average=method, zero_division=0)
    acc = accuracy_score(pred, ans)
    return acc, f1, rec, prec

In [127]:
import torch

model = BertClassifier.from_pretrained(parameters['config'], parameters).to(device)
loss_fct = nn.ECrossntropyLoss() # we use cross entrophy loss

## You can custom your optimizer (e.g. SGD .etc) ##
# we use Adam here
optimizer = torch.optim.Adam(model.parameters(), lr=parameters['learning_rate'], betas=(0.9, 0.999), eps=1e-9)

Some weights of BertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.activation.weight', 'classifier.hidden_layer.bias', 'classifier.hidden_layer.weight', 'dense.activation.weight', 'dense.hidden_layer.bias', 'dense.hidden_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
import torch.nn as nn
# evaluate dataloader
def evaluate(model, data_loader, device):
    val_loss, val_acc, val_f1, val_rec, val_prec = 0.0, 0.0, 0.0, 0.0, 0.0
    step_count = 0
    loss_fct = nn.CrossEntropyLoss()
    model.eval()
    with torch.no_grad():
        for data in data_loader:
            ids, masks, token_type_ids, labels = [t.to(device) for t in data]

            logits = model(input_ids = ids,
                    token_type_ids = token_type_ids,
                    attention_mask = masks)
            acc, f1, rec, prec = cal_metrics(get_pred(logits), labels, 'macro')
            loss = loss_fct(logits, labels) # 直接丟就好，不用特意做轉換（但如果非二分類，需考慮 one-hot 標籤的轉換）

            val_loss += loss.item()
            val_acc += acc
            val_f1 += f1
            val_rec += rec
            val_prec += prec
            step_count+=1

        val_loss = val_loss / step_count
        val_acc = val_acc / step_count
        val_f1 = val_f1 / step_count
        val_rec = val_rec / step_count
        val_prec = val_prec / step_count

    return val_loss, val_acc, val_f1, val_rec, val_prec

In [None]:
# save model to path
def save_checkpoint(save_path, model):
    if save_path == None:
        return
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to ==> {save_path}')

# load model from path
def load_checkpoint(load_path, model, device):
    if load_path==None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f'\nModel loaded from <== {load_path}')

    model.load_state_dict(state_dict)
    return model

In [None]:
import time
def train(model, train_loader, val_loader, optimizer, args, device):

    metrics = ['loss', 'acc', 'f1', 'rec', 'prec']
    mode = ['train_', 'val_']
    record = {s+m :[] for s in mode for m in metrics}

    loss_fct = nn.CrossEntropyLoss()

    for epoch in range(args["epochs"]):

        st_time = time.time()
        train_loss, train_acc, train_f1, train_rec, train_prec = 0.0, 0.0, 0.0, 0.0, 0.0
        step_count = 0

        model.train()
        for data in train_loader:

            ids, masks, token_type_ids, labels = [t.to(device) for t in data]

            optimizer.zero_grad()
    
            logits = model(input_ids = ids,
                    token_type_ids = token_type_ids,
                    attention_mask = masks)

            acc, f1, rec, prec = cal_metrics(get_pred(logits), labels, 'macro')
            loss = loss_fct(logits, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_acc += acc
            train_f1 += f1
            train_rec += rec
            train_prec += prec
            step_count += 1

        val_loss, val_acc, val_f1, val_rec, val_prec = evaluate(model, val_loader, device)

        train_loss = train_loss / step_count
        train_acc = train_acc / step_count
        train_f1 = train_f1 / step_count
        train_rec = train_rec / step_count
        train_prec = train_prec / step_count

        print('[epoch %d] cost time: %.4f s'%(epoch + 1, time.time() - st_time))
        print('         loss     acc     f1      rec    prec')
        print('train | %.4f, %.4f, %.4f, %.4f, %.4f'%(train_loss, train_acc, train_f1, train_rec, train_prec))
        print('val  | %.4f, %.4f, %.4f, %.4f, %.4f\n'%(val_loss, val_acc, val_f1, val_rec, val_prec))

        # record training metrics of each training epoch
        record['train_loss'].append(train_loss)
        record['train_acc'].append(train_acc)
        record['train_f1'].append(train_f1)
        record['train_rec'].append(train_rec)
        record['train_prec'].append(train_prec)
    
        record['val_loss'].append(val_loss)
        record['val_acc'].append(val_acc)
        record['val_f1'].append(val_f1)
        record['val_rec'].append(val_rec)
        record['val_prec'].append(val_prec)

    # save model
    save_checkpoint(args["model_name"] + '_' + args["time"].split('_')[0] + '.pt', model)

    return record

In [None]:
import matplotlib.pyplot as plt

# draw the learning curve
def draw_pic(record, name, img_save=False, show=False):
    x_ticks = range(1, parameters['epochs']+1)

    plt.figure(figsize=(6, 3))

    plt.plot(x_ticks, record['train_'+name], '-o', color='lightskyblue',
             markeredgecolor="teal", markersize=3, markeredgewidth=1, label = 'Train')
    plt.plot(x_ticks, record['val_'+name], '-o', color='pink',
             markeredgecolor="salmon", markersize=3, markeredgewidth=1, label = 'Val')
    plt.grid(color='lightgray', linestyle='--', linewidth=1)

    plt.title('Model', fontsize=14)
    plt.ylabel(name, fontsize=12)
    plt.xlabel('Epoch', fontsize=12)
    plt.xticks(x_ticks, fontsize=12)
    plt.yticks(fontsize=12)
    plt.legend(loc='lower right' if not name.lower().endswith('loss') else 'upper right')

    # define saved figure or not
    if img_save:
        plt.savefig(name+'.png', transparent=False, dpi=300)
    if show:
        plt.show()

    plt.close()

In [None]:
history = train(model, train_loader, val_loader, optimizer, parameters, device)

# draw all metrics figure
draw_pic(history, 'loss', img_save=True, show=False)
draw_pic(history, 'acc', img_save=True, show=False)
draw_pic(history, 'f1', img_save=True, show=False)
draw_pic(history, 'rec', img_save=True, show=False)
draw_pic(history, 'prec', img_save=True, show=False)

files = []
files.append('loss.png')
files.append('acc.png')
files.append('f1.png')
files.append('rec.png')
files.append('prec.png')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
def Softmax(x):
    return torch.exp(x) / torch.exp(x).sum()

# predict a single sentence
def predict_one(query, model):

  tokenizer = AutoTokenizer.from_pretrained(parameters['config'])
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model.eval()
  with torch.no_grad():
    inputs = tokenizer.encode_plus(
            query,
            max_length = parameters['max_len'],
            truncation = True,
            padding = 'max_length',
            return_tensors = 'pt'
        )

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    token_type_ids = inputs["token_type_ids"].to(device)

    # forward pass
    logits = model(input_ids, attention_mask, token_type_ids)
    probs = Softmax(logits) # get each class-probs
    label_index = torch.argmax(probs[0], dim=0)
    pred = label_index.item()

  return probs, pred

In [None]:
# You can load the model from the existing result
init_model = BertClassifier.from_pretrained(parameters['config'], parameters) # build an initial model
model = load_checkpoint('./bert.pt', init_model, device).to(device) # and load the weight of model from specify file

Some weights of BertClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.activation.weight', 'classifier.hidden_layer.bias', 'classifier.hidden_layer.weight', 'dense.activation.weight', 'dense.hidden_layer.bias', 'dense.hidden_layer.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  state_dict = torch.load(load_path, map_location=device)


FileNotFoundError: [Errno 2] No such file or directory: './bert.pt'