In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

from datasets import load_dataset
import random
from sklearn import metrics
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset

import transformers
from transformers import AdamW, get_linear_schedule_with_warmup, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MAX_LENGTH  = 512
TRAIN_RATIO = 0.85
VAL_RATIO   = 0.15

BATCH_SIZE  = 16 
LEARNING_RATE = 2e-5
EPOCHS = 5
N_LABELS = 28

In [3]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    # Some cudnn methods can be random even after fixing the seed 
    # unless you tell it to be deterministic
    torch.backends.cudnn.deterministic = True

seed_everything(1234)

In [4]:
go_emotions = load_dataset("go_emotions")
data = go_emotions.data

train = data["train"].to_pandas()
valid = data["validation"].to_pandas()
test = data["test"].to_pandas()

print(train.shape, valid.shape, test.shape)
# (43410, 3) (5426, 3) (5427, 3)

train.head()

No config specified, defaulting to: go_emotions/simplified
Found cached dataset go_emotions (C:/Users/bmarquescost/.cache/huggingface/datasets/go_emotions/simplified/0.0.0/2637cfdd4e64d30249c3ed2150fa2b9d279766bfcd6a809b9f085c61a90d776d)
100%|██████████| 3/3 [00:00<00:00, 69.77it/s]

(43410, 3) (5426, 3) (5427, 3)





Unnamed: 0,text,labels,id
0,My favourite food is anything I didn't have to...,[27],eebbqej
1,"Now if he does off himself, everyone will thin...",[27],ed00q6i
2,WHY THE FUCK IS BAYLESS ISOING,[2],eezlygj
3,To make her feel threatened,[14],ed7ypvh
4,Dirty Southern Wankers,[3],ed0bdzj


In [5]:
def one_hot_encoder(df):
    one_hot_encoding = []
    for i in tqdm(range(len(df))):
        temp = [0] * N_LABELS
        label_indices = df.iloc[i]["labels"]
        for index in label_indices:
            temp[index] = 1
        one_hot_encoding.append(temp)
        
    return pd.DataFrame(one_hot_encoding)

train_ohe_labels = one_hot_encoder(train)
valid_ohe_labels = one_hot_encoder(valid)
test_ohe_labels = one_hot_encoder(test)

print(train_ohe_labels.shape)
#(43410, 28)

train = pd.concat([train, train_ohe_labels], axis=1)
valid = pd.concat([valid, valid_ohe_labels], axis=1)
test = pd.concat([test, test_ohe_labels], axis=1)

100%|██████████| 43410/43410 [00:01<00:00, 26812.84it/s]
100%|██████████| 5426/5426 [00:00<00:00, 26861.46it/s]
100%|██████████| 5427/5427 [00:00<00:00, 28119.51it/s]

(43410, 28)





In [6]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        self.len = len(labels)
    
    def __len__(self):
        return self.len
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()} 
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [7]:
def build_dataset(train, valid, tokenizer):
    train_tokenized = tokenizer(train['text'].to_list(), return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH)
    valid_tokenized = tokenizer(valid['text'].to_list(), return_tensors='pt', padding=True, truncation=True, max_length=MAX_LENGTH)

    train_dataset = TextDataset(train_tokenized, train[range(N_LABELS)].values.tolist())
    valid_dataset = TextDataset(valid_tokenized, valid[range(N_LABELS)].values.tolist())
    
    return train_dataset, valid_dataset

def build_dataloader(train_dataset, valid_dataset):
    train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    valid_data_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=True)

    return train_data_loader, valid_data_loader

def get_model(base):
  model = AutoModelForSequenceClassification.from_pretrained(base, num_labels=N_LABELS)
  
  return model

In [8]:
def get_optimizer(model):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    opt = AdamW(optimizer_parameters, lr=LEARNING_RATE)
    return opt

In [9]:
def get_scheduler(optimizer, num_train_steps):
    sch = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    return sch

In [10]:
def loss_fn(logits, labels):
    if labels is None:
        return None
    
    loss_function = nn.BCEWithLogitsLoss()
    return loss_function(logits.view(-1, len(labels)), labels.type_as(logits).view(-1, len(labels)))

In [11]:
def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()
    
    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    auc_micro = metrics.auc(fpr_micro, tpr_micro)
    
    return {"auc_micro": auc_micro}

In [12]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    running_train_loss = 0.0
    
    model.train()
    bar = tqdm(total=len(data_loader), desc=f'Training', unit="steps", position=0, leave=False)

    for batch_id, batch in enumerate(data_loader):
        ids, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']
        
        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(input_ids=ids, attention_mask=mask)
        
        logits = outputs.logits

        loss = loss_fn(logits, labels)
        loss.backward()

        running_train_loss += loss.item()
        
        optimizer.step()
        scheduler.step()
        
        bar.update(1)

    return running_train_loss
    

def eval_fn(data_loader, model, device):
    eval_loss = 0.0
    model.eval()
    
    fin_labels = []
    fin_outputs = []
   
    bar = tqdm(total=len(data_loader), desc=f'Validation', unit="steps", position=0, leave=False)

    with torch.no_grad():
        for batch_id, batch in enumerate(data_loader):
            ids, mask, labels = batch['input_ids'], batch['attention_mask'], batch['labels']

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)

            outputs = model(input_ids=ids, attention_mask=mask)

            logits = outputs.logits

            loss = loss_fn(logits, labels)
            eval_loss += loss.item()
            
            fin_labels.extend(labels)
            fin_outputs.extend(torch.sigmoid(logits))

            bar.update(1)

    return eval_loss, fin_outputs, fin_labels

In [13]:
def trainer():
    base = 'distilbert-base-uncased'
    tokenizer = transformers.AutoTokenizer.from_pretrained(base)

    train_dataset, valid_dataset = build_dataset(tokenizer)
    train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset)
    
    print("Length of Train Dataloader: ", len(train_data_loader))
    print("Length of Valid Dataloader: ", len(valid_data_loader))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_train_steps = int(len(train_dataset) / BATCH_SIZE * 10)

    model = get_model(base)
    optimizer = get_optimizer(model)
    scheduler = get_scheduler(optimizer, n_train_steps)
    
    model.to(device)

    log = {}

    best_val_loss = 100
    print('Training model')
    
    for epoch in tqdm(range(EPOCHS)):
        train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
        eval_loss, preds, labels = eval_fn(valid_data_loader, model, device)
        
        auc_score = log_metrics(preds, labels)["auc_micro"]
        print("AUC score: ", auc_score)
        avg_train_loss, avg_val_loss = train_loss / len(train_data_loader), eval_loss / len(valid_data_loader)
        
        log[epoch + 1] = {
            "train_loss": avg_train_loss,
            "val_loss": avg_val_loss,
            "auc_score": auc_score,
        }

        print("Average Train loss: ", avg_train_loss)
        print("Average Valid loss: ", avg_val_loss)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "./best_model.pt")  
            print("Model saved as current val_loss is: ", best_val_loss)  
    
    return log

In [14]:
base = 'distilbert-base-uncased'
tokenizer = transformers.AutoTokenizer.from_pretrained(base)

train_dataset, valid_dataset = build_dataset(train, valid, tokenizer)
train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset)

print("Length of Train Dataloader: ", len(train_data_loader))
print("Length of Valid Dataloader: ", len(valid_data_loader))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

n_train_steps = int(len(train_dataset) / BATCH_SIZE * 10)

model = get_model(base)
optimizer = get_optimizer(model)
scheduler = get_scheduler(optimizer, n_train_steps)

model.to(device)

log = {}

best_val_loss = np.inf

for epoch in range(EPOCHS):
    print(f'Epoch {epoch} starting')
    train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    eval_loss, preds, labels = eval_fn(valid_data_loader, model, device)
    
    auc_score = log_metrics(preds, labels)["auc_micro"]
    print("AUC score: ", auc_score)
    avg_train_loss, avg_val_loss = train_loss / len(train_data_loader), eval_loss / len(valid_data_loader)
    
    log[epoch + 1] = {
        "train_loss": avg_train_loss,
        "val_loss": avg_val_loss,
        "auc_score": auc_score,
    }

    print("Average Train loss: ", avg_train_loss)
    print("Average Valid loss: ", avg_val_loss)

    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "./best_model.pt")  
        print("Model saved as current val_loss is: ", best_val_loss) 

Length of Train Dataloader:  2714
Length of Valid Dataloader:  340


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'classifier.bias', 'pre_classifier.we

Epoch 0 starting


                                                                

AUC score:  0.9490670460823749
Average Train loss:  0.12044907149748277
Average Valid loss:  0.08805634068215594
Model saved as current val_loss is:  0.08805634068215594
Epoch 1 starting


                                                                

AUC score:  0.9575188830185227
Average Train loss:  0.08296550197393668
Average Valid loss:  0.08330132675302379
Model saved as current val_loss is:  0.08330132675302379
Epoch 2 starting


                                                                

AUC score:  0.9579044720232768
Average Train loss:  0.07227193667809359
Average Valid loss:  0.0837940646609401
Epoch 3 starting


                                                                

AUC score:  0.9554431703277195
Average Train loss:  0.06205002740217687
Average Valid loss:  0.08805994073695997
Epoch 4 starting


                                                                

AUC score:  0.9511291737515543
Average Train loss:  0.05237170420662557
Average Valid loss:  0.09458804812063189


