Reference: https://colab.research.google.com/github/rap12391/transformers_multilabel_toxic/blob/master/toxic_multilabel.ipynb

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from tqdm import tqdm, trange
from ast import literal_eval
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os

In [13]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

# Data loading

In [14]:


# The project root is the parent of the current working directory; the
# dataset folder is addressed directly beneath it.
root_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)
)
data_path = "{}/dataset".format(root_path)

In [15]:
# Read the three stage-1 splits (train / val / test) from their CSV files.
df_train, df_val, df_test = (
    pd.read_csv(f"{data_path}/stage1/{split}.csv")
    for split in ("train", "val", "test")
)


In [16]:
# The 'label' column is read from CSV as text; parse every cell back into
# the Python literal it spells out.
for frame in (df_train, df_val, df_test):
    frame['label'] = frame['label'].apply(ast.literal_eval)

In [17]:
# Cast the text column to str in every split so the tokenizer only ever
# receives strings.
for frame in (df_train, df_val, df_test):
    frame['processed_data'] = frame['processed_data'].astype("str")

# Tokenizer and model loading

## Phobert base

In [18]:
def phobert_base_tokenizer_loading():
    """Load the tokenizer of the 'vinai/phobert-base' checkpoint.

    Returns:
        The ``PhobertTokenizer`` built by ``from_pretrained``.
    """
    # NOTE(review): do_lower_case is forwarded unchanged; confirm that
    # PhobertTokenizer actually honours it.
    return PhobertTokenizer.from_pretrained('vinai/phobert-base', do_lower_case=True)
def phobert_base_model_loading():
    """Load 'vinai/phobert-base' as a sequence classifier with 8 labels.

    Returns:
        The ``RobertaForSequenceClassification`` built by ``from_pretrained``.
    """
    return RobertaForSequenceClassification.from_pretrained("vinai/phobert-base", num_labels=8)

## Phobert-large

In [19]:
def phobert_large_tokenizer_loading():
    """Load the tokenizer of the 'vinai/phobert-large' checkpoint.

    Returns:
        The ``PhobertTokenizer`` built by ``from_pretrained``.
    """
    # NOTE(review): do_lower_case is forwarded unchanged; confirm that
    # PhobertTokenizer actually honours it.
    return PhobertTokenizer.from_pretrained('vinai/phobert-large', do_lower_case=True)
def phobert_large_model_loading():
    """Load 'vinai/phobert-large' as a sequence classifier with 8 labels.

    Returns:
        The ``RobertaForSequenceClassification`` built by ``from_pretrained``.
    """
    return RobertaForSequenceClassification.from_pretrained("vinai/phobert-large", num_labels=8)

## Velectra base

In [None]:
def velectra_base_tokenizer_loading(do_lower_case=False):
    """Load the tokenizer of 'FPTAI/velectra-base-discriminator-cased'.

    Args:
        do_lower_case: Forwarded to ``ElectraTokenizer.from_pretrained``.
            Defaults to ``False`` because the checkpoint name marks it as
            cased; pass ``True`` to get the previous lower-casing behaviour.

    Returns:
        The ``ElectraTokenizer`` built by ``from_pretrained``.
    """
    # Lower-casing the input of a cased vocabulary throws away information
    # the checkpoint was built with.
    # NOTE(review): BERT-style tokenizers are documented to also strip
    # accents when lower-casing unless strip_accents is set - confirm for
    # this class; that would be especially harmful for Vietnamese text.
    return ElectraTokenizer.from_pretrained(
        'FPTAI/velectra-base-discriminator-cased',
        do_lower_case=do_lower_case,
    )

def velectra_base_model_loading():
    """Load 'FPTAI/velectra-base-discriminator-cased' as an 8-label classifier.

    Returns:
        The ``ElectraForSequenceClassification`` built by ``from_pretrained``.
    """
    return ElectraForSequenceClassification.from_pretrained(
        "FPTAI/velectra-base-discriminator-cased", num_labels=8
    )

## Bert-base

In [None]:
def bert_base_tokenizer_loading():
    """Load the lower-casing tokenizer of the 'bert-base-uncased' checkpoint.

    Returns:
        The ``BertTokenizer`` built by ``from_pretrained``.
    """
    return BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
def bert_base_model_loading():
    """Load 'bert-base-uncased' as a sequence classifier with 8 labels.

    Returns:
        The ``BertForSequenceClassification`` built by ``from_pretrained``.
    """
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=8)

## Bert-large

In [None]:
def bert_large_tokenizer_loading():
    """Load the lower-casing tokenizer of the 'bert-large-uncased' checkpoint.

    Returns:
        The ``BertTokenizer`` built by ``from_pretrained``.
    """
    return BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)

def bert_large_model_loading():
    """Load 'bert-large-uncased' as a sequence classifier with 8 labels.

    Returns:
        The ``BertForSequenceClassification`` built by ``from_pretrained``.
    """
    return BertForSequenceClassification.from_pretrained("bert-large-uncased", num_labels=8)

## Distilbert base

In [None]:
def distilbert_base_tokenizer_loading():
    """Load the lower-casing tokenizer of 'distilbert-base-uncased'.

    Returns:
        The ``DistilBertTokenizer`` built by ``from_pretrained``.
    """
    return DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

def distilbert_base_model_loading():
    """Load 'distilbert-base-uncased' as a sequence classifier with 8 labels.

    Returns:
        The ``DistilBertForSequenceClassification`` built by
        ``from_pretrained``.
    """
    return DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased", num_labels=8
    )

## Distilbert base multilingual (registered as `distilbert_large`)

In [None]:
def distilbert_large_tokenizer_loading(do_lower_case=False):
    """Load the tokenizer of 'distilbert-base-multilingual-cased'.

    Despite the "large" in the function name, the checkpoint loaded here is
    the base multilingual cased DistilBERT.

    Args:
        do_lower_case: Forwarded to ``DistilBertTokenizer.from_pretrained``.
            Defaults to ``False`` because the checkpoint name marks it as
            cased; pass ``True`` to get the previous lower-casing behaviour.

    Returns:
        The ``DistilBertTokenizer`` built by ``from_pretrained``.
    """
    # Lower-casing the input of a cased vocabulary throws away information
    # the checkpoint was built with.
    # NOTE(review): BERT-style tokenizers are documented to also strip
    # accents when lower-casing unless strip_accents is set - confirm for
    # this class.
    return DistilBertTokenizer.from_pretrained(
        'distilbert-base-multilingual-cased',
        do_lower_case=do_lower_case,
    )

def distilbert_large_model_loading():
    """Load 'distilbert-base-multilingual-cased' as an 8-label classifier.

    Despite the "large" in the function name, the checkpoint loaded here is
    the base multilingual cased DistilBERT.

    Returns:
        The ``DistilBertForSequenceClassification`` built by
        ``from_pretrained``.
    """
    return DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-multilingual-cased", num_labels=8
    )

## XLM_roberta_base

In [None]:
def xlm_base_tokenizer_loading():
    """Load the tokenizer of the 'xlm-roberta-base' checkpoint.

    Returns:
        The ``XLMRobertaTokenizer`` built by ``from_pretrained``.
    """
    # NOTE(review): do_lower_case is forwarded unchanged; confirm that
    # XLMRobertaTokenizer actually honours it.
    return XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', do_lower_case=True)

def xlm_base_model_loading():
    """Load 'xlm-roberta-base' as a sequence classifier with 8 labels.

    Returns:
        The ``XLMRobertaForSequenceClassification`` built by
        ``from_pretrained``.
    """
    return XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=8)


## XLM_roberta_large

In [None]:
def xlm_large_tokenizer_loading():
    """Load the tokenizer of the 'xlm-roberta-large' checkpoint.

    Returns:
        The ``XLMRobertaTokenizer`` built by ``from_pretrained``.
    """
    # NOTE(review): do_lower_case is forwarded unchanged; confirm that
    # XLMRobertaTokenizer actually honours it.
    return XLMRobertaTokenizer.from_pretrained('xlm-roberta-large', do_lower_case=True)

def xlm_large_model_loading():
    """Load 'xlm-roberta-large' as a sequence classifier with 8 labels.

    Returns:
        The ``XLMRobertaForSequenceClassification`` built by
        ``from_pretrained``.
    """
    return XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-large", num_labels=8)

## ViBert-base

In [None]:
def vibert_base_tokenizer_loading(do_lower_case=False):
    """Load the tokenizer of the 'FPTAI/vibert-base-cased' checkpoint.

    Args:
        do_lower_case: Forwarded to ``BertTokenizer.from_pretrained``.
            Defaults to ``False`` because the checkpoint name marks it as
            cased; pass ``True`` to get the previous lower-casing behaviour.

    Returns:
        The ``BertTokenizer`` built by ``from_pretrained``.
    """
    # Lower-casing the input of a cased vocabulary throws away information
    # the checkpoint was built with.
    # NOTE(review): BertTokenizer is documented to also strip accents when
    # lower-casing unless strip_accents is set - confirm; that would be
    # especially harmful for Vietnamese text.
    return BertTokenizer.from_pretrained(
        'FPTAI/vibert-base-cased',
        do_lower_case=do_lower_case,
    )

def vibert_base_model_loading():
    """Load 'FPTAI/vibert-base-cased' as a sequence classifier with 8 labels.

    Returns:
        The ``BertForSequenceClassification`` built by ``from_pretrained``.
    """
    return BertForSequenceClassification.from_pretrained("FPTAI/vibert-base-cased", num_labels=8)

## Roberta-base

In [None]:
def roberta_base_tokenizer_loading():
    """Load the tokenizer of the 'roberta-base' checkpoint.

    Returns:
        The ``RobertaTokenizer`` built by ``from_pretrained``.
    """
    # NOTE(review): do_lower_case is forwarded unchanged; confirm that
    # RobertaTokenizer actually honours it.
    return RobertaTokenizer.from_pretrained("roberta-base", do_lower_case=True)

def roberta_base_model_loading():
    """Load 'roberta-base' as a sequence classifier with 8 labels.

    Returns:
        The ``RobertaForSequenceClassification`` built by ``from_pretrained``.
    """
    return RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=8)

## Roberta-large

In [None]:
def roberta_large_tokenizer_loading():
    """Load the tokenizer of the 'roberta-large' checkpoint.

    Returns:
        The ``RobertaTokenizer`` built by ``from_pretrained``.
    """
    # NOTE(review): do_lower_case is forwarded unchanged; confirm that
    # RobertaTokenizer actually honours it.
    return RobertaTokenizer.from_pretrained("roberta-large", do_lower_case=True)

def roberta_large_model_loading():
    """Load 'roberta-large' as a sequence classifier with 8 labels.

    Returns:
        The ``RobertaForSequenceClassification`` built by ``from_pretrained``.
    """
    return RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=8)

# Encoding loading

In [None]:
# Build every registered [tokenizer, model] pair, then encode the three
# splits with the tokenizer of the selected entry.
# NOTE(review): this instantiates all twelve checkpoints even though only
# one entry is used below - consider loading only the selected one.
max_length = 93

model_dict = {
    'phobert_base': [phobert_base_tokenizer_loading(), phobert_base_model_loading()],
    'phobert_large': [phobert_large_tokenizer_loading(), phobert_large_model_loading()],
    'velectra_base': [velectra_base_tokenizer_loading(), velectra_base_model_loading()],
    'bert_base': [bert_base_tokenizer_loading(), bert_base_model_loading()],
    'bert_large': [bert_large_tokenizer_loading(), bert_large_model_loading()],
    'distilbert_base': [distilbert_base_tokenizer_loading(), distilbert_base_model_loading()],
    # The tokenizer loader must be *called*; storing the bare function here
    # would make model_dict['distilbert_large'][0] unusable as a tokenizer.
    'distilbert_large': [distilbert_large_tokenizer_loading(), distilbert_large_model_loading()],
    'xlm_base': [xlm_base_tokenizer_loading(), xlm_base_model_loading()],
    'xlm_large': [xlm_large_tokenizer_loading(), xlm_large_model_loading()],
    'vibert_base': [vibert_base_tokenizer_loading(), vibert_base_model_loading()],
    'roberta_base': [roberta_base_tokenizer_loading(), roberta_base_model_loading()],
    'roberta_large': [roberta_large_tokenizer_loading(), roberta_large_model_loading()],
}
## Choose the model list here
tokenizer = model_dict['phobert_base'][0]

# Pad or truncate every text to exactly max_length tokens so the encodings
# can be turned into rectangular tensors. padding='max_length' together with
# truncation=True is the explicit spelling of what the deprecated
# pad_to_max_length=True (plus max_length) requested.
train_encodings = tokenizer.batch_encode_plus(
    df_train['processed_data'].tolist(),
    max_length=max_length, padding='max_length', truncation=True,
)
val_encodings = tokenizer.batch_encode_plus(
    df_val['processed_data'].tolist(),
    max_length=max_length, padding='max_length', truncation=True,
)
test_encodings = tokenizer.batch_encode_plus(
    df_test['processed_data'].tolist(),
    max_length=max_length, padding='max_length', truncation=True,
)

In [None]:
# Pull the token ids and the attention masks out of each split's encoding.
train_input_ids, train_attention_masks = (
    train_encodings['input_ids'], train_encodings['attention_mask']
)
val_input_ids, val_attention_masks = (
    val_encodings['input_ids'], val_encodings['attention_mask']
)
test_input_ids, test_attention_masks = (
    test_encodings['input_ids'], test_encodings['attention_mask']
)

In [None]:
# Token ids of each split as tensors.
train_inputs = torch.tensor(train_input_ids)
val_inputs = torch.tensor(val_input_ids)
test_inputs = torch.tensor(test_input_ids)

# Attention masks of each split as tensors.
train_masks = torch.tensor(train_attention_masks)
val_masks = torch.tensor(val_attention_masks)
test_masks = torch.tensor(test_attention_masks)

# Parsed 'label' column of each split as tensors.
train_labels = torch.tensor(df_train['label'].tolist())
val_labels = torch.tensor(df_val['label'].tolist())
test_labels = torch.tensor(df_test['label'].tolist())


In [None]:

batch_size = 32

# Bundle each split's (input ids, attention masks, labels) into a dataset.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Training batches are drawn in random order; the evaluation splits are
# read sequentially.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(dataset=val_data, batch_size=batch_size, sampler=val_sampler)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Take the model paired with the selected tokenizer and move it to the
# device chosen at the top of the notebook. Using .to(device) instead of
# .cuda() keeps the notebook runnable on CPU-only machines and keeps the
# model on the same device the batches are sent to.
model = model_dict['phobert_base'][1]
model.to(device)

# Model training

In [None]:
# Split the parameters into two groups so that biases and layer-norm
# weights are excluded from weight decay while everything else is decayed.
param_optimizer = list(model.named_parameters())
# 'gamma' / 'beta' are kept for checkpoints that use those legacy names.
# NOTE(review): 'LayerNorm.weight' is the usual Hugging Face layer-norm
# parameter name; confirm it matches the selected architecture.
no_decay = ['bias', 'gamma', 'beta', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    # The per-group option must be spelled 'weight_decay'; an unknown key
    # such as 'weight_decay_rate' is carried along but never applied.
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=True)

In [None]:
# Per-step training losses, kept for plotting.
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
# NOTE(review): the value below is outside that recommended range - confirm
# it is intentional.
epochs = 10
num_labels = 8

# Multi-label objective: one sigmoid/binary cross-entropy term per label.
# The loss module is built once instead of once per step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

    # Set our model to training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables for this epoch
    tr_loss = 0  # running loss
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the selected device
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass without labels; the first output is used as logits
        outputs = model(b_input_ids, attention_mask=b_input_mask)
        logits = outputs[0]
        # Labels are converted to the logits' dtype for the loss
        loss = loss_func(
            logits.view(-1, num_labels),
            b_labels.type_as(logits).view(-1, num_labels),
        )

        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()

        # Read the scalar loss once and reuse it for both trackers
        step_loss = loss.item()
        train_loss_set.append(step_loss)
        tr_loss += step_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    # Guard against an empty dataloader, which would divide by zero
    if nb_tr_steps:
        print("Train loss: {}".format(tr_loss/nb_tr_steps))
    else:
        print("Train loss: n/a (train_dataloader produced no batches)")

###############################################################################


## Evaluate on val set

In [None]:
# Validation

# Put the model in evaluation mode before predicting on the validation set
model.eval()

# Variables to gather full output
logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

# Predict
for i, batch in enumerate(val_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Forward pass; the first output is used as the logits
        outs = model(b_input_ids, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        # Per-label probabilities
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.to('cpu').numpy()
        pred_label = pred_label.to('cpu').numpy()
        b_labels = b_labels.to('cpu').numpy()

    # Store a CPU copy of the ids so the collected list does not keep
    # device memory alive after the loop
    tokenized_texts.append(b_input_ids.to('cpu'))
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten the per-batch outputs into one row per example
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Threshold the probabilities, then compute micro-averaged F1 and accuracy
threshold = 0.50
pred_bools = [pl > threshold for pl in pred_labels]
true_bools = [tl == 1 for tl in true_labels]
val_f1_accuracy = f1_score(true_bools, pred_bools, average='micro')*100
val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

print('F1 Validation Accuracy: ', val_f1_accuracy)
print('Flat Validation Accuracy: ', val_flat_accuracy)

In [None]:
def create_pred(pred_labels: list, threshold: float = 0.5) -> dict:
    """Regroup per-example label rows into per-aspect lists of 0/1 flags.

    Args:
        pred_labels: One row per example; each row is indexable and holds
            one score (or 0/1 label) per aspect, in the order of
            ``list_aspect`` below.
        threshold: A value strictly greater than this becomes 1, anything
            else becomes 0.

    Returns:
        A dict mapping each aspect name to a list with one 0/1 entry per
        example. For empty input every aspect maps to an empty list.

    Raises:
        IndexError: If a row has fewer entries than there are aspects.
    """
    list_aspect = ['stayingpower', 'texture', 'smell', 'price', 'others', 'colour', 'shipping', 'packing']
    return {
        asp: [1 if row[index] > threshold else 0 for row in pred_labels]
        for index, asp in enumerate(list_aspect)
    }

In [None]:
# Turn the predicted scores and the ground-truth labels into per-aspect
# 0/1 lists.
pred_dict, true_dict = (
    create_pred(rows) for rows in (pred_labels, true_labels)
)

In [None]:
# Print a classification report per aspect for the validation set and save
# each one as a CSV file.
list_aspect = ['stayingpower', 'texture', 'smell', 'price', 'others', 'colour', 'shipping', 'packing']

# NOTE(review): the output folder is hard-coded to 'distilbert_large' while
# the cells above select 'phobert_base' - confirm the folder matches the
# model that was actually trained.
report_dir = f"{root_path}/result/stage1_aspect_detection/distilbert_large/val"
# Create the folder first; to_csv does not create missing directories.
os.makedirs(report_dir, exist_ok=True)

for aspect in list_aspect:
    print(aspect)
    print(classification_report(true_dict[aspect], pred_dict[aspect]))
    # Same report again in dict form so it can be written as a table
    report = classification_report(true_dict[aspect], pred_dict[aspect], output_dict=True)
    df = pd.DataFrame(report).transpose()
    df.to_csv(f"{report_dir}/{aspect}.csv", index=True)
    print("*"*50)

## Evaluate on test set

In [None]:
# Test-set evaluation

# Put the model in evaluation mode before predicting on the test set
model.eval()

# Variables to gather full output
logit_preds, true_labels, pred_labels, tokenized_texts = [], [], [], []

# Predict
for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    with torch.no_grad():
        # Forward pass; the first output is used as the logits
        outs = model(b_input_ids, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        # Per-label probabilities
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.to('cpu').numpy()
        pred_label = pred_label.to('cpu').numpy()
        b_labels = b_labels.to('cpu').numpy()

    # Store a CPU copy of the ids so the collected list does not keep
    # device memory alive after the loop
    tokenized_texts.append(b_input_ids.to('cpu'))
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten the per-batch outputs into one row per example
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Threshold the probabilities, then compute micro-averaged F1 and accuracy.
# The results are stored and printed under test-specific names so they are
# not mistaken for (and do not overwrite) the validation metrics.
threshold = 0.50
pred_bools = [pl > threshold for pl in pred_labels]
true_bools = [tl == 1 for tl in true_labels]
test_f1_accuracy = f1_score(true_bools, pred_bools, average='micro')*100
test_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

print('F1 Test Accuracy: ', test_f1_accuracy)
print('Flat Test Accuracy: ', test_flat_accuracy)

In [None]:
def create_pred(pred_labels: list, threshold: float = 0.5) -> dict:
    """Regroup per-example label rows into per-aspect lists of 0/1 flags.

    Args:
        pred_labels: One row per example; each row is indexable and holds
            one score (or 0/1 label) per aspect, in the order of
            ``list_aspect`` below.
        threshold: A value strictly greater than this becomes 1, anything
            else becomes 0.

    Returns:
        A dict mapping each aspect name to a list with one 0/1 entry per
        example. For empty input every aspect maps to an empty list.

    Raises:
        IndexError: If a row has fewer entries than there are aspects.
    """
    list_aspect = ['stayingpower', 'texture', 'smell', 'price', 'others', 'colour', 'shipping', 'packing']
    return {
        asp: [1 if row[index] > threshold else 0 for row in pred_labels]
        for index, asp in enumerate(list_aspect)
    }

In [None]:
# Turn the predicted scores and the ground-truth labels into per-aspect
# 0/1 lists.
pred_dict, true_dict = (
    create_pred(rows) for rows in (pred_labels, true_labels)
)

In [None]:
# Print a classification report per aspect for the test set and save each
# one as a CSV file.
list_aspect = ['stayingpower', 'texture', 'smell', 'price', 'others', 'colour', 'shipping', 'packing']

# NOTE(review): the output folder is hard-coded to 'distilbert_large' while
# the cells above select 'phobert_base' - confirm the folder matches the
# model that was actually trained.
report_dir = f"{root_path}/result/stage1_aspect_detection/distilbert_large/test"
# Create the folder first; to_csv does not create missing directories.
os.makedirs(report_dir, exist_ok=True)

for aspect in list_aspect:
    print(aspect)
    print(classification_report(true_dict[aspect], pred_dict[aspect]))
    # Same report again in dict form so it can be written as a table
    report = classification_report(true_dict[aspect], pred_dict[aspect], output_dict=True)
    df = pd.DataFrame(report).transpose()
    df.to_csv(f"{report_dir}/{aspect}.csv", index=True)
    print("*"*50)

## Save model

In [None]:
# Serialise the whole model object (not only its weights) to disk.
torch.save(obj=model, f='/content/output')

In [None]:
# Restore the saved model object. map_location places it on the device
# selected at the top of the notebook, so a file written on a GPU machine
# can also be loaded in a CPU-only session.
# NOTE(review): torch.load unpickles arbitrary objects - only load files
# from a trusted source. Recent PyTorch releases may additionally require
# weights_only=False to load a whole pickled model; confirm for the
# installed version.
model = torch.load('/content/output', map_location=device)