In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [None]:
import pandas as pd
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split
from transformers import AutoTokenizer, DistilBertModel, AdamW, get_linear_schedule_with_warmup
import time
import datetime
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

### Preparing Environment

In [None]:
# Pick the compute device: the GPU when CUDA is available, the CPU otherwise.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
if use_cuda:
    print('Using GPU: ', torch.cuda.current_device())

# Seed every random number generator used by the notebook with the same value.
seed = 42
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(seed)

Using GPU:  0


### Useful functions

In [None]:
def get_dataset(df, tokenizer, mode='train'):
    """Tokenize the comments in ``df`` and bundle them with their labels.

    Args:
        df: DataFrame with a ``comment_text`` column. Every column from the
            third one onwards (``df.iloc[:, 2:]``) is taken as a label.
        tokenizer: Callable following the Hugging Face tokenizer call
            convention; it must return a mapping with ``input_ids`` and
            ``attention_mask`` tensors of shape ``(1, max_length)``.
        mode: ``'train'`` (the default) randomly splits the data 80/20 into a
            training and a validation subset. Any other value keeps all rows
            together, so evaluation data is not silently discarded.

    Returns:
        ``(train_dataset, val_dataset)`` when ``mode == 'train'``; otherwise
        ``(dataset, None)`` where ``dataset`` holds every row of ``df``.
        Each sample is ``(input_ids, attention_mask, labels)``.
    """
    max_length = 300  # number of tokens every sequence is padded/truncated to
    max_chars = 300   # characters kept from each comment before tokenizing

    sentences = df['comment_text']
    labels = torch.tensor(df.iloc[:, 2:].to_numpy(), dtype=torch.float32)

    in_T = []
    in_T_attn_masks = []
    for sentence in sentences:
        enc_sent_dict = tokenizer(
            sentence[:max_chars],
            max_length=max_length,
            add_special_tokens=True,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; truncation is requested explicitly so
            # every row is exactly `max_length` tokens long and the tokenizer
            # does not warn about an implicit truncation strategy.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        in_T.append(enc_sent_dict['input_ids'])
        in_T_attn_masks.append(enc_sent_dict['attention_mask'])

    # Each encoding is (1, max_length); stack them along the batch dimension.
    in_T = torch.cat(in_T, dim=0)
    in_T_attn_masks = torch.cat(in_T_attn_masks, dim=0)
    print('Text Input: ' , in_T.shape)
    print('Text Input Attention: ' , in_T_attn_masks.shape)
    print('Labels: ' , labels.shape)

    dataset = TensorDataset(in_T, in_T_attn_masks, labels)

    if mode != 'train':
        # Evaluation data must stay complete: no validation hold-out.
        return dataset, None

    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    return train_dataset, val_dataset

In [None]:
class MultiTaskClassifier(nn.Module):
    """DistilBERT encoder followed by a two-layer head with sigmoid outputs.

    The encoder's token states are mean-pooled into one vector per sequence,
    passed through ``Linear -> ReLU -> Dropout -> Linear`` and squashed with a
    sigmoid, giving one value in ``[0, 1]`` per label.

    Args:
        hidden_dim: Width of the hidden layer of the classification head.
        num_labels: Number of independent outputs produced per sequence.
    """

    def __init__(self, hidden_dim, num_labels):
        super(MultiTaskClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_labels = num_labels

        self.bertmodel = DistilBertModel.from_pretrained("distilbert-base-uncased")
        # 768 is the input width of the head; it has to match the size of the
        # encoder's `last_hidden_state` vectors.
        self.ffn1 = nn.Linear(768, hidden_dim)
        self.dp1 = nn.Dropout()
        self.ffn2 = nn.Linear(hidden_dim, num_labels)

    def forward(self, in_T, in_T_attn_masks):
        """Return per-label sigmoid outputs of shape ``(batch, num_labels)``.

        Args:
            in_T: Token ids, ``(batch, seq_len)``.
            in_T_attn_masks: Attention mask, ``(batch, seq_len)``; 1 marks a
                real token and 0 marks padding.
        """
        hidden_states = self.bertmodel(in_T, attention_mask=in_T_attn_masks).last_hidden_state

        # Average only over real tokens. A plain mean over dim=1 would also
        # average the padding positions, which dominate short inputs that were
        # padded to a fixed length.
        mask = in_T_attn_masks.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against all-zero masks
        x = (hidden_states * mask).sum(dim=1) / token_counts

        x = F.relu(self.ffn1(x))
        x = self.dp1(x)
        x = torch.sigmoid(self.ffn2(x))
        return x

In [None]:
def format_time(elapsed):
    """Format a duration in seconds as the string form of a ``timedelta``.

    The value is rounded to the nearest whole second first, so the result
    looks like ``'0:05:43'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

### Preparing dataset

In [None]:
# Tokenizer matching the DistilBERT checkpoint used by the model.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# `on_bad_lines='skip'` is the replacement for the deprecated
# `error_bad_lines=False`: malformed CSV rows are dropped instead of raising.
train_dataset, val_dataset = get_dataset(
    pd.read_csv('train.csv', engine='python', encoding='utf-8', on_bad_lines='skip'),
    tokenizer = tokenizer,
    mode = 'train'
)

batch_size = 8
# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size = batch_size,
    sampler = RandomSampler(train_dataset)
)
# Validation batches keep dataset order so predictions line up with the
# labels read directly from `val_dataset`.
val_dataloader = DataLoader(
    val_dataset,
    batch_size = batch_size,
    sampler = SequentialSampler(val_dataset)
)




  pd.read_csv('train.csv',engine='python', encoding='utf-8', error_bad_lines=False),
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


KeyboardInterrupt: ignored

In [None]:
# Mount Google Drive into the notebook's filesystem at /content/drive.
# `google.colab` is imported here, so this cell needs an environment that
# provides that package.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Preparing the model

In [None]:
# Classifier with a 100-unit hidden layer and 6 outputs, one per label column.
model = MultiTaskClassifier(100, 6).to(device)

# Use PyTorch's AdamW: the `AdamW` class shipped with transformers is
# deprecated in favour of it. `weight_decay=0.0` is set explicitly because
# torch's default is 0.01, and the optimizer should keep the same update rule
# as before (no weight decay).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)

# The model already applies a sigmoid, so the loss works on probabilities.
criterion = nn.BCELoss()

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Training & Validation

In [None]:
# Fine-tune the classifier for a fixed number of epochs. After every epoch the
# model is evaluated on the validation set, per-label metrics are printed, the
# statistics are recorded and a checkpoint is written to disk.
epochs = 6
total_steps = len(train_dataloader) * epochs
# Linear decay of the learning rate to zero over all optimizer steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps = 0,
                                           num_training_steps = total_steps)


training_stats = []
total_t0 = time.time()

# Lowest validation loss seen so far and where it was reached.
best_val_loss = 1e8
best_epoch = None
best_model_path = None

# Validation targets in dataset order; `val_dataloader` must iterate in the
# same order for the metrics below to be meaningful.
true_labels = val_dataset[:][2].numpy()
num_labels = true_labels.shape[1]
threshold = 0.25  # a probability at or above this counts as a positive prediction

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        # Periodic progress report with the running mean loss of this epoch.
        if step % 1000 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}. Loss: {:.5f}'.format(step, len(train_dataloader), elapsed, total_train_loss/step))

        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)

        model.zero_grad()

        # The model returns sigmoid probabilities, one per label.
        probs = model(b_in_T, b_in_T_attn_masks)
        loss = criterion(probs, b_labels)

        total_train_loss += loss.item()
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    # Collect per-batch outputs and concatenate once after the loop; growing an
    # array inside the loop copies all earlier rows on every iteration.
    batch_probs = []

    # Evaluate data for one epoch
    for batch in val_dataloader:

        b_in_T            = batch[0].to(device)
        b_in_T_attn_masks = batch[1].to(device)
        b_labels          = batch[2].to(device)

        with torch.no_grad():
            probs = model(b_in_T, b_in_T_attn_masks)
            loss = criterion(probs, b_labels)

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move the predictions to the CPU.
        batch_probs.append(probs.detach().cpu().numpy())

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(val_dataloader)

    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    # Binarise the probabilities for all labels at once.
    pred_labels = (np.concatenate(batch_probs, axis=0) >= threshold).astype(int)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Per-label metrics, computed once and reused for printing and for the
    # recorded statistics. `zero_division=0` yields the same scores as the
    # default but without a warning when a label is never predicted.
    label_accuracy = [
        accuracy_score(true_labels[:,i], pred_labels[:,i]) for i in range(num_labels)
    ]
    label_macro_f1 = [
        f1_score(true_labels[:,i], pred_labels[:,i], average='macro', zero_division=0)
        for i in range(num_labels)
    ]
    label_weighted_f1 = [
        f1_score(true_labels[:,i], pred_labels[:,i], average='weighted', zero_division=0)
        for i in range(num_labels)
    ]

    # Report the final accuracy, f1-score for this validation run.
    for value in label_accuracy:
        print("  Accuracy: {0:.2f}".format(value))

    for value in label_macro_f1:
        print("  Macro F1-score: {0:.2f}".format(value))

    for value in label_weighted_f1:
        print("  Weighted F1-score: {0:.2f}".format(value))

    print('Classification Report:')
    for i in range(num_labels):
        print(classification_report(true_labels[:,i], pred_labels[:,i], zero_division=0))

    print('Confusion Matrix:')
    for i in range(num_labels):
        print(confusion_matrix(true_labels[:,i], pred_labels[:,i]))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'training_loss': avg_train_loss,
            'val_loss': avg_val_loss,
            'val_accuracy': np.mean(label_accuracy),
            'val_macro_f1': np.mean(label_macro_f1),
            'val_weighted_f1': np.mean(label_weighted_f1),
            'training_time': training_time,
            'val_tim': validation_time
        }
    )

    # One checkpoint per epoch, numbered from 0.
    model_path = 'model_state_dict_'+str(epoch_i)+'.pt'
    torch.save(model.state_dict(), model_path)

    # Remember which epoch generalised best, so the matching checkpoint can be
    # picked afterwards instead of defaulting to the last one.
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_epoch = epoch_i + 1
        best_model_path = model_path

print("")
stats_path = 'training_stats_pickle'
pd.DataFrame(training_stats).to_pickle(stats_path)

print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
if best_epoch is not None:
    print("Best validation loss {:.4f} at epoch {:} ({:})".format(best_val_loss, best_epoch, best_model_path))


Training...
  Batch 1,000  of  1,436.    Elapsed: 0:03:58. Loss: 0.04602

  Average training loss: 0.05
  Training epcoh took: 0:05:43

Running Validation...
  Validation Loss: 0.06
  Validation took: 0:00:30
  Accuracy: 0.95
  Accuracy: 0.99
  Accuracy: 0.98
  Accuracy: 1.00
  Accuracy: 0.96
  Accuracy: 0.99
  Macro F1-score: 0.87
  Macro F1-score: 0.72
  Macro F1-score: 0.90
  Macro F1-score: 0.50
  Macro F1-score: 0.84
  Macro F1-score: 0.63
  Weighted F1-score: 0.95
  Weighted F1-score: 0.98
  Weighted F1-score: 0.98
  Weighted F1-score: 0.99
  Weighted F1-score: 0.96
  Weighted F1-score: 0.99
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97      2566
         1.0       0.72      0.82      0.77       306

    accuracy                           0.95      2872
   macro avg       0.85      0.89      0.87      2872
weighted avg       0.95      0.95      0.95      2872

              precision    recall  f1-score 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[[2623   93]
 [  19  137]]
[[2834    2]
 [  30    6]]

Training...
  Batch 1,000  of  1,436.    Elapsed: 0:04:02. Loss: 0.03690

  Average training loss: 0.04
  Training epcoh took: 0:05:48

Running Validation...
  Validation Loss: 0.07
  Validation took: 0:00:30
  Accuracy: 0.95
  Accuracy: 0.99
  Accuracy: 0.98
  Accuracy: 1.00
  Accuracy: 0.97
  Accuracy: 0.99
  Macro F1-score: 0.87
  Macro F1-score: 0.69
  Macro F1-score: 0.91
  Macro F1-score: 0.50
  Macro F1-score: 0.86
  Macro F1-score: 0.66
  Weighted F1-score: 0.95
  Weighted F1-score: 0.99
  Weighted F1-score: 0.98
  Weighted F1-score: 0.99
  Weighted F1-score: 0.97
  Weighted F1-score: 0.99
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.97      2566
         1.0       0.74      0.82      0.78       306

    accuracy                           0.95      2872
   macro avg       0.86      0.89      0.87      2872
weighted avg       0.95      0.95      0.95  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Training...
  Batch 1,000  of  1,436.    Elapsed: 0:04:02. Loss: 0.02868

  Average training loss: 0.03
  Training epcoh took: 0:05:48

Running Validation...
  Validation Loss: 0.08
  Validation took: 0:00:30
  Accuracy: 0.96
  Accuracy: 0.98
  Accuracy: 0.98
  Accuracy: 1.00
  Accuracy: 0.97
  Accuracy: 0.99
  Macro F1-score: 0.89
  Macro F1-score: 0.73
  Macro F1-score: 0.90
  Macro F1-score: 0.57
  Macro F1-score: 0.86
  Macro F1-score: 0.71
  Weighted F1-score: 0.96
  Weighted F1-score: 0.98
  Weighted F1-score: 0.98
  Weighted F1-score: 0.99
  Weighted F1-score: 0.97
  Weighted F1-score: 0.99
Classification Report:
              precision    recall  f1-score   support

         0.0       0.98      0.98      0.98      2566
         1.0       0.80      0.80      0.80       306

    accuracy                           0.96      2872
   macro avg       0.89      0.89      0.89      2872
weighted avg       0.96      0.96      0.96      2872

              precision    recall  f1-score 

In [None]:
# Read the raw CSV files. `on_bad_lines='skip'` is the replacement for the
# deprecated `error_bad_lines=False`: malformed rows are dropped instead of
# raising an error.
test_dfff = pd.read_csv('test.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
test_labels_dfff = pd.read_csv('test_labels.csv', engine='python', encoding='utf-8', on_bad_lines='skip')
train_dfff = pd.read_csv('train.csv', engine='python', encoding='utf-8', on_bad_lines='skip')



  test_dfff=pd.read_csv('test.csv',engine='python', encoding='utf-8', error_bad_lines=False)


  test_labels_dfff=pd.read_csv('test_labels.csv',engine='python', encoding='utf-8', error_bad_lines=False)


  train_dfff=pd.read_csv('train.csv',engine='python', encoding='utf-8', error_bad_lines=False)


In [None]:
# Join the test comments with their labels on `id` and tokenize them.
# NOTE(review): if `test_labels.csv` marks unscored rows with -1, those rows
# should be filtered out before evaluation — confirm against the data.
test_dataset, _ = get_dataset(
    pd.merge(test_dfff, test_labels_dfff, on='id'),
    tokenizer = tokenizer,
    mode = 'test'
)

batch_size = 8
# Iterate in dataset order: with a random sampler the predictions gathered
# batch by batch do not line up with labels read as `test_dataset[:][2]`.
test_dataloader = DataLoader(
    test_dataset,
    batch_size = batch_size,
    sampler = SequentialSampler(test_dataset)
)



Text Input:  torch.Size([153164, 300])
Text Input Attention:  torch.Size([153164, 300])
Labels:  torch.Size([153164, 6])


In [None]:
# Evaluate the model on the test set.
# All accumulators are initialised here so the cell does not depend on values
# left behind by the validation loop (loss total, prediction array, timer).
model.eval()  # disable dropout for inference
t0 = time.time()
total_test_loss = 0
batch_probs = []
batch_labels = []

for batch in test_dataloader:

    b_in_T            = batch[0].to(device)
    b_in_T_attn_masks = batch[1].to(device)
    b_labels          = batch[2].to(device)
    with torch.no_grad():
        probs = model(b_in_T, b_in_T_attn_masks)
        loss = criterion(probs, b_labels)
    # Accumulate the test loss.
    total_test_loss += loss.item()
    # Keep predictions and targets of the same batch together, so they stay
    # aligned whatever order the dataloader yields samples in.
    batch_probs.append(probs.detach().cpu().numpy())
    batch_labels.append(b_labels.detach().cpu().numpy())

# Calculate the average loss over all of the batches.
avg_test_loss = total_test_loss / len(test_dataloader)
# Measure how long the test run took.
test_time = format_time(time.time() - t0)

true_labels = np.concatenate(batch_labels, axis=0)
# A probability of at least 0.25 counts as a positive prediction.
pred_labels = (np.concatenate(batch_probs, axis=0) >= 0.25).astype(int)
num_labels = true_labels.shape[1]

print("  Test Loss: {0:.2f}".format(avg_test_loss))
print("  Test took: {:}".format(test_time))
# Report the final accuracy, f1-score for this test run.
# `zero_division=0` yields the same scores as the default but without a
# warning when a label is never predicted.
for i in range(num_labels):
    print("  Accuracy: {0:.2f}".format(accuracy_score(true_labels[:,i], pred_labels[:,i])))
for i in range(num_labels):
    print("  Macro F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='macro', zero_division=0)))
for i in range(num_labels):
    print("  Weighted F1-score: {0:.2f}".format(f1_score(true_labels[:,i], pred_labels[:,i], average='weighted', zero_division=0)))
print('Classification Report:')
for i in range(num_labels):
    print(classification_report(true_labels[:,i], pred_labels[:,i], zero_division=0))
print('Confusion Matrix:')
for i in range(num_labels):
    print(confusion_matrix(true_labels[:,i], pred_labels[:,i]))