In [1]:
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/78/92cedda05552398352ed9784908b834ee32a0bd071a9b32de287327370b7/transformers-2.8.0-py3-none-any.whl (563kB)
[K     |████████████████████████████████| 573kB 2.8MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 13.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 19.9MB/s 
Collecting tokenizers==0.5.2
[?25l  Downloading https://files.pythonhosted.org/packages/d1/3f/73c881ea4723e43c1e9acf317cf407fab3a278daab3a69c98dcac511c04f/tokenizers-0.5.2-cp36-cp36m-manylinux1_x86_64.whl (3.7MB)
[K     |██████████

In [2]:
import os
import math

import torch
from torch.nn import BCEWithLogitsLoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, XLNetTokenizer, XLNetModel, XLNetLMHeadModel, XLNetConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [3]:
# Report CUDA availability. The device name is only queried when at least one
# GPU exists: torch.cuda.get_device_name(0) raises on a CPU-only runtime, which
# would stop the notebook even though the following cell supports a CPU fallback.
print("GPU Available: {}".format(torch.cuda.is_available()))
n_gpu = torch.cuda.device_count()
print("Number of GPU Available: {}".format(n_gpu))
if n_gpu > 0:
    print("GPU: {}".format(torch.cuda.get_device_name(0)))
else:
    print("GPU: none (running on CPU)")

GPU Available: True
Number of GPU Available: 1
GPU: Tesla P100-PCIE-16GB


In [0]:
# Prefer CUDA when it is usable; otherwise run on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [5]:
!unzip '/content/drive/My Drive/Deep Learning Data/Jigsaw Toxic Comment/train.csv.zip'
!unzip '/content/drive/My Drive/Deep Learning Data/Jigsaw Toxic Comment/test.csv.zip'

Archive:  /content/drive/My Drive/Deep Learning Data/Jigsaw Toxic Comment/train.csv.zip
  inflating: train.csv               
Archive:  /content/drive/My Drive/Deep Learning Data/Jigsaw Toxic Comment/test.csv.zip
  inflating: test.csv                


In [0]:
# Load the extracted train / test CSV files into DataFrames.
train_csv_path = '/content/train.csv'
test_csv_path = '/content/test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [7]:
# Preview the first rows of the training frame (id, comment text and the six label columns).
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
# Preview the first rows of the test frame (id and comment text only, no labels).
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [9]:
### Load the SentencePiece tokenizer that matches the pretrained XLNet checkpoint.
# 'xlnet-base-cased' was pretrained on cased text, so the input must NOT be
# lower-cased: lower-casing feeds the model text it was not pretrained on and
# discards casing information (e.g. all-caps shouting) from the comments.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case = False)

HBox(children=(IntProgress(value=0, description='Downloading', max=798011, style=ProgressStyle(description_wid…




In [0]:
# Raw comment strings of the training set as a NumPy array.
train_text_list = train['comment_text'].to_numpy()

In [0]:
## Tokenize the train dataset

MAX_SEQ_LEN = 128  # every comment is truncated / padded to this many tokens

# The six target columns of the training frame, selected by name so the label
# tensor does not silently change if the CSV column layout changes.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

input_ids = []
attention_masks = []

# Iterate over the raw strings directly instead of label-indexing the Series
# once per row.
for comment in tqdm(train_text_list, desc="Tokenizing"):

    encoded_sent = tokenizer.encode_plus(comment,
                                         add_special_tokens = True,  # add XLNet's <sep> and <cls> special tokens
                                         max_length = MAX_SEQ_LEN,   # maximum length of the encoded sequence
                                         pad_to_max_length = True,   # pad short sequences up to max_length
                                         return_attention_mask = True,
                                         return_token_type_ids = False)
    # NOTE: the former `return_tensor = 'pt'` argument was a misspelling of
    # `return_tensors` and had no effect; plain Python lists are collected
    # here and converted to tensors once, below.
    input_ids.append(encoded_sent['input_ids'])
    attention_masks.append(encoded_sent['attention_mask'])


## input and attention tensors, each of shape (num_examples, MAX_SEQ_LEN)
input_ids = torch.tensor(input_ids, dtype=torch.long)
attention_masks = torch.tensor(attention_masks, dtype=torch.long)

# label tensor of shape (num_examples, len(LABEL_COLUMNS)); float32 as required by BCEWithLogitsLoss
labels = torch.from_numpy(train[LABEL_COLUMNS].to_numpy()).type(torch.float32)




In [16]:
# Sanity check: one row of token ids per training comment, padded to the maximum length.
input_ids.shape

torch.Size([159571, 128])

In [17]:
# Sanity check: one row of labels per training comment, one column per label.
labels.shape

torch.Size([159571, 6])

In [0]:
from torch.utils.data import TensorDataset, DataLoader, random_split
from collections import namedtuple
import random

In [0]:
# Immutable bundle of fine-tuning hyper-parameters shared by the cells below.
AdaptationConfig = namedtuple(
    'AdaptationConfig',
    [
        "num_classes", "dropout", "initializer_range", "batch_size", "lr",
        "max_norm", "n_epochs", "n_warmup", "valid_set_prop",
        "gradient_accumulation_steps", "device", "log_dir", "dataset_cache",
    ],
)

adapt_args = AdaptationConfig(
    num_classes=6,                    # number of output labels
    dropout=0.1,
    initializer_range=0.02,
    batch_size=16,
    lr=6.5e-5,                        # peak learning rate
    max_norm=1.0,                     # gradient-clipping threshold
    n_epochs=3,
    n_warmup=10,                      # warm-up iterations for the LR schedule
    valid_set_prop=0.1,               # fraction of the data held out for validation
    gradient_accumulation_steps=1,
    device="cuda" if torch.cuda.is_available() else "cpu",
    log_dir="./",
    dataset_cache="./dataset_cache.bin",
)

In [0]:
## Splitting training and validation sets

SPLIT_SEED = 42  # fixed seed so the train/validation partition is identical on every run

dataset = TensorDataset(input_ids, attention_masks, labels)

train_size = int(len(dataset) * (1-adapt_args.valid_set_prop))

val_size = len(dataset) - train_size

# random_split draws its permutation from torch's global RNG; seeding it right
# before the split makes the held-out set reproducible across kernel restarts.
torch.manual_seed(SPLIT_SEED)
train_set, val_set = random_split(dataset, [train_size, val_size])

# Wrap both subsets in DataLoaders: training batches are reshuffled every epoch,
# validation batches keep a fixed order.

train_loader = DataLoader(train_set, batch_size = adapt_args.batch_size, shuffle = True, num_workers = 1)

val_loader = DataLoader(val_set, batch_size = adapt_args.batch_size, shuffle = False, num_workers = 1)

In [0]:
#### Load the pretrained XLNet encoder and its configuration; the
#### multi-label classification head is added on top of them further below.
PRETRAINED_NAME = 'xlnet-base-cased'

xlnet_config = XLNetConfig.from_pretrained(PRETRAINED_NAME)
xlnet_pretrained = XLNetModel.from_pretrained(PRETRAINED_NAME)

In [50]:
import torch.nn as nn
import torch.nn.functional as F

class multiclass_xlnet(nn.Module):
    """XLNet encoder followed by a linear multi-label classification head.

    The encoder's last hidden states are mean-pooled over the real
    (non-padding) tokens and projected to ``adapt_args.num_classes`` logits,
    one independent sigmoid output per label.

    Args:
        config: model configuration; only ``config.d_model`` is read here.
        adapt_args: fine-tuning settings; only ``adapt_args.num_classes`` is
            read here.
        transformer: optional encoder module. When omitted, the notebook-level
            ``xlnet_pretrained`` model is used (the original behaviour).
    """

    def __init__(self, config, adapt_args, transformer=None):
        super().__init__()

        self.config = config
        # Fall back to the pretrained encoder loaded earlier in the notebook.
        self.transformer = xlnet_pretrained if transformer is None else transformer
        self.fine_tune = adapt_args

        self.classification_head = nn.Linear(config.d_model, adapt_args.num_classes)
        # Initialise ONLY the new head. `self.apply(self.init_weights)` would
        # recurse into the encoder and overwrite every pretrained nn.Linear
        # (e.g. the feed-forward layers) with random values.
        self.init_weights(self.classification_head)

    def init_weights(self, module):
        """Xavier-normal weights and zero bias for ``nn.Linear`` modules; other modules are left untouched."""
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_normal_(module.weight)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

    def forward(self, inputs, attention_masks, clf_labels=None):
        """Run the encoder and the classification head.

        Args:
            inputs: token ids, (batch_size, seq_length).
            attention_masks: 1 for real tokens, 0 for padding,
                (batch_size, seq_length); may be ``None``.
            clf_labels: optional float targets, (batch_size, num_classes).

        Returns:
            Tuple ``(loss, output_labels, clf_hidden_state)``:
            ``loss`` is the BCE-with-logits loss (``None`` when no labels are
            given), ``output_labels`` are 0/1 predictions obtained by
            thresholding the sigmoid probabilities at 0.5, and
            ``clf_hidden_state`` are the raw logits.
        """
        t_output = self.transformer(input_ids = inputs,
                                    attention_mask = attention_masks,
                                    token_type_ids = None,
                                    )

        # Last hidden states: (batch_size, seq_length, d_model)
        hidden_states = t_output[0]

        if attention_masks is None:
            avg_hidden_state = hidden_states.mean(dim = 1)
        else:
            # Average over real tokens only; padded positions would otherwise
            # dominate the pooled vector of short comments.
            mask = attention_masks.unsqueeze(-1).to(hidden_states.dtype)
            token_counts = mask.sum(dim = 1).clamp(min = 1.0)  # guard against all-padding rows
            avg_hidden_state = (hidden_states * mask).sum(dim = 1) / token_counts

        # Raw logits: (batch_size, num_classes)
        clf_hidden_state = self.classification_head(avg_hidden_state)

        # Per-label probabilities in (0, 1), thresholded into hard 0/1 predictions.
        # Built from the logits themselves so the result lives on the same
        # device and no global device setting is needed.
        clf_probs = torch.sigmoid(clf_hidden_state)
        output_labels = (clf_probs > 0.5).to(clf_probs.dtype)

        loss = None
        if clf_labels is not None:
            # BCEWithLogitsLoss applies the sigmoid internally, so it takes the logits.
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(clf_hidden_state, clf_labels)

        return loss, output_labels, clf_hidden_state


    
# Build the classifier from the XLNet config and the fine-tuning settings.
model = multiclass_xlnet(xlnet_config, adapt_args)
# Move the model to the configured device; as the last expression of the cell
# this also displays the module structure.
model.to(adapt_args.device)

multiclass_xlnet(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (ff): XLNetFeedForward(
          (layer_norm): LayerNorm((768,), eps=1e-12, elementwi

In [0]:
## Smoke test: push one training batch through the model
sample_inputs, sample_attentions, sample_labels = next(iter(train_loader))

smoke_device = adapt_args.device
sample_loss, sample_output, sample_hiddens = model(
    inputs = sample_inputs.to(smoke_device),
    attention_masks = sample_attentions.to(smoke_device),
    clf_labels = sample_labels.to(smoke_device),
)

In [36]:
!pip install pytorch-ignite

import os
from ignite.engine import Engine, Events
from ignite.metrics import RunningAverage, Accuracy
from ignite.handlers import ModelCheckpoint
from ignite.contrib.handlers import CosineAnnealingScheduler, PiecewiseLinear, create_lr_scheduler_with_warmup, ProgressBar



In [0]:
## Training loop
# Adam over all model parameters (encoder and classification head), starting
# from the learning rate configured in adapt_args.
optimizer = torch.optim.Adam(model.parameters(), lr = adapt_args.lr)


##update
def update(engine, batch):
    """Process one training batch; used as the ignite trainer's step function.

    Gradients are accumulated over ``adapt_args.gradient_accumulation_steps``
    consecutive batches. The optimizer step, gradient clipping and gradient
    reset happen only on the last batch of each accumulation window.

    Args:
        engine: the running engine; ``engine.state.iteration`` is read to
            decide when to step.
        batch: sequence ``(input_ids, attention_mask, labels)`` of tensors.

    Returns:
        The (accumulation-scaled) loss of this batch as a Python float.
    """
    #training mode
    model.train()

    #unwrap data from batch
    inputs = batch[0].to(adapt_args.device)
    attention_mask = batch[1].to(adapt_args.device)
    labels = batch[2].to(adapt_args.device)

    ## 1 pass forward of the model
    loss, output_labels, hidden_states = model(inputs, attention_mask, labels)

    # Scale so the accumulated gradient is the mean over the window.
    loss = loss/adapt_args.gradient_accumulation_steps

    # NOTE: gradients are deliberately NOT zeroed before backward(); zeroing on
    # every call would discard the gradients of earlier batches in the window
    # and make gradient accumulation a no-op.
    loss.backward()

    if engine.state.iteration % adapt_args.gradient_accumulation_steps == 0:
        # Clip the fully accumulated gradient just before it is applied.
        torch.nn.utils.clip_grad_norm_(model.parameters(), adapt_args.max_norm)
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()

# Engine that calls `update` once per training batch.
trainer = Engine(update)






In [0]:
## Evaluation step: whatever this returns is what the attached metrics consume
def inference(engine, batch):
    """Score one validation batch without tracking gradients.

    Args:
        engine: the running engine (not used here).
        batch: sequence ``(input_ids, attention_mask, labels)`` of tensors.

    Returns:
        Tuple ``(predictions, labels)``: the model's second output and the
        labels moved to ``adapt_args.device``.
    """
    # switch to evaluation mode
    model.eval()

    target_device = adapt_args.device
    token_ids, mask, targets = (batch[position].to(target_device) for position in range(3))

    with torch.no_grad():
        _, predictions, _ = model(token_ids, mask, targets)

    return predictions, targets

# Engine that calls `inference` once per validation batch.
evaluator = Engine(inference)

In [74]:
### Attach the metric to the evaluator, and the evaluation to the trainer: evaluate on the val set after each epoch
Accuracy().attach(evaluator, 'Accuracy')

@trainer.on(Events.EPOCH_COMPLETED)
def log_validation_result(engine):
    """Run the evaluator over the validation loader and print the resulting error rate."""
    evaluator.run(val_loader)
    print(f"Validation Epoch: {engine.state.epoch} Error rate: {100*(1 - evaluator.state.metrics['Accuracy'])}")

# Learning rate schedule: linearly warm-up to lr and then to zero
scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (adapt_args.n_warmup, adapt_args.lr),
                                              (len(train_loader)*adapt_args.n_epochs, 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)


# Add progressbar with loss
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
ProgressBar(persist=True).attach(trainer, metric_names=['loss'])

# Save checkpoints and finetuning config
checkpoint_handler = ModelCheckpoint(adapt_args.log_dir, 'finetuning_checkpoint', save_interval=1, require_empty=False)
trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': model})
# Store the fine-tuning configuration (not the still-untrained weights, which is
# what was previously written under this name). A plain dict is saved so the file
# can be loaded without the AdaptationConfig class being defined.
torch.save(dict(adapt_args._asdict()), os.path.join(adapt_args.log_dir, 'fine_tuning_args.bin'))



In [0]:
# Start fine-tuning: iterate over train_loader for adapt_args.n_epochs epochs.
trainer.run(train_loader, max_epochs=adapt_args.n_epochs)

HBox(children=(IntProgress(value=0, max=8976), HTML(value='')))

In [0]:
# Training takes a very long time on Google Colab, so the model was not trained to completion here.
