In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/jigsaw-translated/test_en.csv
/kaggle/input/jigsaw-translated/validation_en.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/test-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv
/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train-processed-seqlen128.csv


In [2]:
! pip install transformers



In [3]:
device_type = 'gpu' # or gpu, or cpu or tpu
translated = True

In [4]:
import tensorflow
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
from tqdm import tqdm, trange
import pandas as pd
import numpy as np
import io
import os
import time
import random
import matplotlib.pyplot as plt
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix, roc_auc_score
import matplotlib
import matplotlib.pyplot as plt

from transformers import *

Using TensorFlow backend.


In [5]:
if device_type == 'tpu':
    !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
    !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev
    import torch_xla
    import torch_xla.core.xla_model as xm

In [6]:
def seed_everything(seed):
    """
    Seeds basic parameters for reproductibility of results
    
    Arguments:
        seed {int} -- Number of the seed
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

if device_type == 'gpu':
    ## Set seed of randomization and working device
    manual_seed = 77
    torch.manual_seed(manual_seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    n_gpu = torch.cuda.device_count()
    if n_gpu > 0:
        torch.cuda.manual_seed(manual_seed)
        print(torch.cuda.get_device_name(0))
elif device_type == 'tpu':
    seed = 777
    seed_everything(seed)
    device = xm.xla_device()
    print(device)

cuda
Tesla P100-PCIE-16GB


In [7]:
MODELS = [(BertModel,       BertTokenizer,       'bert-base-multilingual-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-large'),
         ]

### DATA PREPROCESSING

In [8]:
data_folder = "/kaggle/input/jigsaw-multilingual-toxic-comment-classification"
train_toxic_comment_path = os.path.join(data_folder, 'jigsaw-toxic-comment-train.csv')
# train_unintended_bias_path = os.path.join(data_folder, 'jigsaw-unintended-bias-train.csv')
if translated:
    valid_path = os.path.join('/kaggle/input/jigsaw-translated', 'validation_en.csv')
    test_path = os.path.join('/kaggle/input/jigsaw-translated', 'test_en.csv')
else:
    valid_path = os.path.join(data_folder, 'validation.csv')
    test_path = os.path.join(data_folder, 'test.csv')

train_toxic_comment_bert_path = os.path.join(data_folder, 'jigsaw-toxic-comment-train-processed-seqlen128.csv') 
valid_bert_path = os.path.join(data_folder, 'validation-processed-seqlen128.csv')
test_bert_path = os.path.join(data_folder, 'test-processed-seqlen128.csv')

In [9]:
def balance_data(data, ratio = 0.2):
    '''
    if we need to balance our data...
    '''
    inputs, labels = data
    downsampled_inputs = []
    downsampled_labels = []
    for i in range(len(inputs)):
        if labels[i] == 1.0 or np.random.rand() < ratio:
            downsampled_inputs.append(inputs[i])
            downsampled_labels.append(labels[i])

    return downsampled_inputs, downsampled_labels

In [10]:
def data_prepare(file_path, tokenizer, model_type, downsample_ratio = 0.2, max_len = 128, mode = 'train'):
    '''
    file_path: the path to input file. 
                In train mode, the input must be a tsv file that includes two columns where the first is text, and second column is label.
                The first row must be header of columns.

                In predict mode, the input must be a tsv file that includes only one column where the first is text.
                The first row must be header of column.

    tokenizer: BERT tokenizer
    max_len: maximal length of input sequence
    mode: train or predict
    '''
    # if we are in train mode, we will load two columns (i.e., text and label).
    if mode == 'train':
        # Use pandas to load dataset
        df = pd.read_csv(file_path, header=0, 
                         names=['id', 'comment_text','toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])
        labels = df.toxic.values
        # Create sentence and label lists
        content = df.comment_text.values
#         content, labels = balance_data((content, labels), downsample_ratio)
        content, labels = content[:80000], labels[:80000]
        
        # Convert data into torch tensors
        labels = torch.tensor(labels, dtype=torch.float)

    elif mode == 'valid':
        if translated:
            df = pd.read_csv(file_path,header=0, 
                         names=['id', 'comment_text', 'lang', 'toxic', 'comment_text_en'])
            content = df.comment_text_en.values
        else:
            df = pd.read_csv(file_path,header=0, 
                            names=['id', 'comment_text', 'lang', 'toxic'])
            content = df.comment_text.values
        
        labels = df.toxic.values
        # Convert data into torch tensors
        labels = torch.tensor(labels, dtype=torch.float)
        

    # if we are in predict mode, we will load one column (i.e., text).
    elif mode == 'predict':
        if translated:
            df = pd.read_csv(file_path,header=0, names=['id', 'content', 'lang', 'content_en'])
            content = df.content_en.values
        else:
            df = pd.read_csv(file_path,header=0, names=['id', 'content', 'lang'])
            content = df.content.values

        # create placeholder
        labels = []
    else:
        print("the type of mode should be either 'train', 'valid' or 'predict'. ")
        return
        

    # We need to add a special token at the beginning for BERT/XLM-Roberta to work properly.
    # Import the tokenizer, used to convert our text into tokens that correspond to BERT's/XLM-Roberta's vocabulary.
    tokenized_texts = [tokenizer.tokenize(text) for text in content]
    
    if model_type == 'BERT':
        cls_token = '[CLS]'
        pad_token = '[PAD]'
        eos_token = '[SEP]'
    if model_type == 'XLMROBERTA':
        cls_token = '<s>'
        pad_token = '<pad>'
        eos_token = '</s>'

    # BERT: [CLS] + tokens + [SEP] + paddings
    # XLM-Roberta: [CLS] + prefix_space + tokens + [SEP] + paddings
    tokenized_texts = [[cls_token] + text for text in tokenized_texts]
    
    # if the sequence is longer the maximal length, we truncate it to the pre-defined maximal length
    tokenized_texts = [ text[:max_len+1] for text in tokenized_texts]

    # We also need to add a special token at the end.
    tokenized_texts = [ text+[eos_token] for text in tokenized_texts]
    
    # Use the tokenizer to convert the tokens to their index numbers in the vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

    # Pad our input seqeunce to the fixed length (i.e., max_len) with index of [PAD] token
    pad_ind = tokenizer.convert_tokens_to_ids([pad_token])[0]
    input_ids = pad_sequences(input_ids, maxlen=max_len+2, dtype="long", truncating="post", padding="post", value=pad_ind)

    # Create attention masks
    attention_masks = []

    # Create a mask of 1s for each token followed by 0s for pad tokens
    for seq in input_ids:
        seq_mask = [float(i != pad_ind) for i in seq]
        attention_masks.append(seq_mask)

    # Convert all of our data into torch tensors, the required datatype for our model
    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_masks)

    # Save it to csv file, if necessary
    # inputs = [tuple(input_id) for input_id in input_ids]
    # masks = [tuple(attention_mask) for attention_mask in attention_masks]
    # df['input_word_ids'] = inputs
    # df['input_mask'] = masks
    # file_name = file_path.split('/')[-1]
    # df.to_csv(os.path.join(data_folder, file_name[:-4]+'-processed-seqlen128-xmlroberta.csv'), index=False)

    return inputs, labels, masks

### LOAD DATA

In [11]:
def load_data(model_path, model_tokenizer, file_path, downsample_ratio = 0.2, batch_size = 32, max_len = 128, mode = 'train'):
    '''
    load data and split into batches
    '''
    # tokenizer from pre-trained model
    tokenizer = model_tokenizer.from_pretrained(model_path)
    
    if model_path.startswith('xlm-roberta'):
        inputs, labels, masks = data_prepare(file_path, tokenizer, downsample_ratio = downsample_ratio, model_type='XLMROBERTA', max_len = max_len, mode=mode)
    elif model_path.startswith('bert'):
        inputs, labels, masks = data_prepare(file_path, tokenizer, downsample_ratio = downsample_ratio, model_type='BERT', max_len=max_len, mode=mode)
    
    print("Data Size:", len(inputs))
    if mode == 'train':
        data = TensorDataset(inputs, masks, labels)
        dataloader = DataLoader(data, 
                                sampler = RandomSampler(data), # Select batches randomly
                                batch_size=batch_size)
    elif mode == 'valid':
        data = TensorDataset(inputs, masks, labels)
        dataloader = DataLoader(data, 
                                sampler = SequentialSampler(data), # Select batches randomly
                                batch_size=batch_size)
    elif mode == 'predict':
        data = TensorDataset(inputs, masks)
        dataloader = DataLoader(data, 
                                sampler = SequentialSampler(data), # Select batches randomly
                                batch_size=batch_size)
    else:
        print("the type of mode should be either 'train', 'valid' or 'predict'. ")
        return
    return dataloader

### MODEL CLASS

In [12]:
# BCELOSS_VERSION
class Toxic_cls(nn.Module):
    def __init__(self, model_path, model, hidden_size):
        super(Toxic_cls, self).__init__()
        self.model_path = model_path
        self.hidden_size = hidden_size
        self.model = model.from_pretrained(model_path, 
                                           output_hidden_states=False, 
#                                            output_attentions=True, 
                                           num_labels=1
                                          )
        self.label_num = 1
        self.fc = nn.Linear(self.hidden_size, self.label_num)
    
    def forward(self, inputs, masks):
        _, pooler_output = self.model(input_ids=inputs, attention_mask = masks)
        fc_output = self.fc(pooler_output)

        return fc_output

# CROSSENTROPY_VERSION
# class Toxic_cls(nn.Module):
#     def __init__(self, model_path, model, hidden_size):
#         super(Toxic_cls, self).__init__()
#         self.model_path = model_path
#         self.hidden_size = hidden_size
#         self.model = model.from_pretrained(model_path, output_hidden_states=True, output_attentions=True)
#         self.label_num = 2
#         self.fc = nn.Linear(self.hidden_size, self.label_num)
    
#     def forward(self, inputs, masks):
#         last_hidden_state, pooler_output, hidden_states, attentions = self.model(input_ids=inputs, attention_mask = masks)
#         fc_output = self.fc(pooler_output)

#         return fc_output, attentions

### HELPER FUNCTIONS (TRAIN, EVALUATION...)

In [22]:
def train(model, iterator, optimizer, scheduler, criterion):
    
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        # Add batch to GPU
        # batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        input_ids, input_mask, labels = batch
        labels = labels.unsqueeze(1).type(torch.FloatTensor)
        outputs = model(input_ids.to(device), input_mask.to(device))
        
        optimizer.zero_grad()
        
        loss = criterion(outputs, labels.to(device))
        # delete used variables to free GPU memory
        del batch, input_ids, input_mask, labels
        loss.backward()
#         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        if device_type == 'tpu':
            xm.optimizer_step(optimizer, barrier=True)
        else:
            optimizer.step()
        scheduler.step()
        epoch_loss += loss.cpu().item()
    
    # free GPU memory
    if device == 'cuda':
        torch.cuda.empty_cache()

    return epoch_loss / len(iterator)

In [23]:
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    all_pred=[]
    all_label = []
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            # Add batch to GPU
            # batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader

            input_ids, input_mask, labels = batch
            labels = labels.unsqueeze(1).type(torch.FloatTensor)
            outputs = model(input_ids.to(device), input_mask.to(device))
            
            loss = criterion(outputs, labels.to(device))

            # delete used variables to free GPU memory
            del batch, input_ids, input_mask
            epoch_loss += loss.cpu().item()

            # identify the predicted class for each example in the batch
            # probabilities, predicted = torch.max(outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            gold = labels.cpu().detach().numpy().tolist()
            predicted = outputs.cpu().detach().numpy().tolist()
            all_pred.extend(predicted)
            all_label.extend(gold)
    
#     accuracy = accuracy_score(all_label, all_pred)
#     f1score = f1_score(all_label, all_pred, average='macro') 
#     roc_auc = roc_auc_score(all_label, all_pred, average = 'macro')
    roc_auc = roc_auc_score(np.array(all_label) >= 0.5, all_pred, average = 'macro')
    return epoch_loss / len(iterator), 0, 0, roc_auc

In [15]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

## MAIN FUNCTION

In [16]:
batch_size = 32
lr = 1e-5
max_grad_norm = 1.0
epochs = 2
warmup_proportion = 0.1
downsample_ratio = 0.1
hidden_size = 768

In [17]:
# Xlm-Roberta
used_model, used_tokenizer, model_path = MODELS[2]

# Xlm-Roberta-large
# used_model, used_tokenizer, model_path = MODELS[3]

# Bert
# used_model, used_tokenizer, model_path = MODELS[0]

train_dataloader = load_data(model_path, used_tokenizer, train_toxic_comment_path, downsample_ratio = downsample_ratio,  batch_size = batch_size, mode='train')
valid_dataloader = load_data(model_path, used_tokenizer, valid_path, downsample_ratio = downsample_ratio, batch_size=batch_size, mode = 'valid')


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…


Data Size: 80000
Data Size: 8000


In [18]:
torch.cuda.empty_cache()

In [19]:
num_training_steps  = len(train_dataloader) * epochs
num_warmup_steps = num_training_steps * warmup_proportion


model = Toxic_cls(model_path, used_model, hidden_size).to(device)

### In Transformers, optimizer and schedules are instantiated like this:
# Note: AdamW is a class from the huggingface library
# the 'W' stands for 'Weight Decay"
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)
# schedules
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler

# We use nn.CrossEntropyLoss() as our loss function. 
# criterion = nn.CrossEntropyLoss()
criterion = nn.BCEWithLogitsLoss()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=737.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1115590446.0, style=ProgressStyle(descr…




In [20]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 278,044,417 trainable parameters


In [24]:
# train model
# train model
loss_list = []
acc_list = []
best_auc = 0
for epoch in trange(epochs, desc="Epoch"):
    start_time = time.time()
    train_loss = train(model, train_dataloader, optimizer, scheduler, criterion)  
    val_loss, _, _, val_auc = evaluate(model, valid_dataloader, criterion)
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Create checkpoint at end of each epoch
    state = {
        'epoch': epoch,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict()
        }
    if best_auc < val_auc:
        best_auc = val_auc
        torch.save(state, "./BEST_MODEL.pt")
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print('\tTrain Loss: {:.4f}, Validation AUC: {:.4f}'.format(train_loss, val_auc))

Epoch:  50%|█████     | 1/2 [18:16<18:16, 1096.90s/it]

Epoch: 01 | Time: 18m 8s
	Train Loss: 0.1293, Validation AUC: 0.9211


Epoch: 100%|██████████| 2/2 [36:41<00:00, 1100.51s/it]

Epoch: 02 | Time: 18m 8s
	Train Loss: 0.0746, Validation AUC: 0.9235





### PREDICT TEST

In [26]:
batch_size = 32
# used_model, used_tokenizer, model_path = MODELS[0]

# lr = 2e-5
# max_grad_norm = 1.0
# hidden_size = 768

model_best = Toxic_cls(model_path, used_model, hidden_size).to(device)
model_best.load_state_dict(torch.load('./BEST_MODEL.pt')['state_dict'])

test_dataloader = load_data(model_path, used_tokenizer, test_path, batch_size=batch_size, mode='predict')

all_pred=[]
    
with torch.no_grad():

    for i, batch in enumerate(test_dataloader):

        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        input_ids, input_mask= batch
        outputs = model_best(input_ids, input_mask)
        # delete used variables to free GPU memory
        del batch, input_ids, input_mask
        # identify the predicted class for each example in the batch
#         probabilities, predicted = torch.max(outputs.cpu().data, 1)
        predicted = outputs.cpu().detach().numpy().tolist()
        # put all the true labels and predictions to two lists
        all_pred.extend(predicted)

Data Size: 63812


In [28]:
sub = pd.read_csv(os.path.join(data_folder, 'sample_submission.csv'))
sub['toxic'] = [1.0 if float(t[0]) >= 0 else 0.0 for t in all_pred]
sub.to_csv('./submission.csv', index=False)
sub.head()

Unnamed: 0,id,toxic
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,0.0
