In [None]:
!pip install mlflow

In [None]:
import os
from datetime import datetime
from time import time, sleep
import torch
import torch.nn as nn
import pickle
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import DistilBertForSequenceClassification
from time import time
from sklearn.metrics import mean_squared_error
from mlflow import log_params, log_metric, set_experiment, start_run, end_run
from datetime import datetime

In [None]:
# Seed PyTorch and NumPy with the same value so that runs are repeatable.
SEED = 53
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Run configuration: training CSV location and optimisation hyper-parameters.
data_path = '../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv'
batch_size, epochs, lr = 32, 5, 3e-05

# Use the first CUDA device when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Whole-second Unix timestamp taken when this cell runs; used later in the
# MLflow run name and in the checkpoint file names.
timestamp = int(datetime.now().timestamp())

In [None]:
# Set up MLflow experiment tracking

In [None]:
# Select the MLflow experiment, open a run named after the notebook timestamp
# and record the hyper-parameters of this run.
set_experiment("Baseline-Jigsaw")

run_name = 'baseline-{}'.format(timestamp)
start_run(run_name=run_name)

hyperparameters = {
    'batch_size': batch_size,
    'epochs': epochs,
    'learning_rate': lr,
}
log_params(hyperparameters)

In [None]:
# Preparing data

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(text, rate)`` pairs from a DataFrame.

    The text is read from the column at position 1 and the rate is the sum of
    all columns from position 2 onwards in the same row.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset`` and therefore shadows the imported PyTorch
    class: inheriting from the bare name would make the class subclass its own
    previous definition every time the cell is re-executed.

    NOTE(review): presumably column 1 holds the comment text and columns 2+
    hold the per-category toxicity flags -- confirm against the CSV schema.
    """

    def __init__(self, df):
        """Store ``df``; rows are looked up lazily in ``__getitem__``."""
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(text, rate)`` for the row at integer position ``idx``."""
        rate = self.df.iloc[idx, 2:].sum()
        return self.df.iloc[idx, 1], rate

In [None]:
def get_dataset(batch_size=32, source=data_path):
    """Read the CSV at ``source`` and build train/validation data loaders.

    The rows are split 80/20 with ``train_test_split``; each part is wrapped in
    ``Dataset`` and then in a ``DataLoader``.

    Args:
        batch_size: Number of samples per batch for both loaders.
        source: Path of the CSV file to read.

    Returns:
        Tuple ``(train_dataloader, valid_dataloader)``.
    """
    df = pd.read_csv(source)
    # No random_state is passed, so the split depends on NumPy's global RNG state.
    train, valid = train_test_split(df, test_size=0.2)

    # Pinned host memory only speeds up host-to-GPU copies; requesting it on a
    # CPU-only machine merely triggers a warning.
    pin_memory = torch.cuda.is_available()

    # Training batches are reshuffled every epoch; the trailing partial batch
    # is dropped, as before.
    train_dataloader = DataLoader(Dataset(df=train),
                                  batch_size=batch_size,
                                  shuffle=True,
                                  drop_last=True,
                                  num_workers=0,
                                  pin_memory=pin_memory)
    # Validation keeps a fixed order and keeps the last partial batch so that
    # every held-out row is evaluated.
    valid_dataloader = DataLoader(Dataset(df=valid),
                                  batch_size=batch_size,
                                  shuffle=False,
                                  drop_last=False,
                                  num_workers=0,
                                  pin_memory=pin_memory)

    return train_dataloader, valid_dataloader

In [None]:
# Build both loaders from the configured CSV path and batch size.
train_dataloader, valid_dataloader = get_dataset(batch_size=batch_size, source=data_path)

In [None]:
# Preparing model

In [None]:
class RegTransformer(nn.Module):
    """Regression head on top of a pretrained transformer encoder.

    The encoder output is reduced to one vector per sample, passed through
    ``Linear -> ReLU -> Dropout(0.2)`` and projected to a single value.

    Args:
        my_pretrained_model: Encoder module. It is called as
            ``encoder(input_ids, attention_mask=...)`` and the first element of
            its output is indexed as ``[:, 0]``.
    """

    def __init__(self, my_pretrained_model):
        super().__init__()
        self.pretrained = my_pretrained_model

        # Size the head from the encoder's config when it exposes a usable
        # ``hidden_size``; otherwise keep 768, the width that used to be
        # hard-coded here.
        encoder_config = getattr(my_pretrained_model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', None)
        if not isinstance(hidden_size, int) or hidden_size <= 0:
            hidden_size = 768

        self.pre_reg = nn.Linear(in_features=hidden_size, out_features=hidden_size, bias=True)
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.reg = nn.Linear(in_features=hidden_size, out_features=1, bias=True)

    def forward(self, input_ids=None, attention_mask=None,  token_type_ids=None, labels=None):
        """Return one regression value per sample.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
            token_type_ids: Accepted but not used.
            labels: Accepted but not used; the loss is computed by the caller.

        Returns:
            Output of the final linear layer, whose last dimension has size 1.
        """
        encoded = self.pretrained(input_ids, attention_mask=attention_mask)
        # First element of the encoder output, index 0 along the second axis
        # (presumably the hidden state of the first token -- confirm).
        x = encoded[0][:, 0]
        x = self.pre_reg(x)
        x = self.act(x)
        x = self.dropout(x)
        y = self.reg(x)

        return y

In [None]:
# Load the tokenizer and the encoder weights from the same DistilBERT checkpoint.
checkpoint = 'distilbert-base-cased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
pretrain_model = DistilBertModel.from_pretrained(checkpoint)

In [None]:
# Wrap the pretrained encoder in the regression head and move it to the target device.
model = RegTransformer(my_pretrained_model=pretrain_model).to(device)

In [None]:
# Training

In [None]:
# Mean-squared-error objective, optimised with AdamW; weight decay is
# explicitly switched off.
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=lr,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0,
)

In [None]:
# Fine-tune the model: one pass over the training loader per epoch, followed by
# a validation pass, MLflow logging and a checkpoint of the model weights.
torch.cuda.empty_cache()
step = 0  # global optimisation-step counter, used as the MLflow step index
total_step = len(train_dataloader)
for epoch in range(epochs):
    start_epoch = time()

    # The validation pass below switches the model to eval mode; switch back to
    # training mode here, otherwise dropout stays disabled from epoch 2 onwards.
    model.train()
    for i, (texts, labels) in enumerate(train_dataloader):
        start_step = time()

        # prepare data
        inputs = tokenizer(texts, return_tensors="pt",
                           padding=True, truncation=True)
        inputs = inputs.to(device)
        labels = labels.to(device)

        # forward pass (predictions flattened to one value per sample)
        outputs = model(**inputs)
        loss = criterion(outputs.view(-1), labels.float())

        # backward pass and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Mlflow recording
        step += 1
        log_metric('MSE_loss', loss.item(), step)

        print('Training - Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Time training step {:.3f}'.format(epoch + 1, epochs, i + 1, total_step, loss.item(), time()-start_step))

    # Evaluating model on validation data.
    # Squared errors are summed over samples (not averaged per batch) so the
    # RMSE stays exact even when the last batch is smaller than the others.
    squared_error_sum = 0.0
    n_valid = 0
    model.eval()
    with torch.no_grad():
        for i, (texts, labels) in enumerate(valid_dataloader):
            start_step = time()
            # prepare data
            inputs = tokenizer(texts, return_tensors="pt",
                               padding=True, truncation=True)
            inputs = inputs.to(device)

            # forward pass
            outputs = model(**inputs)

            # accumulate the squared error of this batch on the CPU
            predictions = outputs.view(-1).cpu()
            targets = labels.float().cpu()
            batch_squared_error = torch.sum((predictions - targets) ** 2).item()
            batch_size_seen = targets.numel()
            squared_error_sum += batch_squared_error
            n_valid += batch_size_seen

            # report the MSE of the current batch
            batch_mse = batch_squared_error / batch_size_seen if batch_size_seen else float('nan')
            print('Validating Step [{}/{}], MSE: {:.4f}, Time validation step {:.3f}'.format(i + 1, len(valid_dataloader), batch_mse, time()-start_step))

    # Guard against an empty validation loader instead of dividing by zero.
    mse = squared_error_sum / n_valid if n_valid else float('nan')
    rmse = float(np.sqrt(mse))

    # Mlflow recording
    log_metric('Validation_RMSE', rmse, step)

    torch.save(model.state_dict(), 'model-{}-{}'.format(epoch + 1, timestamp))

    # The loss shown here is the loss of the last training step of the epoch.
    print('EPOCH [{}/{}], Loss: {:.4f}, RMSE: {:.4f}, Time epoch {:.3f}'.format(epoch + 1, epochs, loss.item(), rmse, time()-start_epoch))


In [None]:
# Close the MLflow run that was opened with start_run() earlier in the notebook.
end_run()