In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

##https://www.kaggle.com/liananapalkova/simply-about-word2vec

/kaggle/input/glove840b300dtxt/glove.840B.300d.txt
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv


In [2]:
import time
import re
import sys
import copy
import torch 
import numpy as np
from scipy.sparse import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import pyarrow as pa

from keras.preprocessing import text, sequence

import torch.nn as nn
from torch.optim import lr_scheduler
import torch.nn.functional as F
from torchvision import datasets, models, transforms
from torch.utils.data import Dataset,DataLoader


import pandas as pd

if not sys.warnoptions:    
    import warnings
    warnings.simplefilter("ignore")

Using TensorFlow backend.


In [3]:
sample_submission = pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv')
test_labels = pd.read_csv('//kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv')
train = pd.read_csv('//kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv')
test = pd.read_csv('//kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv')

In [4]:
#train['comment_text'][train['comment_text'].apply(len)==5000]

In [5]:
#for i in train[156437:156438]['comment_text']: print(i)

In [6]:
def clean(x):
    x = re.sub(r'[^a-zA-Z0-9]',' ',x)
    x = ' '.join(x.split())
    x.lower()
    return x

In [7]:
test['comment_text'] = test['comment_text'].apply(clean)
train['comment_text'] = train['comment_text'].apply(clean)

In [8]:
## Feature engineering to prepare inputs for BERT....

Y = train[['toxic','severe_toxic','obscene','threat','insult','identity_hate']].values
X = train['comment_text'].values


X_train, X_test, y_train, y_test = train_test_split( X, Y, test_size=0.33, random_state=42)

In [9]:
print('train_x shape is {}' .format({X_train.shape}))
print('test_x shape is {}' .format({X_test.shape}))
print('train_y shape is {}' .format({y_train.shape}))

train_x shape is {(106912,)}
test_x shape is {(52659,)}
train_y shape is {(106912, 6)}


In [10]:
max_features=100000 #how many unique words to use (i.e num rows in embedding vector)
maxlen=150    # max number of words in a page content to use
embed_size=300  # how big is each word vector

In [11]:
tok=text.Tokenizer(num_words=max_features,lower=True)

tok.fit_on_texts(list(X_train))


X_train=tok.texts_to_sequences(X_train)
X_test=tok.texts_to_sequences(X_test)

In [12]:
X_train=sequence.pad_sequences(X_train,maxlen=maxlen)
X_test=sequence.pad_sequences(X_test,maxlen=maxlen)

In [13]:
EMBEDDING_FILE = '/kaggle/input/glove840b300dtxt/glove.840B.300d.txt'

In [14]:
embeddings_index = {}
with open(EMBEDDING_FILE,encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [15]:
word_index = tok.word_index
#prepare embedding matrix
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for word, i in word_index.items():
    if i >= max_features:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [16]:
np.shape(embedding_matrix)

(100000, 300)

In [17]:
class text_dataset(Dataset):
    def __init__(self,x,y, transform=None):
        
        self.x = x
        self.y = y
        self.transform = transform
        
    def __getitem__(self,index):
        sentiments = self.y[index]
        text = torch.LongTensor(self.x[index])               
        return text, sentiments
    
    def __len__(self):
        return len(self.x)

In [18]:
batch_size = 32


training_dataset = text_dataset(X_train,y_train)

test_dataset = text_dataset(X_test,y_test)

dataloaders_dict = {'train': torch.utils.data.DataLoader(training_dataset, batch_size=batch_size, shuffle=False),
                   'val':torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
                   }
dataset_sizes = {'train':len(X_train),
                'val':len(X_test)}

In [19]:
class Attention(nn.Module):
    def __init__(self,feature_dim,step_dim,bias=True,**kwargs):
        super(Attention,self).__init__(**kwargs)
        self.supports_masking = True
        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0 
        
        weight = torch.zeros(feature_dim,1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self,x,mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim
        
        eij = torch.mm(x.contiguous().view(-1,feature_dim),
                      self.weight).view(-1,step_dim)
        if self.bias:
            eij = eij + self.b 
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask
        a = a / (torch.sum(a, 1 , keepdim = True) + 1e-10)
        
        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input,1)

In [38]:
class CNN_LSTM_ATT(nn.Module):
    
    def __init__(self):
        super(CNN_LSTM_ATT, self).__init__()
        filter_sizes = [1,2,3,5]
        num_filters = 36
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
        self.lstm1 = nn.LSTM(593, 128 , bidirectional = True, batch_first  = True)
        self.lstm2 = nn.GRU(128*2,64,bidirectional=True,batch_first = True)
        self.attention_layer = Attention(128, num_filters)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(128,128)
        self.fc2 = nn.Linear(128,6)
        self.relu = nn.ReLU()
     
    def forward(self, x):
        x = self.embedding(x)  
        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
        x = torch.cat(x, 2) 
        #for i in x: print('shape after conv {}'.format(i.shape))
        h_lstm, _ = self.lstm1(x)
        h_lstm, _ = self.lstm2(h_lstm)
        h_lstm_atten = self.attention_layer(h_lstm)
        #x = self.dropout(x) 
        logit = self.relu(self.fc1(h_lstm_atten))
        logit = self.fc2(logit)
        return logit 


In [39]:
class CNN_Text_ATT(nn.Module):
    
    def __init__(self):
        super(CNN_Text_ATT, self).__init__()
        filter_sizes = [1,2,3,5]
        num_filters = 36
        self.embedding = nn.Embedding(max_features, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
        self.attention_layer = Attention(593, num_filters)
        self.dropout = nn.Dropout(0.1)
        self.fc1 = nn.Linear(593,128)
        self.fc2 = nn.Linear(128,6)
        self.relu = nn.ReLU()
     
    def forward(self, x):
        x = self.embedding(x)  
        x = x.unsqueeze(1)  
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] 
        x = torch.cat(x, 2) 
        #for i in x: print('shape after conv {}'.format(i.shape))
        x = self.attention_layer(x)
        x = self.dropout(x) 
        logit = self.relu(self.fc1(x))
        logit = self.fc2(logit)
        return logit 

In [40]:
def accuracy_thresh(y_pred, y_true, thresh:float=0.4, sigmoid:bool=True):
    "Compute accuracy when `y_pred` and `y_true` are the same size."
    if sigmoid: y_pred = y_pred.sigmoid()
#     return ((y_pred>thresh)==y_true.byte()).float().mean().item()
    return np.mean(((y_pred>thresh)==y_true.byte()).float().cpu().numpy(), axis=1).sum()

In [41]:
#model = CNN_Text_ATT()
model = CNN_LSTM_ATT()

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model.to(device)

lrlast = .01 #.001
lrmain = 3e-5

optim1 = torch.optim.Adam(model.parameters(),lrlast)

optimizer_ft = optim1

exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=3, gamma=0.1)
criterion = nn.BCEWithLogitsLoss()

print(device)

cuda:0


In [42]:
def train_model(model, criterion, optimizer, scheduler,num_epochs,attention):
    model.train()
    since = time.time()
    print('starting')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 100

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            F
            beta_score_accuracy = 0.0
            
            micro_roc_auc_acc = 0.0
            
            
            # Iterate over data.
            for inputs, sentiments in dataloaders_dict[phase]:
                
                inputs = inputs.to(device) 

                sentiments = sentiments.to(device)
            
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    #outputs = torch.sigmoid(outputs)
                    #print(outputs)
                    loss = criterion(outputs,sentiments.float())
                    
                    if phase == 'train':
                        
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                
                micro_roc_auc_acc +=  accuracy_thresh(outputs.view(-1,6),sentiments.view(-1,6))
                
                #print(micro_roc_auc_acc)

                
            epoch_loss = running_loss / dataset_sizes[phase]

            
            epoch_micro_roc_acc = micro_roc_auc_acc / dataset_sizes[phase]

            print('{} total loss: {:.4f} '.format(phase,epoch_loss ))
            print('{} micro_roc_auc_acc: {:.4f}'.format( phase, epoch_micro_roc_acc))

            if phase == 'val' and epoch_loss < best_loss:
                print('saving with loss of {}'.format(epoch_loss),
                      'improved over previous {}'.format(best_loss))
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(),'distilbert_model_weights.pth')

        print()
        time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(float(best_loss)))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
 
print('done')

done


In [43]:
model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,10,Attention(491, 36))
#model_ft1 = train_model(model, criterion, optimizer_ft, exp_lr_scheduler,10)

starting
Epoch 1/10
----------
train total loss: 0.0676 
train micro_roc_auc_acc: 0.9761
val total loss: 0.0602 
val micro_roc_auc_acc: 0.9768
saving with loss of 0.060239845689204104 improved over previous 100

Epoch 2/10
----------
train total loss: 0.0556 
train micro_roc_auc_acc: 0.9795
val total loss: 0.0532 
val micro_roc_auc_acc: 0.9807
saving with loss of 0.053170775624067815 improved over previous 0.060239845689204104

Epoch 3/10
----------
train total loss: 0.0489 
train micro_roc_auc_acc: 0.9813
val total loss: 0.0518 
val micro_roc_auc_acc: 0.9808
saving with loss of 0.05175525527193139 improved over previous 0.053170775624067815

Epoch 4/10
----------
train total loss: 0.0466 
train micro_roc_auc_acc: 0.9820
val total loss: 0.0513 
val micro_roc_auc_acc: 0.9807
saving with loss of 0.051348642666484895 improved over previous 0.05175525527193139

Epoch 5/10
----------
train total loss: 0.0448 
train micro_roc_auc_acc: 0.9826
val total loss: 0.0514 
val micro_roc_auc_acc: 0.9

### Make predictions

In [44]:
x_test = test['comment_text']
x_test=tok.texts_to_sequences(x_test)
x_test=sequence.pad_sequences(x_test,maxlen=maxlen)
y_test = np.zeros(x_test.shape[0]*6).reshape(x_test.shape[0],6)

In [45]:
test_dataset = text_dataset(x_test,y_test)
prediction_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False)

def preds(model,test_loader):
    predictions = []
    for inputs, sentiment in test_loader:
        inputs = inputs.to(device) 
        sentiment = sentiment.to(device)
        with torch.no_grad():
            outputs = model(inputs)
            outputs = torch.sigmoid(outputs)
            predictions.append(outputs.cpu().detach().numpy().tolist())
    return predictions

In [46]:
predictions = preds(model=model_ft1,test_loader=prediction_dataloader)
predictions = np.array(predictions)[:,0]

In [47]:
submission = pd.DataFrame(predictions,columns=['toxic','severe_toxic','obscene','threat','insult','identity_hate'])
test[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]=submission
final_sub = test[['id','toxic','severe_toxic','obscene','threat','insult','identity_hate']]
final_sub.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.990284,0.2254576,0.951126,0.047459,0.832735,0.164205
1,0000247867823ef7,0.000825,9.129903e-08,0.000117,1.3e-05,7.6e-05,1.2e-05
2,00013b17ad220c46,0.000536,5.443849e-08,8.9e-05,8e-06,5e-05,7e-06
3,00017563c3f7919a,0.00019,1.518447e-08,3.5e-05,3e-06,1.7e-05,3e-06
4,00017695ad8997eb,0.000853,1.508437e-07,0.000123,1.8e-05,8.3e-05,1.6e-05


In [48]:
final_sub.to_csv('submissions.csv',index=False)#
final_sub.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.990284,0.2254576,0.951126,0.047459,0.832735,0.164205
1,0000247867823ef7,0.000825,9.129903e-08,0.000117,1.3e-05,7.6e-05,1.2e-05
2,00013b17ad220c46,0.000536,5.443849e-08,8.9e-05,8e-06,5e-05,7e-06
3,00017563c3f7919a,0.00019,1.518447e-08,3.5e-05,3e-06,1.7e-05,3e-06
4,00017695ad8997eb,0.000853,1.508437e-07,0.000123,1.8e-05,8.3e-05,1.6e-05


## Play ground

#### CONV WITH attention

In [None]:
filter_sizes = [1,2,3,5]
#filter_sizes = [5]
num_filters = 36
embedding = nn.Embedding(max_features, embed_size)
#embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
embedding.weight.requires_grad = False
#lstm = nn.LSTM(embed_size, 128 , bidirectional = True, batch_first  = True)
lstm = nn.LSTM(embed_size, 128 , bidirectional = True, batch_first  = True)
lstm2 = nn.GRU(128*2,64,bidirectional=True,batch_first = True)
convs2 = nn.Conv2d(1, num_filters, (5, embed_size))
convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
attention_layer = Attention(593,36)
dropout = nn.Dropout(0.1)
#fc1 = nn.Linear(len(filter_sizes)*num_filters, 6)
fc1 = nn.Linear(593, 6)

In [None]:
x_play = train['comment_text'].values
X_test_train=tok.texts_to_sequences(x_play[:10])
X_test_train = sequence.pad_sequences(X_test_train,maxlen=maxlen)
x= torch.LongTensor(X_test_train)

x = embedding(x)
x = x.unsqueeze(1) 
#x = [F.relu(conv(x)).squeeze(3) for conv in convs1]
x= [F.relu(conv(x)).squeeze(3) for conv in convs1]
for i in x: print('shape after conv {}'.format(i.shape))

x = torch.cat(x, 2) 
print('shape after concat {}'.format(x.shape))

x=attention_layer(x)
print('shape after attention {}'.format(x.shape))
logits = fc1(x)

#### CONV WITH LSTM

In [None]:
filter_sizes = [1,2,3,5]
#filter_sizes = [5]
num_filters = 36
embedding = nn.Embedding(max_features, embed_size)
#embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
embedding.weight.requires_grad = False
#lstm = nn.LSTM(embed_size, 128 , bidirectional = True, batch_first  = True)
lstm = nn.LSTM(593, 128 , bidirectional = True, batch_first  = True)
lstm2 = nn.GRU(128*2,64,bidirectional=True,batch_first = True)
convs2 = nn.Conv2d(1, num_filters, (5, embed_size))
convs1 = nn.ModuleList([nn.Conv2d(1, num_filters, (K, embed_size)) for K in filter_sizes])
attention_layer = Attention(128,num_filters)
dropout = nn.Dropout(0.1)
relu = nn.ReLU()
#fc1 = nn.Linear(len(filter_sizes)*num_filters, 6)
fc1 = nn.Linear(128, 128)
relu = nn.ReLU()
out = nn.Linear(128,6)

In [None]:
x_play = train['comment_text'].values
X_test_train=tok.texts_to_sequences(x_play[:10])
X_test_train = sequence.pad_sequences(X_test_train,maxlen=maxlen)
x= torch.LongTensor(X_test_train)
x = embedding(x)
x = x.unsqueeze(1) 
x= [F.relu(conv(x)).squeeze(3) for conv in convs1]
#for i in x: print('shape after conv {}'.format(i.shape))
x = torch.cat(x, 2) 
h_lstm, _ =lstm(x)
h_lstm, _ = lstm2(h_lstm)
h_lstm_atten = attention_layer(h_lstm)
print(h_lstm_atten.shape)
conc = relu(fc1(h_lstm_atten))
out = out(conc)
out.shape
