In [270]:
import pandas as pd
import torch
import torch.nn as nn
from allennlp.commands.elmo import ElmoEmbedder
from allennlp.modules.elmo import Elmo, batch_to_ids
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


In [281]:
data=pd.read_csv('train.csv').fillna('')
data=data.drop(['id'],1)

In [282]:
data.shape

(159571, 7)

In [283]:
data=data.sample(n=5000)

In [11]:
elmo = ElmoEmbedder()

In [86]:
sample = elmo.embed_sentence(data['comment_text'].iloc[0]).mean(axis=0)
sample.shape

(264, 1024)

In [256]:
class get_embedding_data(Dataset):
    
    def __init__(self,comments,targets, max_len):
        self.comments = comments
        self.targets = targets
        self.max_len = max_len
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    def __len__(self):
        return len(self.comments)
    
    def __getitem__(self,i):
        comment = str(self.comments.iloc[i])
        comment = comment.split()
        comment = self.pad_tokens(comment,self.max_len)
        
        target = self.targets[i]
        vectors_in = elmo.embed_sentence(comment).mean(axis=0)
        return {'embedding':torch.FloatTensor(vectors_in),
                'targets': torch.FloatTensor(target)
               }
        
    @staticmethod
    def pad_tokens(sentence, max_len):
        if len(sentence)> max_len:
            sentence = sentence[:max_len]
        while len(sentence) < max_len:
            sentence.append('</s>')
        return sentence
        

In [284]:
x_train,x_test,y_train,y_test=train_test_split(data['comment_text'],data.loc[:,data.columns != 'comment_text'],
                                                   test_size=0.01, random_state = 5)

In [285]:
y_train = y_train.to_numpy()

In [640]:
x = get_embedding_data(x_train,y_train,50)

In [277]:
class LSTMClassifier(nn.Module):
    def __init__(self,embed_dim=1024, num_lstm=2, lstm_hidden=128, num_class=6):
        super(LSTMClassifier,self).__init__()
        self.embed_dim = embed_dim
        self.num_lstm = num_lstm
        self.lstm_hidden = lstm_hidden
        self.num_class = num_class
        self.lstm = nn.LSTM(embed_dim,lstm_hidden,num_lstm,batch_first=True)
        self.fc = nn.Linear(lstm_hidden, num_class)
        self.dropout = nn.Dropout(p=0.5)
    
    def forward(self,x):
        h0 = torch.zeros(self.num_lstm, x.size(0), self.lstm_hidden,device='cuda').requires_grad_()
        c0 = torch.zeros(self.num_lstm, x.size(0), self.lstm_hidden,device='cuda').requires_grad_()
        
        out, (hn, cn) = self.lstm(x,(h0.detach(),c0.detach()))
        out =  self.fc(out[:,-1,:])
        
        return out

In [280]:
def train_model(xtrain, ytrain, batch_size = 32, lr =.01, epoches=10):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('training on {}'.format(device))
    model = LSTMClassifier()
    loss_func =  nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model = model.train()
    model.to(device)
    data = get_embedding_data(x_train,y_train,50)
    data_loader = DataLoader(data, batch_size=batch_size)
    for epoch in range(epoches):
        losses = []
        for batches in data_loader:
            embedding = batches['embedding'].to(device)
            target = batches['targets'].to(device)
            preds = model(embedding)
            loss = loss_func(preds,target)
            losses.append(loss.item())
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        if epoch %5 == 0:
            print('loss at epoch {} is {}'.format(epoch+1,np.mean(losses)))
    
    return model

In [287]:
model = train_model(x_train,y_train)

training on cuda
loss at epoch 1 is 0.15678890450827537
loss at epoch 6 is 0.11260779862923007


In [293]:
model.eval()

LSTMClassifier(
  (lstm): LSTM(1024, 128, num_layers=2, batch_first=True)
  (fc): Linear(in_features=128, out_features=6, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [500]:
def pad_tokens(sentence, max_len):
    if len(sentence)> max_len:
        sentence = sentence[:max_len]
    while len(sentence) < max_len:
        sentence.append('</s>')
    return sentence

In [713]:
predictions = []
for i in range(5000):
    sentence = x_train.iloc[i].split()
    sentence = pad_tokens(sentence,50)
    sentence = elmo.embed_sentence(sentence).mean(axis=0)
    sentence = torch.FloatTensor(sentence)
    sentence=sentence.unsqueeze(0)
    out=model(sentence.cuda())
    out = ((out>0.5)*1).squeeze(0)
    predictions.append(out)

In [702]:
preds = [list(tensor.cpu().numpy()) for tensor in predictions]

In [703]:
y_pred = np.array(preds)

In [704]:
y_true = y_train[:5000,:]

In [630]:
sentence.shape 

(50, 1024)

In [631]:
sentence = torch.FloatTensor(sentence)
sentence=sentence.unsqueeze(0)

In [632]:
sentence.shape

torch.Size([1, 50, 1024])

In [633]:
out=model(sentence.cuda())

In [706]:
((out>0.5)*1).squeeze(0)

tensor([0, 0, 0, 0, 0, 0], device='cuda:0')

In [709]:
np.unique(y_true,return_counts=True)

(array([0, 1], dtype=int64), array([28550,  1150], dtype=int64))

In [710]:
np.unique(y_pred,return_counts=True)

(array([0, 1], dtype=int64), array([29285,   415], dtype=int64))

In [711]:
import numpy as np
from sklearn.metrics import hamming_loss, accuracy_score 


print("accuracy_score:", accuracy_score(y_true.reshape(-1), y_pred.reshape(-1)))
print("Hamming_loss:", hamming_loss(y_true, y_pred))


accuracy_score: 0.9681818181818181
Hamming_loss: 0.031818181818181815


In [None]:
y_