In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Reload edited project modules automatically before each cell runs, so
# changes to local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import os, sys
# Put the sibling ../classifier/ directory on the import path so that
# `import classifier` can be resolved. The membership check keeps the cell
# idempotent: re-running it does not pile duplicate entries onto sys.path.
lib_path = os.path.abspath(os.path.join('../classifier/'))
if lib_path not in sys.path:
    sys.path.append(lib_path)
import classifier

In [4]:
import nltk
import torch
import torchtext.vocab as vocab
import torch.optim as optim
import torch.autograd as autograd
import torch.nn as nn
import torch.utils.data as torchdata

In [5]:
import pickle

In [7]:
# Pre-trained GloVe word vectors loaded through torchtext: the '6B' set at
# 50 dimensions. The model cells below pass embedding_dim=50 as well —
# presumably the two values must stay in sync; confirm before changing either.
glove = vocab.GloVe(name='6B', dim=50)

# preprocess data

In [8]:
# Fraction of the leading train/test rows to keep when the CSVs are loaded
# (1 = keep every row); lower it for quick experiments on a subset.
subset_train = 1

In [9]:
# Directory prefix for the input CSVs and every generated file. It is joined
# to file names with plain string concatenation, so the trailing slash matters.
datapath = '../data/'

In [10]:
# Read the train, test and sample-submission CSVs from `datapath`.
train = pd.read_csv(datapath + 'train.csv')
test = pd.read_csv(datapath + 'test.csv')
sub = pd.read_csv(datapath + 'sample_submission.csv')

# Keep only the leading `subset_train` fraction of each frame.
n_train_rows = int(len(train) * subset_train)
n_test_rows = int(len(test) * subset_train)
train = train.iloc[:n_train_rows]
# Missing values in the test frame are replaced by the literal 'unknown'.
test = test.iloc[:n_test_rows].fillna('unknown')

# Every column from the third one onward; these names are used later to select
# the training targets and the submission columns.
comment_types = train.columns[2:].values

In [None]:
# Fresh Language object from the local `classifier` module. Later cells read
# lang.vocab_size, lang.encoded_sentences and lang.encoded_test_sentences from
# it — presumably populated by process_sentences; confirm in classifier.
lang = classifier.Language()

In [None]:
# Feed the raw comment texts to the Language object: training comments as the
# positional argument, test comments (as a plain list) via `test_sentences`.
lang.process_sentences(
    train.comment_text.values,
    test_sentences=list(test.comment_text.values),
)

In [None]:
# Cache the processed Language object on disk so it can be reloaded instead of
# being rebuilt. The context manager guarantees the file is flushed and closed
# even if pickling raises.
with open(datapath + 'lang.pk', 'wb') as lang_file:
    pickle.dump(lang, lang_file)

In [11]:
# Restore the cached Language object from lang.pk. The context manager closes
# the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code — only load a lang.pk that was
# produced by this project, never one from an untrusted source.
with open(datapath + 'lang.pk', 'rb') as lang_file:
    lang = pickle.load(lang_file)

In [12]:
# Derive the value passed to the model as `init_embedding` from the processed
# vocabulary and the GloVe vectors.
# NOTE(review): how words missing from GloVe are handled is decided inside
# classifier.create_init_embedding and is not visible in this notebook.
init_embedding = classifier.create_init_embedding(lang, glove)

In [13]:
# Instantiate classifier.NER_LSTM: one bidirectional layer, 50-d embeddings
# started from `init_embedding`, 64 hidden / 64 linear units, dropout 0.1.
# gpu=True is handed to the constructor; what it does with the flag is defined
# in the classifier module.
model = classifier.NER_LSTM(
    lang.vocab_size,
    embedding_dim=50,
    hidden_dim=64,
    linear_dim=64,
    n_layers=1,
    dropout=0.1,
    gpu=True,
    init_embedding=init_embedding,
    bidirectional=True,
)

In [14]:
# Adam over all model parameters, learning rate 1e-3.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Binary cross-entropy criterion.
# NOTE(review): BCELoss expects probabilities in [0, 1] — assumes the model
# already applies a sigmoid; confirm in classifier.NER_LSTM.
loss = nn.BCELoss()
# No explicit model.cuda() call here: the model was constructed with gpu=True.

In [15]:
# Bundle model, optimizer and criterion with the data: encoded training
# sentences as inputs, the `comment_types` columns of `train` cast to int as
# targets.
# NOTE(review): val_size=0.25 is presumably the validation fraction — confirm
# in classifier.Trainer.
trainer = classifier.Trainer(
    model,
    optimizer,
    loss,
    lang.encoded_sentences,
    train.loc[:, comment_types].values.astype(int),
    batch_size=32,
    val_size=0.25,
)

In [16]:
# Train with argument 2; the saved log below shows two epoch lines, each with
# a training loss and a validation loss.
trainer.train(2)

Epoch: 0    loss: 0.1320, val: 0.0543
Epoch: 1    loss: 0.1124, val: 0.0517


In [None]:
# Optional extra training call on the same trainer (presumably one more
# epoch). This cell has no execution count in the saved notebook, i.e. it was
# not part of the recorded run.
trainer.train(1)

In [21]:
# Accumulator for the summed test predictions of the ensemble, shaped like the
# label columns of the submission frame. The dtype is forced to float:
# zeros_like would otherwise inherit the dtype of the sample-submission
# columns, and an integer accumulator would make the in-place
# `results += final` fail for floating-point predictions.
results = np.zeros_like(sub.loc[:, comment_types].values, dtype=float)

In [23]:
# Train an ensemble of freshly constructed models and sum their test
# predictions into `results` (the averaging happens in a later cell).
# NOTE: no random seed is set anywhere in view, so runs are not reproducible.
N_MODELS = 10  # must match the divisor used when `results` is averaged

# Start from a clean float accumulator so that re-running this cell (for
# example after an interrupted run) cannot add on top of stale or
# already-averaged values left in the kernel.
results = np.zeros_like(sub.loc[:, comment_types].values, dtype=float)

for _ in range(N_MODELS):
    # New model, criterion and optimizer for every ensemble member.
    model = classifier.NER_LSTM(
        lang.vocab_size,
        embedding_dim=50,
        hidden_dim=64,
        linear_dim=64,
        n_layers=1,
        dropout=.1,
        gpu=True,
        init_embedding=init_embedding,
        bidirectional=True,
    )
    loss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # val_size is only 0.01 here (0.25 in the single-model cell), leaving
    # almost all rows for training.
    trainer = classifier.Trainer(
        model,
        optimizer,
        loss,
        lang.encoded_sentences,
        train.loc[:, comment_types].values.astype(int),
        batch_size=32,
        val_size=0.01,
    )
    trainer.train(2)

    # The helper is called with the same 'subm.csv' name on every iteration;
    # its return value is what gets accumulated.
    final = classifier.make_submission(lang, model, sub, 'subm.csv', datapath,
                                       comment_types)
    results += final

Epoch: 0    loss: 0.1277, val: 0.0441
Epoch: 1    loss: 0.1116, val: 0.0411
predicting...
saving file...
done
Epoch: 0    loss: 0.1274, val: 0.0523
Epoch: 1    loss: 0.1108, val: 0.0496
predicting...
saving file...
done
Epoch: 0    loss: 0.1276, val: 0.0497
Epoch: 1    loss: 0.1120, val: 0.0466
predicting...
saving file...
done
Epoch: 0    loss: 0.1289, val: 0.0511
Epoch: 1    loss: 0.1120, val: 0.0506
predicting...
saving file...
done
Epoch: 0    loss: 0.1277, val: 0.0446
Epoch: 1    loss: 0.1115, val: 0.0400
predicting...
saving file...
done
Epoch: 0    loss: 0.1277, val: 0.0621
Epoch: 1    loss: 0.1115, val: 0.0605
predicting...
saving file...
done
Epoch: 0    loss: 0.1283, val: 0.0674
Epoch: 1    loss: 0.1113, val: 0.0653
predicting...
saving file...
done
Epoch: 0    loss: 0.1282, val: 0.0476
Epoch: 1    loss: 0.1119, val: 0.0457
predicting...
saving file...
done
Epoch: 0    loss: 0.1279, val: 0.0448
Epoch: 1    loss: 0.1111, val: 0.0421
predicting...
saving file...
done
Epoch: 0  

In [24]:
# Average the summed ensemble predictions and save them as ensemble.csv.
# The mean is bound to a new name so this cell stays idempotent: `results`
# remains the running sum, and re-running the cell does not divide by 10 again.
# NOTE(review): the divisor assumes all 10 ensemble iterations completed. The
# saved output of the training loop appears to stop part-way through the tenth
# model — confirm the run finished before trusting this file.
ensemble_mean = results / 10
sub.loc[:, comment_types] = ensemble_mean
sub.to_csv(datapath + 'ensemble.csv', index=False)

In [None]:
# Manual batched inference over the encoded test sentences with the current
# `model`; the per-batch outputs are stacked row-wise into `final`.
X_test = lang.encoded_test_sentences

# The dataset pairs each input row with a dummy zero target; the prediction
# loop below ignores the targets.
y_empty = torch.zeros(X_test.shape[0])
dataset = torchdata.TensorDataset(torch.LongTensor(X_test.astype(int)), y_empty)
loader = torchdata.DataLoader(dataset, batch_size=256)

preds = []
model.eval()  # evaluation mode for prediction
for X, _ in loader:
    # Move the batch to the GPU, run the model, bring the result back as numpy.
    X = autograd.Variable(X).cuda()
    batch_preds = model(X).cpu().data.numpy()
    preds.append(batch_preds)
final = np.vstack(preds)

In [None]:
# Copy `final` into the label columns of the submission frame and save it as
# submission.csv without the index column.
# NOTE: `final` is whatever was assigned last in the kernel — both the manual
# inference cell and the ensemble loop assign it, so run order matters.
sub.loc[:, comment_types] = final
sub.to_csv(datapath + 'submission.csv', index = False)

In [20]:
# Predict with the current `model` and save through the project helper; the
# saved log shows it printing "predicting...", "saving file...", "done".
# NOTE(review): this cell's execution count (20) is lower than that of the
# ensemble cells (21-24) even though it sits last in the notebook, so in the
# recorded run `model` was presumably the single model trained earlier — confirm.
classifier.make_submission(lang, model, sub, 'subm.csv', datapath,
                          comment_types)

predicting...
saving file...
done
