In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os, sys
# Make the sibling ../classifier/ directory importable for the `import classifier` below.
lib_path = os.path.abspath('../classifier/')
sys.path.append(lib_path)
import classifier

In [4]:
import nltk
import torch
import torchtext.vocab as vocab
import torch.optim as optim
import torch.autograd as autograd
import torch.nn as nn
import torch.utils.data as torchdata

In [5]:
import pickle

In [6]:
# Load pre-trained GloVe word vectors through torchtext ('6B' set, 50 dimensions).
glove = vocab.GloVe(name='6B', dim=50)

# preprocess data

In [7]:
# Fraction of the leading rows of train/test kept when the CSVs are loaded below.
subset_train = 0.1

In [8]:
# Directory prefix for every input/output file; paths are built by plain string
# concatenation, so the trailing slash is required.
datapath = '../data/'

In [9]:
# Load the train / test CSVs and the sample submission used as output template.
train = pd.read_csv(datapath + 'train.csv')
test = pd.read_csv(datapath + 'test.csv')
sub = pd.read_csv(datapath + 'sample_submission.csv')

# Keep only the leading `subset_train` fraction of rows for a faster run.
n_train = int(len(train) * subset_train)
n_test = int(len(test) * subset_train)
train = train.iloc[:n_train, :]
test = test.iloc[:n_test, :]

# Truncate the submission template to the same rows as `test`; otherwise
# predictions made on the subset cannot be assigned into `sub` later on.
# Assumes sample_submission.csv is row-aligned with test.csv -- TODO confirm.
# `.copy()` so later `sub.loc[...] = ...` assignments write to an owned frame.
sub = sub.iloc[:n_test, :].copy()

# Replace missing values in the test frame with a placeholder string.
test = test.fillna('unknown')

# Label columns: every column of `train` after the first two.
comment_types = train.columns[2:].values

In [10]:
# Project helper object; the following cells use it to process the comments and
# read back `encoded_sentences` / `vocab_size`.
lang = classifier.Language()

In [11]:
# Hand the training comments (numpy array) and the test comments (plain list)
# to `lang` for processing.
lang.process_sentences(
    train.comment_text.values,
    test_sentences=list(test.comment_text.values),
)

In [12]:
# Cache the processed `lang` object on disk. The context manager guarantees the
# file is flushed and closed even if pickling raises.
with open(datapath + 'lang.pk', 'wb') as lang_file:
    pickle.dump(lang, lang_file)

In [10]:
# Restore the cached `lang` object instead of re-processing the comments.
# NOTE(review): pickle.load can execute arbitrary code -- only load a lang.pk
# that was written by this notebook.
with open(datapath + 'lang.pk', 'rb') as lang_file:
    lang = pickle.load(lang_file)

In [11]:
# Build the initial embedding handed to the models below from `lang` and the
# GloVe vectors (presumably one GloVe row per vocabulary entry -- TODO confirm).
init_embedding = classifier.create_init_embedding(lang, glove)

In [12]:
# Construct the CharLSTM classifier with gpu=False.
# Keyword arguments are grouped by prefix; the char_* ones presumably configure
# a character-level sub-network -- TODO confirm against classifier.CharLSTM.
model = classifier.CharLSTM(
    lang.vocab_size,
    # un-prefixed (presumably word-level) settings
    embedding_dim=50,
    hidden_dim=64,
    n_layers=1,
    bidirectional=True,
    init_embedding=init_embedding,
    # char_* settings
    char_size=127,
    char_embedding_dim=15,
    char_hidden_dim=32,
    char_n_layers=1,
    char_bidirectional=True,
    # remaining settings
    linear_dim=64,
    dropout=.1,
    gpu=False,
)

In [13]:
# Binary cross-entropy loss and an Adam optimizer over all model parameters.
loss = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# Left disabled: uncomment to move the model to the GPU.
# model = model.cuda()

In [18]:
# Wire model, optimizer and loss into the project's CharTrainer, together with
# the encoded sentences/chars from `lang` and the label columns cast to int.
# val_size=.25 is presumably the validation fraction -- TODO confirm.
trainer = classifier.CharTrainer(
    model,
    optimizer,
    loss,
    lang.encoded_sentences,
    lang.encoded_chars,
    train.loc[:, comment_types].values.astype(int),
    batch_size=512,
    val_size=.25,
)

In [None]:
# Run training; the argument is presumably the number of epochs -- TODO confirm.
trainer.train(2)

Epoch: 0    

In [None]:
# Continue training with argument 1 (presumably one more epoch -- TODO confirm).
# The closing parenthesis was missing, which made this cell a SyntaxError.
trainer.train(1)

In [None]:
# Accumulator for the ensemble predictions, shaped like the submission's label
# columns. The dtype is forced to float so the in-place `results += final`
# in the ensemble loop cannot fail if the template columns were read as integers.
results = np.zeros_like(sub.loc[:, comment_types].values, dtype=float)

In [None]:
# Train 10 freshly constructed NER_LSTM models and add each one's predictions
# to `results` (averaged in the next cell).
for _ in range(10):
    model = classifier.NER_LSTM(
        lang.vocab_size,
        embedding_dim=50,
        hidden_dim=64,
        linear_dim=64,
        n_layers=1,
        dropout=.1,
        gpu=True,
        init_embedding=init_embedding,
        bidirectional=True,
    )
    loss = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    trainer = classifier.Trainer(
        model,
        optimizer,
        loss,
        lang.encoded_sentences,
        train.loc[:, comment_types].values.astype(int),
        batch_size=32,
        val_size=0.01,
    )
    trainer.train(2)
    # Every iteration passes the same 'subm.csv' name; only the returned
    # predictions are accumulated here.
    final = classifier.make_submission(
        lang, model, sub, 'subm.csv', datapath, comment_types
    )
    results += final

In [None]:
# Average the accumulated predictions over the 10 ensemble members.
# The mean gets its own name so that re-running this cell does not divide
# `results` a second time.
ensemble_mean = results / 10
sub.loc[:, comment_types] = ensemble_mean
sub.to_csv(datapath + 'ensemble.csv', index=False)

In [None]:
# Manual inference over the encoded test sentences, in mini-batches of 256.
X_test = lang.encoded_test_sentences
# Dummy targets: the dataset is built from an (inputs, targets) pair, but only
# the inputs are used below.
y_empty = torch.zeros(X_test.shape[0])
dataset = torchdata.TensorDataset(torch.LongTensor(X_test.astype(int)), y_empty)
loader = torchdata.DataLoader(dataset, batch_size = 256)

# Follow the model's own device instead of calling .cuda() unconditionally,
# which crashed whenever the model lived on the CPU.
use_cuda = next(model.parameters()).is_cuda

preds = []
model.eval()
for X, _ in loader:
    X = autograd.Variable(X)
    if use_cuda:
        X = X.cuda()
    # Model outputs for this batch, moved back to the CPU as a numpy array
    # (presumably per-label probabilities, since training uses BCELoss).
    batch_preds = model(X).cpu().data.numpy()
    preds.append(batch_preds)
# Stack the per-batch arrays into one (rows x labels) array.
final = np.vstack(preds)

In [None]:
# Write the stacked predictions into the label columns of the submission
# template and save it without the index column.
sub.loc[:, comment_types] = final
sub.to_csv(datapath + 'submission.csv', index=False)

In [None]:
# Same prediction/export step through the project helper; the return value is
# left as the cell's last expression so the notebook displays it.
classifier.make_submission(
    lang,
    model,
    sub,
    'subm.csv',
    datapath,
    comment_types,
)