### HW1 Text Classifier
---

#### Setting Up

In [1]:
!wget http://phontron.com/data/topicclass-v1.tar.gz

--2020-02-04 20:02:43--  http://phontron.com/data/topicclass-v1.tar.gz
Resolving phontron.com (phontron.com)... 208.113.196.149
Connecting to phontron.com (phontron.com)|208.113.196.149|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15665160 (15M) [application/gzip]
Saving to: ‘topicclass-v1.tar.gz’


2020-02-04 20:02:44 (45.2 MB/s) - ‘topicclass-v1.tar.gz’ saved [15665160/15665160]



In [2]:
!tar -xvzf topicclass-v1.tar.gz topicclass

topicclass/
topicclass/topicclass_valid.txt
topicclass/topicclass_test.txt
topicclass/topicclass_train.txt


#### Data Preprocessing
---

In [0]:
def read_data(path):
  with open(path, "r") as f:
    data = f.readlines()
  labels, text = zip(*map(lambda x: x.split("|||"), data))
  
  labels = map(lambda x: x.strip("\n").strip().lower(), labels)
  labels = list(map(lambda x: "media and drama" if "media and darama" in x else x, labels))
  text = list(map(lambda x: x.strip("\n").strip().lower(), text))
  return text, labels

In [0]:
train_x, train_y = read_data("topicclass/topicclass_train.txt")
valid_x, valid_y = read_data("topicclass/topicclass_valid.txt")
test_x, test_y = read_data("topicclass/topicclass_test.txt")

In [0]:
import re
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
word_tokenizer = TreebankWordTokenizer()

def clean_string(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`\-\_]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\\", " ", string)
    string = re.sub(r"\s+", " ", string)
    return string.strip()

def tokenize(string):
    # return list(map(lemmatizer.lemmatize, word_tokenizer.tokenize(string)))
    return  word_tokenizer.tokenize(string)

def preprocess(texts):
    texts = map(clean_string, texts)
    texts = map(tokenize, texts)
    return list(texts)

In [6]:
%time train_x = preprocess(train_x)
%time valid_x = preprocess(valid_x)
%time test_x = preprocess(test_x)

CPU times: user 35.3 s, sys: 222 ms, total: 35.5 s
Wall time: 35.5 s
CPU times: user 84.9 ms, sys: 0 ns, total: 84.9 ms
Wall time: 85.1 ms
CPU times: user 92.8 ms, sys: 0 ns, total: 92.8 ms
Wall time: 92.8 ms


In [7]:
import pandas as pd
len_train = list(map(len, train_x))
pd.Series(len_train).quantile(1)

66.0

In [0]:
import itertools
import pandas as pd
import numpy as np

class Vocab(object):
  
  def __init__(self, L):
    if isinstance(L[0], list):
      tokens = list(itertools.chain(*L))
      self.token_counts = pd.Series(tokens).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = ["unk"] + self.token_counts.index.to_list()
    else:
      tokens = self.token_counts = pd.Series(L).value_counts().to_frame().sort_index(ascending=True)
      self.vocab = self.token_counts.index.to_list()
    self.w2i = dict(zip(self.vocab, range(len(self.vocab))))
    self.i2w = dict(zip(range(len(self.vocab)), self.vocab))

  def map_words2index(self, L):
    return list(map(lambda x: self.w2i[x] if x in self.w2i else self.w2i['unk'], L))

  def map_index2words(self, L):
    return list(map(lambda x: self.i2w[x], L))

  def map_dataset_words2index(self, L):
    return np.array(list(map(self.map_words2index, L)))

  def map_dataset_index2words(self, L):
    return np.array(list(map(self.map_index2words, L)))

In [9]:
%time vocab = Vocab(train_x + valid_x)
%time train_x = vocab.map_dataset_words2index(train_x)
%time valid_x = vocab.map_dataset_words2index(valid_x)
%time test_x = vocab.map_dataset_words2index(test_x)

CPU times: user 1.71 s, sys: 65 ms, total: 1.77 s
Wall time: 1.78 s
CPU times: user 3.07 s, sys: 32 ms, total: 3.1 s
Wall time: 3.1 s
CPU times: user 7.53 ms, sys: 9 µs, total: 7.54 ms
Wall time: 7.51 ms
CPU times: user 7.22 ms, sys: 153 µs, total: 7.38 ms
Wall time: 7.35 ms


In [10]:
print(len(vocab.vocab))
train_x_ = vocab.map_dataset_index2words(train_x)
" ".join(train_x_[0])

113157


'several of these rights regulate pre - trial procedure access to a non - excessive bail , the right to indictment by a grand jury , the right to an information ( charging document ) , the right to a speedy trial , and the right to be tried in a specific venue'

In [11]:
label_vocab = Vocab(train_y + valid_y)
label_vocab.w2i

{'agriculture, food and drink': 0,
 'art and architecture': 1,
 'engineering and technology': 2,
 'geography and places': 3,
 'history': 4,
 'language and literature': 5,
 'mathematics': 6,
 'media and drama': 7,
 'miscellaneous': 8,
 'music': 9,
 'natural sciences': 10,
 'philosophy and religion': 11,
 'social sciences and society': 12,
 'sports and recreation': 13,
 'video games': 14,
 'warfare': 15}

In [0]:
train_y = label_vocab.map_words2index(train_y)
valid_y = label_vocab.map_words2index(valid_y)

In [13]:
# from keras.utils import to_categorical

# train_y = to_categorical(train_y, num_classes=17)
# train_y = list(map(list, train_y))
# valid_y = to_categorical(valid_y, num_classes=17)
# valid_y = list(map(list, valid_y))
valid_y[:10]

[13, 13, 13, 7, 9, 9, 9, 7, 12, 4]

#### Dataset
---

In [0]:
import torch
from torch import nn, LongTensor
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from IPython.core.debugger import set_trace


In [0]:
class MyDataset(Dataset):

  def __init__(self, X, Y=None):
    self.X = X
    self.Y = Y

  def __len__(self):
    return len(self.X)

  def __getitem__(self, idx):
    if self.Y is not None:
      return (self.X[idx], self.Y[idx])
    return (self.X[idx], None)

def pad(seq, seq_lengths, pad_after=True):
  max_seq_len = max(seq_lengths)
  seq_tensor = Variable(torch.zeros((len(seq), max_seq_len))).long()
  # pad input tensor
  for idx, seq in enumerate(seq):
    seq_len = seq_lengths[idx]
    if pad_after:
      seq_tensor[idx, :seq_len] = LongTensor(np.asarray(seq).astype(int))
    else: 
      # pad before
      seq_tensor[idx, max_seq_len-seq_len:] = LongTensor(np.asarray(seq).astype(int))
  return seq_tensor

def batchify(data):
  X, Y = tuple(map(list, zip(*data)))
  seq_lengths = LongTensor([len(x) for x in X])
  X = pad(X, seq_lengths, pad_after=True)
  Y = LongTensor(Y)
  return X, Y

def batchify_test(data):
  X, Y = tuple(map(list, zip(*data)))
  seq_lengths = LongTensor([len(x) for x in X])
  X = pad(X, seq_lengths, pad_after=True)
  return X, Y


train = MyDataset(train_x, train_y)
valid = MyDataset(valid_x, valid_y)
test = MyDataset(test_x)

In [0]:
train_loader = DataLoader(train, batch_size=64, shuffle=True, collate_fn=batchify)
valid_loader = DataLoader(valid, batch_size=64, shuffle=False, collate_fn=batchify)
test_loader = DataLoader(test, batch_size=64, shuffle=False, collate_fn=batchify_test)

#### Model Fitting
---

In [0]:
from torch import nn, LongTensor, Tensor
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Dataset
import torch.optim as optim
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

In [0]:
import time
def accuracy(preds, y):
  return (np.array(preds) == np.array(y)).astype(int).mean()


def train_epoch(epoch, model, optimizer, criterion):
  model.train()
  train_loss, n_data = 0, 0
  start = time.time()
  preds = []
  labels = []
  for i, (x, y) in enumerate(train_loader):
    n_data += x.size()[0]
    labels.extend(y.tolist())
    if is_cuda: x, y = x.cuda(), y.cuda()
    optimizer.zero_grad()
    out = model(x)
    preds.extend(out.argmax(axis=1).tolist())
    loss = criterion(out, y)
    loss.backward()
    if grad_clip: torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
    optimizer.step()
    train_loss += loss
    if i % print_iter == print_iter - 1:
      model, valid_preds, valid_labels, valid_loss = validate(model, criterion)
      print("""epoch {} - batch [{}/{}] - train loss: {:.2f} - acc: {:.3f} - valid loss : {:.2f} - acc : {:.3f} time taken: {:.2f}""".format(epoch, i, 
            len(train_loader), train_loss/(i+1),
            accuracy(preds, labels), valid_loss, accuracy(valid_preds, valid_labels),
            time.time()-start), flush=True)
      
      model.train()
      start = time.time()
      train_loss = 0

  # end of epoch
  model, valid_preds, valid_labels, valid_loss = validate(model, criterion)
  print("""epoch {} - batch [{}/{}] - train loss: {:.2f} - acc: {:.3f} - valid loss : {:.2f} - acc : {:.3f} time taken: {:.2f}""".format(epoch, i, 
        len(train_loader), train_loss/(i+1),
        accuracy(preds, labels), valid_loss, accuracy(valid_preds, valid_labels),
        time.time()-start), flush=True)
  return model

def learning_rate_decay(optimizer):
  for param_group in optimizer.param_groups:
        param_group['lr'] = param_group['lr'] * 0.1
  return optimizer

def training(model, epoches, lr, wd):
  if torch.cuda.is_available():
    model.cuda()
  optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
  criterion = nn.CrossEntropyLoss()
  for ep in range(epoches):
    model = train_epoch(ep, model, optimizer, criterion)
    optimizer = learning_rate_decay(optimizer)
  return model

def validate(model, criterion):
  model.eval()
  valid_loss = 0
  preds, labels = [], []
  for i, (x, y) in enumerate(valid_loader):
    labels.extend(y.tolist())
    if torch.cuda.is_available(): x, y = x.cuda(), y.cuda()
    out = model(x)
    loss = criterion(out, y)
    preds.extend(out.argmax(axis=1).tolist())
    valid_loss += loss
  return model, preds, labels, valid_loss/(i+1)
    
def predict(model, loader):
  model.eval()
  preds, labels = [], []
  for i, (x, _) in enumerate(loader):
    if torch.cuda.is_available(): x = x.cuda()
    out = model(x)
    preds.extend(out.argmax(axis=1).tolist())
  return preds


#### Model
---

In [0]:
from torch.nn.utils import weight_norm

class LSTM_clf(nn.Module):

  def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               layers=1, bidirectional=False):
    super(LSTM_clf, self).__init__()
    self.word_embedding = nn.Embedding(vocab_size, embed_dim)
    self.net = nn.LSTM(embed_dim, hidden_dim,  num_layers=layers, 
                       bidirectional=bidirectional, dropout=0.5)
    self.relu = nn.ReLU()
    self.bn = nn.BatchNorm1d(hidden_dim * (int(bidirectional) + 1))
    self.linear = nn.Linear(hidden_dim * (int(bidirectional) + 1), out_size)

  def forward(self, x):
    out = self.word_embedding(x)
    out = self.net(out)[0]
    out = self.relu(out).transpose(1,2)
    out = F.max_pool1d(out, out.size()[2]).squeeze()
    out = self.linear(self.bn(out))
    return out

class DCNN_block(nn.Module):
  
  def __init__(self, embed_dim, hidden_dim, kernel_size, dilations=None,
               dropout=0.2):
    super(DCNN_block, self).__init__()
    self.conv1 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, dilation=1))
    self.conv2 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, dilation=2))
    self.conv3 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, dilation=4))
    self.net = nn.Sequential(self.conv1, nn.ReLU(), nn.Dropout(dropout),
                             self.conv2, nn.ReLU(), nn.Dropout(dropout), 
                             self.conv3, nn.ReLU(), nn.Dropout(dropout))
  
  def forward(self, x):
    # N x C x L
    return self.net(x)

class DCNN_rez_block(nn.Module):
  
  def __init__(self, embed_dim, hidden_dim, kernel_size, dilations=None,
               dropout=0.2):
    super(DCNN_rez_block, self).__init__()
    self.conv1 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, 
                                       padding=(kernel_size-1)*1, dilation=1))
    self.conv2 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, 
                                       padding=(kernel_size-1)*2, dilation=2))
    self.conv3 = weight_norm(nn.Conv1d(embed_dim, hidden_dim, kernel_size, 
                                       padding=(kernel_size-1)*4, dilation=4))

    self.relu1 = nn.ReLU()
    self.relu2 = nn.ReLU()
    self.relu3 = nn.ReLU()

    self.do1 = nn.Dropout(dropout)
    self.do2 = nn.Dropout(dropout)
    self.do3 = nn.Dropout(dropout)
  
  def forward(self, x):
    # N x C x L
    seq_len = x.size()[2]
    out = self.do1(self.relu1(self.conv1(x)))[:, :, -seq_len:]
    out = out + self.do2(self.relu2(self.conv2(x)))[:, :, -seq_len:]
    out = out + self.do3(self.relu3(self.conv3(x)))[:, :, -seq_len:]
    return out


class DCNN(nn.Module):

  def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               kernel_size, dilations=None, rez_block=True, 
               dropout=0.2):
    super(DCNN, self).__init__()
    self.word_embedding = nn.Embedding(vocab_size, embed_dim)
    if rez_block: 
      self.net = DCNN_rez_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)
    else:
      self.net = DCNN_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)
    self.bn = nn.BatchNorm1d(hidden_dim)
    self.do = nn.Dropout(dropout)
    self.linear = nn.Linear(hidden_dim, out_size)

  def forward(self, x):
    out = self.word_embedding(x)
    out = self.net(out.transpose(1,2))
    out = F.max_pool1d(out, out.size()[2]).squeeze()
    out = self.linear(self.do(self.bn(out)))
    return out


class DDCNN(nn.Module):
  # Dilated and Dense CNN
  def __init__(self, embed_dim, hidden_dim, vocab_size, out_size, 
               kernel_size, dilations=None, rez_block=True, 
               dropout=0.2):
    super(DDCNN, self).__init__()
    self.word_embedding = nn.Embedding(vocab_size, embed_dim)
    if rez_block: 
      self.dcnn = DCNN_rez_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)
    else:
      self.dcnn = DCNN_block(embed_dim, hidden_dim, kernel_size, dilations, dropout)

    self.do1 = nn.Dropout(dropout)
    self.do2 = nn.Dropout(dropout)
    self.do3 = nn.Dropout(dropout)
    self.cnn1 = weight_norm(nn.Conv1d(embed_dim, int(hidden_dim//3), 4, padding=3, dilation=1))
    self.cnn2 = weight_norm(nn.Conv1d(embed_dim, int(hidden_dim//3), 6, padding=5, dilation=1))
    self.cnn3 = weight_norm(nn.Conv1d(embed_dim, int(hidden_dim//3), 8, padding=7, dilation=1))
    
    self.bn = nn.BatchNorm1d(hidden_dim*2)
    self.do = nn.Dropout(dropout)
    self.linear = nn.Linear(hidden_dim*2, out_size)

  def cnn(self, x):
    out1 = F.relu(self.cnn1(self.do1(x)))
    out2 = F.relu(self.cnn2(self.do2(x)))
    out3 = F.relu(self.cnn3(self.do3(x)))
    outs = []
    for o in [out1, out2, out3]:
      outs.append(F.max_pool1d(o, o.size()[2]).squeeze())
    out = torch.cat(outs, 1)
    return out

  def forward(self, x):
    out = self.word_embedding(x).transpose(1,2)
    dcnn_out = self.dcnn(out)
    cnn_out = self.cnn(out)
    dcnn_out = F.max_pool1d(dcnn_out, dcnn_out.size()[2]).squeeze()
    out = self.linear(self.do(self.bn(torch.cat((dcnn_out,cnn_out), 1))))
    return out

In [0]:
torch.manual_seed(1)
bs = 64
n_class = 16
epochs = 3
lstm_hidden = 300
cnn_hidden = 300
embed_dim = 300
layers = 2
kernel_size = 3
vocab_size = len(vocab.vocab)
is_cuda = torch.cuda.is_available()
lr = 0.002
grad_clip = 1
print_iter = 500
lstm1 = LSTM_clf(embed_dim, lstm_hidden, vocab_size, n_class, layers)
dcnn1 = DCNN(embed_dim, cnn_hidden, vocab_size, n_class, 3, 
             rez_block=False, dropout=0.2)
dcnn_rez1 = DCNN(500, 500, vocab_size, n_class, 5, 
                 rez_block=True, dropout=0.2)
dcnn_rez2 = DCNN(embed_dim, 300, vocab_size, n_class, 3, 
                 rez_block=True, dropout=0.2)
dcnn_rez3 = DCNN(embed_dim, 500, vocab_size, n_class, 3, 
                 rez_block=True, dropout=0.3)
ddcnn_rez1 = DDCNN(500, 600, vocab_size, n_class, 5, 
                 rez_block=True, dropout=0.2)
ddcnn_rez2 = DDCNN(embed_dim, 150, vocab_size, n_class, 3, 
                 rez_block=True, dropout=0.2)
ddcnn_rez3 = DDCNN(embed_dim, 300, vocab_size, n_class, 3, 
                 rez_block=True, dropout=0.1)

In [0]:
train_loader = DataLoader(train, batch_size=bs, shuffle=True, collate_fn=batchify)
valid_loader = DataLoader(valid, batch_size=bs, shuffle=False, collate_fn=batchify)
test_loader = DataLoader(test, batch_size=bs, shuffle=False, collate_fn=batchify_test)

In [22]:
%time training(ddcnn_rez1, 2, 2e-3, 1e-4)  

epoch 0 - batch [499/3968] - train loss: 1.80 - acc: 0.472 - valid loss : 1.36 - acc : 0.605 time taken: 36.47
epoch 0 - batch [999/3968] - train loss: 0.70 - acc: 0.533 - valid loss : 1.17 - acc : 0.672 time taken: 37.48
epoch 0 - batch [1499/3968] - train loss: 0.41 - acc: 0.568 - valid loss : 0.86 - acc : 0.712 time taken: 37.18
epoch 0 - batch [1999/3968] - train loss: 0.27 - acc: 0.596 - valid loss : 0.92 - acc : 0.726 time taken: 37.39
epoch 0 - batch [2499/3968] - train loss: 0.21 - acc: 0.614 - valid loss : 0.96 - acc : 0.747 time taken: 37.31
epoch 0 - batch [2999/3968] - train loss: 0.16 - acc: 0.631 - valid loss : 0.85 - acc : 0.770 time taken: 37.36
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.645 - valid loss : 0.90 - acc : 0.773 time taken: 37.48
epoch 0 - batch [3967/3968] - train loss: 0.10 - acc: 0.655 - valid loss : 0.86 - acc : 0.795 time taken: 35.10
epoch 1 - batch [499/3968] - train loss: 0.62 - acc: 0.814 - valid loss : 0.85 - acc : 0.810 time taken: 3

DDCNN(
  (word_embedding): Embedding(113157, 500)
  (dcnn): DCNN_rez_block(
    (conv1): Conv1d(500, 600, kernel_size=(5,), stride=(1,), padding=(4,))
    (conv2): Conv1d(500, 600, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(2,))
    (conv3): Conv1d(500, 600, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(4,))
    (relu1): ReLU()
    (relu2): ReLU()
    (relu3): ReLU()
    (do1): Dropout(p=0.2, inplace=False)
    (do2): Dropout(p=0.2, inplace=False)
    (do3): Dropout(p=0.2, inplace=False)
  )
  (do1): Dropout(p=0.2, inplace=False)
  (do2): Dropout(p=0.2, inplace=False)
  (do3): Dropout(p=0.2, inplace=False)
  (cnn1): Conv1d(500, 200, kernel_size=(4,), stride=(1,), padding=(3,))
  (cnn2): Conv1d(500, 200, kernel_size=(6,), stride=(1,), padding=(5,))
  (cnn3): Conv1d(500, 200, kernel_size=(8,), stride=(1,), padding=(7,))
  (bn): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.2, inplace=False)
  (linear): Linear(i

In [23]:
%time training(ddcnn_rez2, 2, 2e-3, 0) 

epoch 0 - batch [499/3968] - train loss: 1.78 - acc: 0.454 - valid loss : 1.25 - acc : 0.631 time taken: 14.52
epoch 0 - batch [999/3968] - train loss: 0.68 - acc: 0.520 - valid loss : 1.07 - acc : 0.701 time taken: 14.58
epoch 0 - batch [1499/3968] - train loss: 0.40 - acc: 0.559 - valid loss : 0.92 - acc : 0.726 time taken: 14.66
epoch 0 - batch [1999/3968] - train loss: 0.27 - acc: 0.585 - valid loss : 1.00 - acc : 0.729 time taken: 14.64
epoch 0 - batch [2499/3968] - train loss: 0.20 - acc: 0.607 - valid loss : 0.88 - acc : 0.759 time taken: 14.54
epoch 0 - batch [2999/3968] - train loss: 0.16 - acc: 0.622 - valid loss : 0.92 - acc : 0.776 time taken: 14.47
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.635 - valid loss : 0.87 - acc : 0.807 time taken: 14.27
epoch 0 - batch [3967/3968] - train loss: 0.11 - acc: 0.646 - valid loss : 0.82 - acc : 0.823 time taken: 13.24
epoch 1 - batch [499/3968] - train loss: 0.71 - acc: 0.782 - valid loss : 0.84 - acc : 0.818 time taken: 1

DDCNN(
  (word_embedding): Embedding(113157, 300)
  (dcnn): DCNN_rez_block(
    (conv1): Conv1d(300, 150, kernel_size=(3,), stride=(1,), padding=(2,))
    (conv2): Conv1d(300, 150, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
    (conv3): Conv1d(300, 150, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
    (relu1): ReLU()
    (relu2): ReLU()
    (relu3): ReLU()
    (do1): Dropout(p=0.2, inplace=False)
    (do2): Dropout(p=0.2, inplace=False)
    (do3): Dropout(p=0.2, inplace=False)
  )
  (do1): Dropout(p=0.2, inplace=False)
  (do2): Dropout(p=0.2, inplace=False)
  (do3): Dropout(p=0.2, inplace=False)
  (cnn1): Conv1d(300, 50, kernel_size=(4,), stride=(1,), padding=(3,))
  (cnn2): Conv1d(300, 50, kernel_size=(6,), stride=(1,), padding=(5,))
  (cnn3): Conv1d(300, 50, kernel_size=(8,), stride=(1,), padding=(7,))
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_fea

In [24]:
%time training(ddcnn_rez3, 2, 2e-3, 0) 

epoch 0 - batch [499/3968] - train loss: 1.70 - acc: 0.484 - valid loss : 1.20 - acc : 0.649 time taken: 15.91
epoch 0 - batch [999/3968] - train loss: 0.66 - acc: 0.545 - valid loss : 1.08 - acc : 0.692 time taken: 15.98
epoch 0 - batch [1499/3968] - train loss: 0.39 - acc: 0.580 - valid loss : 1.04 - acc : 0.729 time taken: 15.94
epoch 0 - batch [1999/3968] - train loss: 0.26 - acc: 0.606 - valid loss : 1.00 - acc : 0.750 time taken: 16.01
epoch 0 - batch [2499/3968] - train loss: 0.20 - acc: 0.625 - valid loss : 0.93 - acc : 0.779 time taken: 16.02
epoch 0 - batch [2999/3968] - train loss: 0.15 - acc: 0.640 - valid loss : 0.87 - acc : 0.779 time taken: 16.06
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.653 - valid loss : 0.77 - acc : 0.788 time taken: 16.04
epoch 0 - batch [3967/3968] - train loss: 0.10 - acc: 0.663 - valid loss : 0.80 - acc : 0.793 time taken: 14.86
epoch 1 - batch [499/3968] - train loss: 0.62 - acc: 0.811 - valid loss : 0.78 - acc : 0.795 time taken: 1

DDCNN(
  (word_embedding): Embedding(113157, 300)
  (dcnn): DCNN_rez_block(
    (conv1): Conv1d(300, 300, kernel_size=(3,), stride=(1,), padding=(2,))
    (conv2): Conv1d(300, 300, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
    (conv3): Conv1d(300, 300, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
    (relu1): ReLU()
    (relu2): ReLU()
    (relu3): ReLU()
    (do1): Dropout(p=0.1, inplace=False)
    (do2): Dropout(p=0.1, inplace=False)
    (do3): Dropout(p=0.1, inplace=False)
  )
  (do1): Dropout(p=0.1, inplace=False)
  (do2): Dropout(p=0.1, inplace=False)
  (do3): Dropout(p=0.1, inplace=False)
  (cnn1): Conv1d(300, 100, kernel_size=(4,), stride=(1,), padding=(3,))
  (cnn2): Conv1d(300, 100, kernel_size=(6,), stride=(1,), padding=(5,))
  (cnn3): Conv1d(300, 100, kernel_size=(8,), stride=(1,), padding=(7,))
  (bn): BatchNorm1d(600, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.1, inplace=False)
  (linear): Linear(in_

In [25]:
%time training(dcnn_rez1, 2, 2e-3, 1e-4)  

epoch 0 - batch [499/3968] - train loss: 1.82 - acc: 0.446 - valid loss : 1.15 - acc : 0.628 time taken: 26.83
epoch 0 - batch [999/3968] - train loss: 0.66 - acc: 0.523 - valid loss : 1.12 - acc : 0.680 time taken: 26.98
epoch 0 - batch [1499/3968] - train loss: 0.38 - acc: 0.567 - valid loss : 0.99 - acc : 0.705 time taken: 27.06
epoch 0 - batch [1999/3968] - train loss: 0.26 - acc: 0.596 - valid loss : 0.88 - acc : 0.750 time taken: 27.08
epoch 0 - batch [2499/3968] - train loss: 0.20 - acc: 0.618 - valid loss : 0.87 - acc : 0.762 time taken: 27.09
epoch 0 - batch [2999/3968] - train loss: 0.15 - acc: 0.635 - valid loss : 0.81 - acc : 0.785 time taken: 27.15
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.649 - valid loss : 0.82 - acc : 0.790 time taken: 26.99
epoch 0 - batch [3967/3968] - train loss: 0.10 - acc: 0.660 - valid loss : 0.83 - acc : 0.792 time taken: 24.86
epoch 1 - batch [499/3968] - train loss: 0.60 - acc: 0.818 - valid loss : 0.80 - acc : 0.796 time taken: 2

DCNN(
  (word_embedding): Embedding(113157, 500)
  (net): DCNN_rez_block(
    (conv1): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(4,))
    (conv2): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(8,), dilation=(2,))
    (conv3): Conv1d(500, 500, kernel_size=(5,), stride=(1,), padding=(16,), dilation=(4,))
    (relu1): ReLU()
    (relu2): ReLU()
    (relu3): ReLU()
    (do1): Dropout(p=0.2, inplace=False)
    (do2): Dropout(p=0.2, inplace=False)
    (do3): Dropout(p=0.2, inplace=False)
  )
  (bn): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=500, out_features=16, bias=True)
)

In [26]:
%time training(dcnn_rez2, 2, 2e-3, 0)    

epoch 0 - batch [499/3968] - train loss: 1.79 - acc: 0.451 - valid loss : 1.26 - acc : 0.621 time taken: 13.09
epoch 0 - batch [999/3968] - train loss: 0.68 - acc: 0.517 - valid loss : 1.12 - acc : 0.677 time taken: 13.12
epoch 0 - batch [1499/3968] - train loss: 0.39 - acc: 0.559 - valid loss : 1.03 - acc : 0.725 time taken: 13.15
epoch 0 - batch [1999/3968] - train loss: 0.27 - acc: 0.587 - valid loss : 0.96 - acc : 0.737 time taken: 13.15
epoch 0 - batch [2499/3968] - train loss: 0.20 - acc: 0.608 - valid loss : 0.95 - acc : 0.756 time taken: 13.15
epoch 0 - batch [2999/3968] - train loss: 0.16 - acc: 0.625 - valid loss : 0.89 - acc : 0.781 time taken: 13.15
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.638 - valid loss : 0.85 - acc : 0.787 time taken: 13.07
epoch 0 - batch [3967/3968] - train loss: 0.11 - acc: 0.649 - valid loss : 0.79 - acc : 0.793 time taken: 12.07
epoch 1 - batch [499/3968] - train loss: 0.72 - acc: 0.778 - valid loss : 0.79 - acc : 0.790 time taken: 1

DCNN(
  (word_embedding): Embedding(113157, 300)
  (net): DCNN_rez_block(
    (conv1): Conv1d(300, 300, kernel_size=(3,), stride=(1,), padding=(2,))
    (conv2): Conv1d(300, 300, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
    (conv3): Conv1d(300, 300, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
    (relu1): ReLU()
    (relu2): ReLU()
    (relu3): ReLU()
    (do1): Dropout(p=0.2, inplace=False)
    (do2): Dropout(p=0.2, inplace=False)
    (do3): Dropout(p=0.2, inplace=False)
  )
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=300, out_features=16, bias=True)
)

In [27]:
%time training(dcnn_rez3, 2, 2e-3, 1e-4)    

epoch 0 - batch [499/3968] - train loss: 1.88 - acc: 0.427 - valid loss : 1.42 - acc : 0.586 time taken: 14.94
epoch 0 - batch [999/3968] - train loss: 0.70 - acc: 0.501 - valid loss : 1.19 - acc : 0.621 time taken: 14.97
epoch 0 - batch [1499/3968] - train loss: 0.41 - acc: 0.543 - valid loss : 1.05 - acc : 0.694 time taken: 15.02
epoch 0 - batch [1999/3968] - train loss: 0.28 - acc: 0.573 - valid loss : 0.91 - acc : 0.745 time taken: 15.03
epoch 0 - batch [2499/3968] - train loss: 0.21 - acc: 0.596 - valid loss : 0.90 - acc : 0.759 time taken: 15.04
epoch 0 - batch [2999/3968] - train loss: 0.17 - acc: 0.613 - valid loss : 0.89 - acc : 0.767 time taken: 15.08
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.628 - valid loss : 0.90 - acc : 0.768 time taken: 15.06
epoch 0 - batch [3967/3968] - train loss: 0.11 - acc: 0.639 - valid loss : 0.86 - acc : 0.788 time taken: 14.11
epoch 1 - batch [499/3968] - train loss: 0.74 - acc: 0.774 - valid loss : 0.84 - acc : 0.790 time taken: 1

DCNN(
  (word_embedding): Embedding(113157, 300)
  (net): DCNN_rez_block(
    (conv1): Conv1d(300, 500, kernel_size=(3,), stride=(1,), padding=(2,))
    (conv2): Conv1d(300, 500, kernel_size=(3,), stride=(1,), padding=(4,), dilation=(2,))
    (conv3): Conv1d(300, 500, kernel_size=(3,), stride=(1,), padding=(8,), dilation=(4,))
    (relu1): ReLU()
    (relu2): ReLU()
    (relu3): ReLU()
    (do1): Dropout(p=0.3, inplace=False)
    (do2): Dropout(p=0.3, inplace=False)
    (do3): Dropout(p=0.3, inplace=False)
  )
  (bn): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.3, inplace=False)
  (linear): Linear(in_features=500, out_features=16, bias=True)
)

In [28]:
%time training(dcnn1, 2, 2e-3, 0)

epoch 0 - batch [499/3968] - train loss: 2.06 - acc: 0.343 - valid loss : 1.42 - acc : 0.558 time taken: 11.57
epoch 0 - batch [999/3968] - train loss: 0.66 - acc: 0.466 - valid loss : 1.14 - acc : 0.672 time taken: 11.58
epoch 0 - batch [1499/3968] - train loss: 0.39 - acc: 0.525 - valid loss : 1.00 - acc : 0.712 time taken: 11.47
epoch 0 - batch [1999/3968] - train loss: 0.27 - acc: 0.561 - valid loss : 0.98 - acc : 0.729 time taken: 11.35
epoch 0 - batch [2499/3968] - train loss: 0.20 - acc: 0.588 - valid loss : 0.95 - acc : 0.748 time taken: 11.22
epoch 0 - batch [2999/3968] - train loss: 0.16 - acc: 0.608 - valid loss : 0.92 - acc : 0.754 time taken: 11.24
epoch 0 - batch [3499/3968] - train loss: 0.13 - acc: 0.624 - valid loss : 0.91 - acc : 0.771 time taken: 11.18
epoch 0 - batch [3967/3968] - train loss: 0.10 - acc: 0.636 - valid loss : 0.85 - acc : 0.767 time taken: 10.45
epoch 1 - batch [499/3968] - train loss: 0.73 - acc: 0.780 - valid loss : 0.82 - acc : 0.779 time taken: 1

DCNN(
  (word_embedding): Embedding(113157, 300)
  (net): DCNN_block(
    (conv1): Conv1d(300, 300, kernel_size=(3,), stride=(1,))
    (conv2): Conv1d(300, 300, kernel_size=(3,), stride=(1,), dilation=(2,))
    (conv3): Conv1d(300, 300, kernel_size=(3,), stride=(1,), dilation=(4,))
    (net): Sequential(
      (0): Conv1d(300, 300, kernel_size=(3,), stride=(1,))
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Conv1d(300, 300, kernel_size=(3,), stride=(1,), dilation=(2,))
      (4): ReLU()
      (5): Dropout(p=0.2, inplace=False)
      (6): Conv1d(300, 300, kernel_size=(3,), stride=(1,), dilation=(4,))
      (7): ReLU()
      (8): Dropout(p=0.2, inplace=False)
    )
  )
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (do): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=300, out_features=16, bias=True)
)

In [29]:
%time training(lstm1, 2, 2e-3, 0)

epoch 0 - batch [499/3968] - train loss: 1.99 - acc: 0.371 - valid loss : 1.43 - acc : 0.574 time taken: 16.95
epoch 0 - batch [999/3968] - train loss: 0.70 - acc: 0.475 - valid loss : 1.30 - acc : 0.663 time taken: 16.96
epoch 0 - batch [1499/3968] - train loss: 0.41 - acc: 0.526 - valid loss : 1.15 - acc : 0.708 time taken: 17.05
epoch 0 - batch [1999/3968] - train loss: 0.29 - acc: 0.557 - valid loss : 1.09 - acc : 0.728 time taken: 17.09
epoch 0 - batch [2499/3968] - train loss: 0.22 - acc: 0.579 - valid loss : 1.12 - acc : 0.717 time taken: 17.18
epoch 0 - batch [2999/3968] - train loss: 0.18 - acc: 0.595 - valid loss : 0.99 - acc : 0.739 time taken: 17.17
epoch 0 - batch [3499/3968] - train loss: 0.15 - acc: 0.609 - valid loss : 1.03 - acc : 0.739 time taken: 17.29
epoch 0 - batch [3967/3968] - train loss: 0.12 - acc: 0.619 - valid loss : 1.02 - acc : 0.759 time taken: 16.03
epoch 1 - batch [499/3968] - train loss: 0.90 - acc: 0.724 - valid loss : 0.99 - acc : 0.759 time taken: 1

LSTM_clf(
  (word_embedding): Embedding(113157, 300)
  (net): LSTM(300, 300, num_layers=2, dropout=0.5)
  (relu): ReLU()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (linear): Linear(in_features=300, out_features=16, bias=True)
)

In [0]:
models = [dcnn1, dcnn_rez1, dcnn_rez2, dcnn_rez3, lstm1, ddcnn_rez1, ddcnn_rez2, ddcnn_rez3]

In [45]:
criterion = nn.CrossEntropyLoss()
valid_preds_dict = {}
test_preds_dict = {}
for i, model in enumerate(models):
    _, valid_preds, valid_labels, valid_loss = validate(model, criterion)
    print("accuracy: ", accuracy(valid_preds, valid_labels))
    valid_preds_ = predict(model, valid_loader)
    print("verify predicts won't mess up order", accuracy(valid_preds, valid_preds))
    print("generating test preds")
    test_preds = predict(model, test_loader)
    
    valid_preds_dict["model_{}".format(i)] = valid_preds_
    test_preds_dict["model_{}".format(i)] = test_preds
    

accuracy:  0.7962674961119751
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.8055987558320373
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.8055987558320373
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.8211508553654744
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.7698289269051322
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.8227060653188181
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.8242612752721618
verify predicts won't mess up order 1.0
generating test preds
accuracy:  0.8040435458786936
verify predicts won't mess up order 1.0
generating test preds


In [0]:
valid_preds = pd.DataFrame(valid_preds_dict)
test_preds = pd.DataFrame(test_preds_dict)

In [0]:
valid_preds["majority_vote"] = valid_preds.apply(lambda row: row.value_counts().index[0].astype(int), axis=1)
test_preds["majority_vote"] = test_preds.apply(lambda row: row.value_counts().index[0].astype(int), axis=1)
valid_preds["pred_label"] = valid_preds["majority_vote"].apply(lambda x: label_vocab.i2w[x])
test_preds["pred_label"] = test_preds["majority_vote"].apply(lambda x: label_vocab.i2w[x])
valid_preds.to_csv("valid_predictions.csv", index=False)
test_preds.to_csv("test_predictions.csv", index=False)

In [48]:
valid_preds.head()

Unnamed: 0,model_0,model_1,model_2,model_3,model_4,model_5,model_6,model_7,majority_vote,pred_label
0,13,13,13,13,13,13,13,13,13,sports and recreation
1,13,13,13,13,13,13,13,13,13,sports and recreation
2,13,13,13,13,13,13,13,13,13,sports and recreation
3,7,7,7,7,7,7,7,7,7,media and drama
4,7,7,7,11,11,11,11,4,11,philosophy and religion


#### Prediction
---

In [49]:
print(accuracy(valid_preds["majority_vote"].values, valid_labels))

0.8413685847589425


In [0]:
with open("dev_results.txt", "w") as f:
    for s in label_vocab.map_index2words(valid_preds.majority_vote.values):
        f.write(s + '\n')

In [0]:
with open("test_results.txt", "w") as f:
    for s in label_vocab.map_index2words(test_preds.majority_vote.values):
        f.write(s + '\n')