-- use a better tokenization

-- you may need to change some of the parameters (for example sentence length)

In [1]:
import numpy as np
import os
from collections import defaultdict
import re
import string
import spacy
from spacy.symbols import ORTH
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
! head aclImdb/train/pos/0_9.txt

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!

In [3]:
train_pos = os.listdir("aclImdb/train/pos/")
train_neg = os.listdir("aclImdb/train/neg/")
test_pos = os.listdir("aclImdb/test/pos/")
test_neg = os.listdir("aclImdb/test/neg/")

In [4]:
def read_contents(dir_path, file_list):
    content_list = []
    for f in file_list:
        with open(os.path.join(dir_path, f), "r") as file:
            content = file.read()
            content_list.append(content)
    return content_list

In [5]:
train_pos_content = read_contents("aclImdb/train/pos/", train_pos)
train_neg_content = read_contents("aclImdb/train/neg/", train_neg)
test_pos_content = read_contents("aclImdb/test/pos/", test_pos)
test_neg_content = read_contents("aclImdb/test/neg/", test_neg)

In [6]:
train_content = train_pos_content + train_neg_content
test_content = test_pos_content + test_neg_content

In [7]:
train_pos = pd.DataFrame({"content": train_pos_content,
                         "pos_neg": 1})
train_neg = pd.DataFrame({"content": train_neg_content,
                         "pos_neg": 0})

In [8]:
train = train_pos.append(train_neg)

In [9]:
test_pos = pd.DataFrame({"content": test_pos_content,
                         "pos_neg": 1})
test_neg = pd.DataFrame({"content": test_neg_content,
                         "pos_neg": 0})
test = test_pos.append(test_neg)

In [10]:
print(train.shape)
print(test.shape)

(25000, 2)
(25000, 2)


### String cleaning functions

In [11]:
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
def sub_br(x): 
    return re_br.sub("\n", x)

def remove_punctuations(x):
    for c in string.punctuation:
        x = x.replace(c,"")
    return x

my_tok = spacy.load("en")
def spacy_tok(x): 
    return [tok.text for tok in my_tok.tokenizer(sub_br(remove_punctuations(x)))]

In [12]:
def get_non_stopwords(content):
    """Returns a list of non-stopwords"""
    return {x:1 for x in spacy_tok(str(content).lower()) if x not in stops}

In [13]:
def get_vocab(list_of_content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = get_non_stopwords(list_of_content)
    words = [x for x in spacy_tok(str(list_of_content).lower())]
    for w in words:
        if w in vocab:
            vocab[w] += 1
    return vocab


In [14]:
data_vocab = get_vocab(train_content)

### Glove embeddings

In [15]:
def loadGloveModel(gloveFile="glove.6B.300d.txt"):
    """ Loads word vectors into a dictionary."""
    f = open(gloveFile,'r')
    word_vecs = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        word_vecs[word] = np.array([float(val) for val in splitLine[1:]])
    return word_vecs

In [16]:
embeddings = loadGloveModel()

In [17]:
len(embeddings.keys())

400000

In [18]:
def delete_rare_words(embeddings, data_vocab, min_df=2):
    words_delete = []
    for word in data_vocab:
        if data_vocab[word] < min_df and word not in embeddings:
            words_delete.append(word)
#     print(words_delete)
    for word in words_delete: data_vocab.pop(word)
    return data_vocab

In [19]:
data_vocab = delete_rare_words(embeddings, data_vocab, min_df=2)

In [20]:
def create_embedding_matrix(embeddings, data_vocab, min_df=2, D=300):
    """Creates embedding matrix from word vectors. """
    data_vocab = delete_rare_words(embeddings, data_vocab, min_df)
    V = len(data_vocab.keys()) + 2
    vocab2index = {}
    W = np.zeros((V, D), dtype="float32")
    vocab = ["", "UNK"]
    # adding a vector for padding
    W[0] = np.zeros(D, dtype='float32')
    # adding a vector for rare words 
    W[1] = np.random.uniform(-0.25,0.25,D)
    vocab2index["UNK"] = 1
    i = 2
    for word in data_vocab:
        if word in embeddings:
            W[i] = embeddings[word]
            vocab2index[word] = i
            vocab.append(word)
            i += 1
        else:
            W[i] = np.random.uniform(-0.25,0.25,D)
            vocab2index[word] = i
            vocab.append(word)
            i += 1   
    return W, np.array(vocab), vocab2index

In [21]:
pretrained_weight, vocab, vocab2index = create_embedding_matrix(embeddings, data_vocab)

In [22]:
len(pretrained_weight)

121499

In [23]:
D = 300
V = len(pretrained_weight)
emb = nn.Embedding(V, D)
emb.weight.data.copy_(torch.from_numpy(pretrained_weight))

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0556,  0.2216, -0.2337,  ..., -0.0201,  0.1580, -0.1758],
        [ 0.1325, -0.3820,  0.0549,  ..., -0.2868,  0.2870,  0.3807],
        ...,
        [ 0.4018, -0.2759, -0.1173,  ..., -0.2185,  0.0948, -0.1024],
        [-0.0162, -0.5028,  0.2609,  ...,  0.1186,  0.2515, -0.0861],
        [-0.0203, -0.0016,  0.1310,  ...,  0.1015,  0.0639,  0.0967]])

In [24]:
def encode_sentence(s, N=40):
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([vocab2index.get(w, vocab2index["UNK"]) for w in s.split()])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc

In [25]:
x_train = np.stack([encode_sentence(x) for x in train.content.values])
x_train.shape

(25000, 40)

In [26]:
x_test = np.stack([encode_sentence(x) for x in test.content.values])
x_test.shape

(25000, 40)

In [27]:
class SentenceCNN(nn.Module):
    
    def __init__(self, V, D, glove_weights):
        super(SentenceCNN, self).__init__()
        self.glove_weights = glove_weights
        self.embedding = nn.Embedding(V, D, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(self.glove_weights))
        self.embedding.weight.requires_grad = False ## freeze embeddings

        self.conv_3 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=3)
        self.conv_4 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=4)
        self.conv_5 = nn.Conv1d(in_channels=D, out_channels=100, kernel_size=5)
        
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(300, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        x = x.transpose(1,2)
        x3 = F.relu(self.conv_3(x))
        x4 = F.relu(self.conv_4(x))
        x5 = F.relu(self.conv_5(x))
        x3 = nn.MaxPool1d(kernel_size = 38)(x3)
        x4 = nn.MaxPool1d(kernel_size = 37)(x4)
        x5 = nn.MaxPool1d(kernel_size = 36)(x5)
        out = torch.cat([x3, x4, x5], 2)
        out = out.view(out.size(0), -1)
        out = self.dropout(out)
        return self.fc(out)

In [67]:
V = len(pretrained_weight)
D = 300
N = 100
model = SentenceCNN(V, D, glove_weights=pretrained_weight).cuda()

### Training

In [68]:
y_train = train.pos_neg.values.astype(np.float32)
y_test = test.pos_neg.values.astype(np.float32)

In [69]:
class WiDSDataset(Dataset):
    def __init__(self, x, y):
        x = x.copy()
        self.x = x.copy().astype(np.int64)
        self.y = y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return [self.x[idx], self.y[idx]]    

In [70]:
train_ds = WiDSDataset(x_train, y_train)
test_ds = WiDSDataset(x_test, y_test)

In [71]:
batch_size = 1000
train_dl = DataLoader(train_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

In [72]:
def get_optimizer(model, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optimizer

In [73]:
def train_model(model, optimizer, train_dl=train_dl, verbose=False):
    model.train()
    total = 0
    sum_loss = 0
    for i, (x, y) in enumerate(train_dl):
        batch = y.shape[0]
        x = torch.LongTensor(x).cuda()
        y = torch.Tensor(y).unsqueeze(1).cuda()
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total += batch
        sum_loss += batch * (loss.item())
        if verbose: 
            print(sum_loss/total)
    return sum_loss/total

In [74]:
def test_loss(model, test_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    accuracy = 0
    for i, (x, y) in enumerate(test_dl):
        batch = y.shape[0]
        x = torch.LongTensor(x).cuda()
        y = torch.Tensor(y).unsqueeze(1).cuda()
        y_hat = model(x)
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        sum_loss += batch*(loss.item())
        total += batch
        pred = (y_hat > 0).float()
        correct += (pred == y).float().sum().item()
        accuracy += correct/pred.shape[0]
    print("test loss and accuracy", sum_loss/total, accuracy/total)
    return sum_loss/total, accuracy/total

In [75]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr=lr, wd=wd)
    for i in range(epochs):
        loss = train_model(model, optim, train_dl)
        print("loss", loss)
        test_loss(model, test_dl)

In [76]:
train_loop(model, epochs=10, lr=0.01)

loss 8.831651431482427
test loss and accuracy 0.6948733234405517 0.00338464
loss 0.702161238193512
test loss and accuracy 0.6957425785064697 0.00338324
loss 0.7027262687683106
test loss and accuracy 0.6952866411209107 0.003398
loss 0.6986085176467896
test loss and accuracy 0.6947899055480957 0.0034235199999999998
loss 0.6969630908966065
test loss and accuracy 0.6944407200813294 0.00346036
loss 0.6949128079414367
test loss and accuracy 0.6937753248214722 0.00348728
loss 0.6934193348884583
test loss and accuracy 0.6928571820259094 0.0035606000000000006
loss 0.6900584936141968
test loss and accuracy 0.6905372643470764 0.0035962800000000003
loss 0.6947833442687988
test loss and accuracy 0.6894394063949585 0.0036276
loss 0.6867070269584655
test loss and accuracy 0.6881621956825257 0.00363664


In [77]:
train_loop(model, epochs=10, lr=0.005)

loss 0.7933547520637512
test loss and accuracy 0.6943894982337951 0.00855564
loss 0.6793985319137573
test loss and accuracy 0.6870004105567932 0.004048160000000001
loss 0.6667534232139587
test loss and accuracy 0.6876257109642029 0.00399256
loss 0.6726556468009949
test loss and accuracy 0.688574333190918 0.0043378
loss 0.656980881690979
test loss and accuracy 0.691621994972229 0.004226400000000001
loss 0.662782793045044
test loss and accuracy 0.69141667842865 0.00435188
loss 0.6409912300109863
test loss and accuracy 0.6912966108322144 0.0042403200000000005
loss 0.6514074206352234
test loss and accuracy 0.6970428490638733 0.0043565999999999995
loss 0.6712064123153687
test loss and accuracy 0.7102455520629882 0.00408972
loss 0.6411055088043213
test loss and accuracy 0.921364893913269 0.003679639999999999


In [78]:
train_loop(model, epochs=10, lr=0.001)

loss 0.8706991934776306
test loss and accuracy 0.7442146515846253 0.00587988
loss 0.693601312637329
test loss and accuracy 0.7111865544319153 0.007054760000000001
loss 0.6328119778633118
test loss and accuracy 0.7007159185409546 0.00786776
loss 0.6035014247894287
test loss and accuracy 0.695798864364624 0.00824464
loss 0.5871456170082092
test loss and accuracy 0.6939116406440735 0.00850172
loss 0.581382486820221
test loss and accuracy 0.6928787350654602 0.00868556
loss 0.5702439308166504
test loss and accuracy 0.6939883470535279 0.00876876
loss 0.5624044585227966
test loss and accuracy 0.6940492343902588 0.008844520000000002
loss 0.5531093716621399
test loss and accuracy 0.695306568145752 0.00885936
loss 0.5504364442825317
test loss and accuracy 0.6957342028617859 0.008918639999999999
