In [1]:
import os
import sklearn
import spacy
import string
import random
import torch
import pickle as pkl
import numpy as np
import collections
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from sklearn.model_selection import train_test_split

In [126]:
# Hyper-parameters
TOKEN_MODE = 'Plain'
NGRAM = 1
MAX_VOCAB_SIZE = 300000  # Set maximum vocabulary size
LEARNING_RATE = 0.01
NUM_EPOCHS = 3
EMBED_DIM = 100
OPTIMIZER = 'ADAM'
LAERNING_ANNEALING = 'YES'

In [3]:
# Initialize path to the directory where the files are stored
path_train_pos = '/train/pos'
path_train_neg = '/train/neg'
path_test_pos = '/test/pos'
path_test_neg = '/test/neg'
current_path = os.getcwd()

In [4]:
# Load files
train_pos = []
train_neg = []
test_pos = []
test_neg = []
train_label_pos = []
train_label_neg = []
test_label_pos = []
test_label_neg = []
os.chdir(current_path+path_train_pos)
for file in os.listdir():
    rate = file.strip('.txt').split('_')[-1]
    fin = open(file, 'r')
    train_pos.append(fin.read())
    train_label_pos.append(int(rate))
    fin.close()

os.chdir(current_path+path_train_neg)
for file in os.listdir():
    rate = file.strip('.txt').split('_')[-1]
    fin = open(file, 'r')
    train_neg.append(fin.read())
    train_label_neg.append(int(rate))
    fin.close()

os.chdir(current_path+path_test_pos)
for file in os.listdir():
    rate = file.strip('.txt').split('_')[-1]
    fin = open(file, 'r')
    test_pos.append(fin.read())
    test_label_pos.append(int(rate))
    fin.close()

os.chdir(current_path+path_test_neg)
for file in os.listdir():
    rate = file.strip('.txt').split('_')[-1]
    fin = open(file, 'r')
    test_neg.append(fin.read())
    test_label_neg.append(int(rate))
    fin.close()
    
# Check of the test has been correctly loaded
print('The length of the loaded files are train_pos {} train_neg {} test_pos {} test_neg {}'.format(len(train_pos),len(train_neg),len(test_pos),len(test_neg)))
print('The length of the loaded labels are train_label_pos {} train_label_neg {} test_label_pos {} test_label_neg {}'.format(len(train_label_pos),len(train_label_neg),len(test_label_pos),len(test_label_neg)))

The length of the loaded files are train_pos 12500 train_neg 12500 test_pos 12500 test_neg 12500
The length of the loaded labels are train_label_pos 12500 train_label_neg 12500 test_label_pos 12500 test_label_neg 12500


In [7]:
# Split the training set to training set and validation set
TRAIN_VAL_SPLIT = 10000
val_pos = []
val_neg = []
val_label_pos = []
val_label_neg = []
val_pos = train_pos[TRAIN_VAL_SPLIT:]
val_label_pos = train_label_pos[TRAIN_VAL_SPLIT:]
train_pos = train_pos[:TRAIN_VAL_SPLIT]
train_label_pos = train_label_pos[:TRAIN_VAL_SPLIT]

val_neg = train_neg[TRAIN_VAL_SPLIT:]
val_label_neg = train_label_neg[TRAIN_VAL_SPLIT:]
train_neg = train_neg[:TRAIN_VAL_SPLIT]
train_label_neg = train_label_neg[:TRAIN_VAL_SPLIT]

# Check the correctness of the split
print('The length of the loaded files are train_pos {} train_neg {} val_pos {} val_neg {}'.format(len(train_pos),len(train_neg),len(val_pos),len(val_neg)))
print('The length of the loaded files are train_label_pos {} train_label_neg {} val_label_pos {} val_label_neg {}'.format(len(train_label_pos),len(train_label_neg),len(val_label_pos),len(val_label_neg)))

The length of the loaded files are train_pos 10000 train_neg 10000 val_pos 2500 val_neg 2500
The length of the loaded files are train_label_pos 10000 train_label_neg 10000 val_label_pos 2500 val_label_neg 2500


In [9]:
# Combine positive and negative datasets
train_set = train_pos + train_neg
val_set = val_pos + val_neg
test_set = test_pos + test_neg

train_label = train_label_pos + train_label_neg
val_label = val_label_pos + val_label_neg
test_label = test_label_pos + test_label_neg

In [10]:
# Remove punctuations, stopwords and perform stemming
punctuations = string.punctuation
stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

def tokenize_sentence_advanced(inputStr, N):
    token_results = []
    tokens = word_tokenize(inputStr)
    tokens = [ps.stem(token.lower()) for token in tokens if (token not in punctuations and token not in stopwords)]
    if N == 1:
        return tokens
    else:
        counter = 0
        while counter < len(tokens)-N+1:
            temp = ''
            for i in range(N):
                temp += str(tokens[counter+i]) + ' '
            token_results.append(temp.strip(' '))
            counter += 1
        return token_results

In [11]:
# Tokenize all datasets for ngram
def tokenize_dataset_ngram(input_set, N):
    all_tokens = []
    set_tokens = []
    inner_counter = 1
    for review in input_set:
        token_temp = tokenize_sentence_advanced(review, N)
        all_tokens += token_temp
        set_tokens.append(token_temp)
        if inner_counter % 500 == 0:
            print('Finished tokenizing review {}'.format(inner_counter))
        inner_counter += 1
    return set_tokens, all_tokens

In [12]:
# Create Ngram advanced tokenization
for N in range(4):
    train_tokens, all_tokens = tokenize_dataset_ngram(train_set, N+1)
    val_tokens, _ = tokenize_dataset_ngram(val_set, N+1)
    test_tokens, _ = tokenize_dataset_ngram(test_set, N+1)
    os.chdir(current_path)
    pkl.dump(train_tokens, open("train_tokens_" + str(N+1) + '_full' + ".p", "wb"))
    pkl.dump(val_tokens, open("val_tokens_" + str(N+1) + '_full' + ".p", "wb"))
    pkl.dump(test_tokens, open("test_tokens_" + str(N+1) + '_full' + ".p", "wb"))
    pkl.dump(all_tokens, open("all_tokens_" + str(N+1) + '_full' + ".p", "wb"))
    print('\nFinished ' + str(N+1) + " gram\n")

Finished tokenizing review 500
Finished tokenizing review 1000
Finished tokenizing review 1500
Finished tokenizing review 2000
Finished tokenizing review 2500
Finished tokenizing review 3000
Finished tokenizing review 3500
Finished tokenizing review 4000
Finished tokenizing review 4500
Finished tokenizing review 5000
Finished tokenizing review 5500
Finished tokenizing review 6000
Finished tokenizing review 6500
Finished tokenizing review 7000
Finished tokenizing review 7500
Finished tokenizing review 8000
Finished tokenizing review 8500
Finished tokenizing review 9000
Finished tokenizing review 9500
Finished tokenizing review 10000
Finished tokenizing review 10500
Finished tokenizing review 11000
Finished tokenizing review 11500
Finished tokenizing review 12000
Finished tokenizing review 12500
Finished tokenizing review 13000
Finished tokenizing review 13500
Finished tokenizing review 14000
Finished tokenizing review 14500
Finished tokenizing review 15000
Finished tokenizing review 155

Finished tokenizing review 1500
Finished tokenizing review 2000
Finished tokenizing review 2500
Finished tokenizing review 3000
Finished tokenizing review 3500
Finished tokenizing review 4000
Finished tokenizing review 4500
Finished tokenizing review 5000
Finished tokenizing review 5500
Finished tokenizing review 6000
Finished tokenizing review 6500
Finished tokenizing review 7000
Finished tokenizing review 7500
Finished tokenizing review 8000
Finished tokenizing review 8500
Finished tokenizing review 9000
Finished tokenizing review 9500
Finished tokenizing review 10000
Finished tokenizing review 10500
Finished tokenizing review 11000
Finished tokenizing review 11500
Finished tokenizing review 12000
Finished tokenizing review 12500
Finished tokenizing review 13000
Finished tokenizing review 13500
Finished tokenizing review 14000
Finished tokenizing review 14500
Finished tokenizing review 15000
Finished tokenizing review 15500
Finished tokenizing review 16000
Finished tokenizing review 

In [14]:
# Extract stored N-Gram pickle and load into program
def extract_n_gram_tokens_full(N):
    os.chdir(current_path)
    train_tokens = pkl.load(open(current_path + "/train_tokens_" + str(N) + '_full' + ".p", "rb"))
    val_tokens = pkl.load(open(current_path + "/val_tokens_" + str(N) + '_full' + ".p", "rb"))
    test_tokens = pkl.load(open(current_path + "/test_tokens_" + str(N) + '_full' + ".p", "rb"))
    all_tokens = pkl.load(open(current_path + "/all_tokens_" + str(N) + '_full' + ".p", "rb"))
    return train_tokens, val_tokens, test_tokens, all_tokens

In [85]:
# Extract 1gram 
train_tokens, val_tokens, test_tokens, all_tokens = extract_n_gram_tokens_full(1)

In [19]:
# Extract 2gram
train_tokens, val_tokens, test_tokens, all_tokens = extract_n_gram_tokens_full(2)

In [15]:
# Extract 3gram
train_tokens, val_tokens, test_tokens, all_tokens = extract_n_gram_tokens_full(3)

In [None]:
# Extract 4gram
train_tokens, val_tokens, test_tokens, all_tokens = extract_n_gram_tokens_full(4)

In [86]:
Counter(all_tokens).most_common(30)

[('br', 81029),
 ('i', 65136),
 ("'s", 49454),
 ('movi', 39569),
 ('film', 37063),
 ('the', 36204),
 ("''", 26615),
 ("n't", 26558),
 ('``', 26402),
 ('one', 21423),
 ('like', 17638),
 ('it', 15315),
 ('time', 12214),
 ('thi', 11994),
 ('good', 11821),
 ('make', 11541),
 ('get', 11162),
 ('see', 11130),
 ('charact', 11012),
 ('watch', 10888),
 ('would', 10761),
 ('stori', 10159),
 ('even', 10103),
 ('...', 9679),
 ('realli', 9332),
 ('scene', 8099),
 ('show', 8014),
 ('well', 7902),
 ('look', 7783),
 ('much', 7752)]

#### Build Vocabulary

In [127]:
# Reserve special index for padding and unknown words
PAD_IDX = 0
UNK_IDX = 1

# The method to build vocabulary
def build_vocab(all_tokens):
    token_counter = Counter(all_tokens)
    tokens, frequency = zip(*token_counter.most_common(MAX_VOCAB_SIZE))
    id2token = list(tokens)
    token2id = dict(zip(tokens, range(2,2+len(tokens))))
    id2token = ['<pad>', '<unk>'] + id2token
    token2id['<pad>'] = PAD_IDX 
    token2id['<unk>'] = UNK_IDX
    return id2token, token2id

id2token, token2id = build_vocab(all_tokens) 

In [128]:
# Lets check the dictionary by loading random token from it

random_token_id = random.randint(0, len(id2token)-1)
random_token = id2token[random_token_id]

print ("Token id {} ; token {}".format(random_token_id, id2token[random_token_id]))
print ("Token {}; token id {}".format(random_token, token2id[random_token]))

Token id 6300 ; token gloss
Token gloss; token id 6300


In [129]:
# Convert datasets in token format to index format
def convert_token_index(set_token, token2id, id2token):
    set_idx = []
    count = 0
    for review in set_token:
        temp_list = []
        temp_list = [token2id[token] if token in token2id else UNK_IDX for token in review]
        set_idx.append(temp_list)
        count += 1
        if count % 200 == 0:
            print('finished review NO ' + str(count))
    return set_idx

# Check the train_idx 
train_idx = convert_token_index(train_tokens, token2id, id2token)
print('The length of train_idx is {}'.format(len(train_idx)))

finished review NO 200
finished review NO 400
finished review NO 600
finished review NO 800
finished review NO 1000
finished review NO 1200
finished review NO 1400
finished review NO 1600
finished review NO 1800
finished review NO 2000
finished review NO 2200
finished review NO 2400
finished review NO 2600
finished review NO 2800
finished review NO 3000
finished review NO 3200
finished review NO 3400
finished review NO 3600
finished review NO 3800
finished review NO 4000
finished review NO 4200
finished review NO 4400
finished review NO 4600
finished review NO 4800
finished review NO 5000
finished review NO 5200
finished review NO 5400
finished review NO 5600
finished review NO 5800
finished review NO 6000
finished review NO 6200
finished review NO 6400
finished review NO 6600
finished review NO 6800
finished review NO 7000
finished review NO 7200
finished review NO 7400
finished review NO 7600
finished review NO 7800
finished review NO 8000
finished review NO 8200
finished review NO 8

In [130]:
# Convert val_tokens and test_tokens to idx sets
val_idx = convert_token_index(val_tokens, token2id, id2token)
print('The length of val_idx is {}'.format(len(val_idx)))

test_idx = convert_token_index(test_tokens, token2id, id2token)
print('The length of test_idx is {}'.format(len(test_idx)))

finished review NO 200
finished review NO 400
finished review NO 600
finished review NO 800
finished review NO 1000
finished review NO 1200
finished review NO 1400
finished review NO 1600
finished review NO 1800
finished review NO 2000
finished review NO 2200
finished review NO 2400
finished review NO 2600
finished review NO 2800
finished review NO 3000
finished review NO 3200
finished review NO 3400
finished review NO 3600
finished review NO 3800
finished review NO 4000
finished review NO 4200
finished review NO 4400
finished review NO 4600
finished review NO 4800
finished review NO 5000
The length of val_idx is 5000
finished review NO 200
finished review NO 400
finished review NO 600
finished review NO 800
finished review NO 1000
finished review NO 1200
finished review NO 1400
finished review NO 1600
finished review NO 1800
finished review NO 2000
finished review NO 2200
finished review NO 2400
finished review NO 2600
finished review NO 2800
finished review NO 3000
finished review NO

#### Model Establishment

In [170]:
# Find out average sentence lengths
total = 0
for review in train_idx:
    total += len(review)
print(float(total) / len(train_idx))

143.1021


In [220]:
# Set up maximum sentence length for the use of standardizing matirx format later in the program
MAX_SENTENCE_LENGTH = 500

# Set up local class of dataset to use in this program 
from torch.utils.data import Dataset

class localDataset(Dataset):
    
    def __init__(self, data_idx, label_list):
        self.data_idx = data_idx
        self.label_list = label_list
        assert(len(self.data_idx) == len(self.label_list))
    def __len__(self):
        return len(self.data_idx)
    def __getitem__(self, key):
        review_idx = self.data_idx[key][:MAX_SENTENCE_LENGTH]
        label = self.label_list[key]
        return [review_idx, len(review_idx), label]

def collate_fn(batch):
    idx_list = []
    label_list = []
    length_list = []
    for review in batch:
        label_list.append(review[2])
        length_list.append(review[1])
        padded_review = np.pad(np.array(review[0]),pad_width=((0,MAX_SENTENCE_LENGTH-review[1])),mode="constant", constant_values=0)
        idx_list.append(padded_review)
    return [torch.from_numpy(np.array(idx_list)), torch.LongTensor(length_list), torch.LongTensor(label_list)]

In [221]:
# Create local dataset
train_dataset = localDataset(train_idx, train_label)
val_dataset = localDataset(val_idx, val_label)
test_dataset = localDataset(test_idx, test_label)

In [222]:
# Create data loader
BATCH_SIZE = 64
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_fn,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_fn,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_fn,
                                           shuffle=True)

In [223]:
CLASSIFIER_MODE = 'score'
# Establish the BagOfWords Model
import torch.nn as nn
import torch.nn.functional as F

class BagOfWords(nn.Module):
    def __init__(self, review_length, embed_dim):
        super(BagOfWords, self).__init__()
        self.embedding = nn.Embedding(review_length, embed_dim, padding_idx = 0)
        self.linearReg1 = nn.Linear(embed_dim, 50)
        self.linearReg2 = nn.Linear(50, 11)
    def forward(self, idx_list, length_list):
        embed_result_words = self.embedding(idx_list)
        embed_result_review = torch.sum(embed_result_words, dim = 1)
        embed_result = embed_result_review / length_list.view(length_list.size()[0],1).expand_as(embed_result_review).float()
        
        result = F.relu(self.linearReg1(embed_result.float()))
        result = self.linearReg2(result.float())
        return result

##### Without Learning Rate Annealing 

In [95]:
model = BagOfWords(len(id2token), EMBED_DIM)

In [96]:
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
if OPTIMIZER == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)
    
# Function for testing the model
def test_model(loader, model):
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(NUM_EPOCHS):
    for i, (data, lengths, labels) in enumerate(train_loader):
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, NUM_EPOCHS, i+1, len(train_loader), val_acc))

final_accuracy = test_model(val_loader, model)
print('Final Validation Acc: {}'.format( val_acc))

Epoch: [1/3], Step: [101/313], Validation Acc: 36.54
Epoch: [1/3], Step: [201/313], Validation Acc: 38.32
Epoch: [1/3], Step: [301/313], Validation Acc: 39.18
Epoch: [2/3], Step: [101/313], Validation Acc: 38.38
Epoch: [2/3], Step: [201/313], Validation Acc: 38.24
Epoch: [2/3], Step: [301/313], Validation Acc: 37.74
Epoch: [3/3], Step: [101/313], Validation Acc: 36.96
Epoch: [3/3], Step: [201/313], Validation Acc: 33.02
Epoch: [3/3], Step: [301/313], Validation Acc: 36.14
Final Validation Acc: 36.14


##### With Learning Rate Annealing 

In [224]:
model = BagOfWords(len(id2token), EMBED_DIM)

In [226]:
from torch.optim.lr_scheduler import LambdaLR
lambda1 = lambda epoch: 0.3**epoch
# Criterion and Optimizer
criterion = torch.nn.CrossEntropyLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
if OPTIMIZER == 'SGD':
    optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

    
scheduler = LambdaLR(optimizer, lr_lambda=[lambda1])
# Function for testing the model
def test_model(loader, model):
    correct = 0
    total = 0
    model.eval()
    for data, lengths, labels in loader:
        data_batch, length_batch, label_batch = data, lengths, labels
        outputs = F.softmax(model(data_batch, length_batch), dim=1)
        predicted = outputs.max(1, keepdim=True)[1]
        
        total += labels.size(0)
        correct += predicted.eq(labels.view_as(predicted)).sum().item()
    return (100 * correct / total)

for epoch in range(NUM_EPOCHS):
    scheduler.step()
    for i, (data, lengths, labels) in enumerate(train_loader):
        #scheduler.step()
        model.train()
        data_batch, length_batch, label_batch = data, lengths, labels
        optimizer.zero_grad()
        outputs = model(data_batch, length_batch)
        loss = criterion(outputs, label_batch)
        loss.backward()
        optimizer.step()
        # validate every 100 iterations
        if i > 0 and i % 100 == 0:
            # validate
            val_acc = test_model(val_loader, model)
            print('Epoch: [{}/{}], Step: [{}/{}], Validation Acc: {}'.format( 
                       epoch+1, NUM_EPOCHS, i+1, len(train_loader), val_acc))

final_accuracy = test_model(val_loader, model)
print('Final Validation Acc: {}'.format(final_accuracy))

Epoch: [1/3], Step: [101/313], Validation Acc: 37.64
Epoch: [1/3], Step: [201/313], Validation Acc: 37.48
Epoch: [1/3], Step: [301/313], Validation Acc: 37.06
Epoch: [2/3], Step: [101/313], Validation Acc: 36.5
Epoch: [2/3], Step: [201/313], Validation Acc: 36.18
Epoch: [2/3], Step: [301/313], Validation Acc: 35.78


KeyboardInterrupt: 