In [1]:
import torch
import torch.nn as nn
import string  


from gensim.models import KeyedVectors
from torch.nn import Dropout
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk import word_tokenize
from collections import Counter
from itertools import chain



In [2]:
glove2word2vec(glove_input_file="/Users/macbook/Downloads/glove.840B.300d 2.txt", word2vec_output_file="gensim_glove_vectors.txt")

(2196018, 300)

In [3]:
glove_model = KeyedVectors.load_word2vec_format("/Users/macbook/PycharmProjects/nlp-course/workshops/gensim_glove_vectors.txt", binary=False, limit = 200000)



In [4]:
w2vmodel = KeyedVectors.load_word2vec_format ( "/Users/macbook/Downloads/GoogleNews-vectors-negative300.bin" , binary = True , limit = 200000)


In [5]:
reviews_train = []
for line in open("/Users/macbook/aclImdb/movie_data/full_train.txt", "r"):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open("/Users/macbook/aclImdb/movie_data/full_test.txt", "r"):
    reviews_test.append(line.strip())

In [6]:

train_target = [1 if i < 12500 else 0 for i in range(25000)]
test_target = [1 if i < 12500 else 0 for i in range(25000)]

In [7]:
#train_tokens = list(chain(*[[token for token in sample.lower().split()] for sample in reviews_train]))

In [8]:
def clean_token(inp_token):
    for mark in string.punctuation:
        inp_token = inp_token.replace(mark, "")
    return inp_token

In [9]:
train_tokens_new = list(chain(*[[clean_token(token) for token in word_tokenize(sample.lower())] for sample in reviews_train]))

In [10]:
train_vocabulary = Counter(train_tokens_new)

In [11]:
UNKNOWN_TOKEN = "unknown"
PAD_TOKEN = "PAD"

index_to_token = [UNKNOWN_TOKEN] + [PAD_TOKEN] + list(train_vocabulary.keys())
token_to_index = {token: index + 2 for index, token in enumerate(train_vocabulary.keys())}
token_to_index[UNKNOWN_TOKEN] = 0
token_to_index[PAD_TOKEN] = 1

In [15]:
oov = Counter([token for token in train_tokens_new if token not in w2vmodel])
print(len(oov))
oov.most_common(10)


60588


[('', 1409698),
 ('and', 222311),
 ('a', 216896),
 ('of', 196057),
 ('to', 178852),
 ('10', 2889),
 ('\x96', 1809),
 ('robert', 1328),
 ('richard', 1182),
 ('george', 1180)]

In [16]:
EMBEDDING_DIM = w2vmodel.vector_size
BATCH_SIZE = 64
MAX_INPUT_LENGTH = 100


In [17]:
def generate_batch(input_data, max_length = MAX_INPUT_LENGTH):
    texts = torch.tensor([padding(sample["text"], max_length) for sample in input_data], dtype=torch.long)
    labels = torch.tensor([sample["label"] for sample in input_data], dtype=torch.long)
    return texts, labels

def padding(text_tokens, max_length, padding_token = 0):
    if len(text_tokens) >= max_length:
        return text_tokens[:max_length]
    return text_tokens + [padding_token]*(max_length - len(text_tokens))

In [18]:
prepared_data = []

for label, text in zip(train_target, reviews_train):
    text_tokens = [token_to_index[clean_token(token)] for token in word_tokenize(text.lower())]
    prepared_data.append({"label": label, "text": text_tokens})

In [19]:
class SentimentClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.FloatTensor(w2vmodel.vectors))
        self.fc1 = nn.Linear(embed_dim, embed_dim)
        self.fc2 = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.fc1.weight.data.uniform_(-init_range, init_range)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-init_range, init_range)
        self.fc2.bias.data.zero_()

    def forward(self, text):
        embedded = self.embedding(text)
        Dropout(0.2)
        fc1_output = self.fc1(embedded)
        pooled_output, _ = fc1_output.max(dim=1)
        return self.fc2(pooled_output)

In [20]:
from torch.utils.data import DataLoader

def train(input_data):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(input_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    for i, (text, label) in enumerate(data):
        optimizer.zero_grad()
        text, label = text.to(device), label.to(device)
        output = model(text)
        loss = criterion(output, label)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == label).sum().item()

    return train_loss / len(input_data), train_acc / len(input_data)

def test(input_data):
    loss = 0
    acc = 0
    data = DataLoader(input_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, label in data:
        text, label = text.to(device), label.to(device)
        with torch.no_grad():
            output = model(text)
            loss = criterion(output, label)
            loss += loss.item()
            acc += (output.argmax(1) == label).sum().item()

    return loss / len(input_data), acc / len(input_data)

In [21]:
import random
import numpy

def set_seed(seed: int, n_gpu: int):
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)
        
set_seed(42, 1)


In [22]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [23]:
model = SentimentClassificationModel(vocab_size=len(index_to_token), embed_dim=EMBEDDING_DIM, num_class=len(set(train_target))).to(device)


In [29]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.07)

train_len = int(len(prepared_data) * 0.95)
train_data, validation_data = \
    random_split(prepared_data, [train_len, len(prepared_data) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(train_data)
    valid_loss, valid_acc = test(validation_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')


Epoch: 1  | time in 0 minutes, 18 seconds
	Loss: 0.0088(train)	|	Acc: 70.2%(train)
	Loss: 0.0010(valid)	|	Acc: 66.2%(valid)
Epoch: 2  | time in 0 minutes, 18 seconds
	Loss: 0.0086(train)	|	Acc: 70.9%(train)
	Loss: 0.0011(valid)	|	Acc: 67.9%(valid)
Epoch: 3  | time in 0 minutes, 22 seconds
	Loss: 0.0084(train)	|	Acc: 71.9%(train)
	Loss: 0.0010(valid)	|	Acc: 71.6%(valid)
Epoch: 4  | time in 0 minutes, 27 seconds
	Loss: 0.0083(train)	|	Acc: 72.8%(train)
	Loss: 0.0010(valid)	|	Acc: 71.5%(valid)
Epoch: 5  | time in 0 minutes, 20 seconds
	Loss: 0.0081(train)	|	Acc: 73.4%(train)
	Loss: 0.0010(valid)	|	Acc: 70.8%(valid)


In [25]:
prepared_test_data = []

for label, text in zip(test_target, reviews_test):
    text_tokens = [token_to_index.get(clean_token(token), 0) for token in word_tokenize(text.lower())]
    prepared_test_data.append({"label": label, "text": text_tokens})

In [30]:
test(prepared_test_data)

(tensor(3.9672e-05), 0.61976)