In [14]:
import torch
import pandas as pd
import os
import pickle
from torch.utils.data import Dataset, DataLoader
import numpy as np

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


In [2]:
# Load spaCy's small English pipeline, used here as a tokenizer.
# The tagger, NER and parser components are disabled; VectorizeData only
# reads each token's `.text`, so those components are not needed.
import spacy
nlp = spacy.load('en_core_web_sm', disable=['tagger', 'ner', 'parser'])

In [3]:
# Directory (relative to this notebook) that holds train.csv and the pickled
# word2idx / idx2word vocabulary maps read further down.
dataDir = '../Data'

In [4]:
# Sanity check: show which files are present in the data directory.
os.listdir(dataDir)

['idx2word.pkl', 'test.csv', 'train.csv', 'aclImdb', 'word2idx.pkl']

In [5]:
# Column labels applied to the CSV on load (passed as `names=` to the reader).
columns = ['Index', 'Sentence', 'Rating', 'isPositive', 'SentVec']


In [6]:
# Load the training split, labelling its columns with `columns`.
df = pd.read_table(f'{dataDir}/train.csv', names=columns)

# Shuffle the rows and rebuild a 0..n-1 index. `sample` returns a new frame
# (nothing is modified in place); the fixed random_state keeps the row order
# identical across kernel restarts so downstream outputs are reproducible.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)


In [7]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,Index,Sentence,Rating,isPositive,SentVec
0,10179,This movie is a disgrace to the Major League F...,1,0,"[11, 17, 7, 4, 6049, 6, 2, 668, 2744, 3158, 9,..."
1,52,This movie is incredible If you have the chanc...,10,1,"[11, 17, 7, 1038, 48, 21, 27, 2, 571, 108, 10,..."
2,11126,Of all the movies Ive seen this one rates almo...,2,0,"[5, 31, 2, 98, 9, 143, 112, 11, 29, 5145, 218,..."
3,5257,For a made for TV horror movie the movie start...,3,0,"[15, 4, 96, 15, 237, 190, 17, 2, 17, 638, 126,..."
4,1607,This documentary is a proof of talent being us...,1,0,"[11, 651, 7, 4, 3154, 5, 666, 113, 339, 15, 38..."
5,2769,This film has all the size and grandeur of man...,10,1,"[11, 19, 46, 31, 2, 3559, 3, 11117, 5, 111, 5,..."
6,8057,In the mid 1930s Hollywood was regaining its c...,10,1,"[8, 2, 1664, 2587, 358, 13, 43560, 30, 4440, 1..."
7,5027,I gave this movie a 10 because it needed to be...,10,1,"[9, 510, 11, 17, 4, 161, 92, 10, 873, 6, 28, 7..."
8,3148,Essentially a story of man versus nature this ...,4,0,"[1991, 4, 66, 5, 132, 3999, 859, 11, 19, 46, 3..."
9,9911,Rosalind Russell executes a power house perfor...,10,1,"[20708, 2644, 17798, 4, 654, 302, 236, 14, 108..."


In [8]:
# Print one example review, then its sentiment label on the next line.
loc = 5
print(f"{df.iloc[loc]['Sentence']}\n{df.iloc[loc]['isPositive']}")

This film has all the size and grandeur of many of the great biblical epics of the 1950s and 60s But it is also perhaps the first that really humanizes the biblical characters themselves The best thing about it is that it does not diminish them in the eyes of the viewer This is a unique and compelling balance that helps us to realize that even great people like David are flawed people who find their faith and greatness in facing their flaws The actors are all first rate in the film from Gilbert Barnett as Davids second son Absolom through to the wonderful Susan Hayward as Bathsheba Hayward is at her best in this film Her own truthful but larger than life style of acting is quite at home here She is ever the seductress but she plays the role in such a way that you sympathize with her Raymond Massey does a great job as Nathan the prophet As a child when I first saw the film Massey seemed like he truly had just conversed with the Lord himself and was an awesome sight No doubt helped also 

In [9]:
print(dataDir)

# Load the word->index and index->word vocabulary maps. The context managers
# close each file handle as soon as it has been read.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(f'{dataDir}/word2idx.pkl', 'rb') as fh:
    word2idx = pickle.load(fh)
with open(f'{dataDir}/idx2word.pkl', 'rb') as fh:
    idx2word = pickle.load(fh)

../Data


In [10]:

class VectorizeData(Dataset):
    """Dataset of fixed-length token-index vectors built from review text.

    Each entry of ``df.Sentence`` is tokenized with the module-level ``nlp``
    pipeline, every lower-cased token is looked up in the module-level
    ``word2idx`` mapping, and the resulting index list is zero-padded or
    truncated to ``maxlen`` entries.

    Side effect: the padded vectors are also written to a new
    ``'sentimentpadded'`` column on the caller's ``df`` (no copy is made).

    Args:
        df: frame with a ``Sentence`` column (text) and an ``isPositive``
            column (label).
        maxlen: length every vector is padded or truncated to.

    Raises:
        KeyError: if a token is not present in ``word2idx``.
    """

    def __init__(self, df, maxlen=100):
        self.maxlen = maxlen
        self.df = df
        vectors = []
        lengths = []
        for s in df.Sentence:
            vector = [word2idx[w.text.lower()] for w in nlp(s)]
            # Record how many real (non-padding) tokens remain after
            # truncation; measuring the padded vector would always give maxlen.
            lengths.append(min(len(vector), self.maxlen))
            vectors.append(self.pad_data(vector))
        print('padded..')
        df['sentimentpadded'] = vectors
        self.lengths = lengths

    def __len__(self):
        """Return the number of rows (samples) in the underlying frame."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return ``(padded_vector, label, true_length)`` for row position ``idx``."""
        # Positional lookup so the dataset also works when the frame's index
        # is not a clean 0..n-1 range.
        X = self.df['sentimentpadded'].iloc[idx]
        y = self.df['isPositive'].iloc[idx]
        lens = self.lengths[idx]
        return X, y, lens

    def pad_data(self, s):
        """Return ``s`` as an int64 array of length ``maxlen``.

        Sequences longer than ``maxlen`` are cut off; shorter ones are
        right-padded with zeros.
        """
        padded = np.zeros((self.maxlen,), dtype=np.int64)
        n = min(len(s), self.maxlen)
        padded[:n] = s[:n]
        return padded

In [11]:
# Build the dataset with every review padded/truncated to 300 token indices.
# Constructing it also adds a 'sentimentpadded' column to `df`.
ds = VectorizeData(df, 300)

padded..


In [13]:
# Wrap the dataset in a loader that yields three samples per batch and
# report how many batches that produces.
dl = DataLoader(ds, batch_size=3)
print(len(dl))
# 8334

# Pull the first batch and inspect its container type, shape and lengths.
it = iter(dl)
xs, ys, lens = next(it)
print(type(xs), xs.shape, lens, sep='\n')

8334
<class 'torch.Tensor'>
torch.Size([3, 300])
tensor([300, 300, 300])


In [80]:
# Advance the same iterator to the following batch and show its contents.
xs, ys, lens = next(it)
print(type(xs), xs, lens, sep='\n')

<class 'torch.Tensor'>
tensor([[   11,    17,     7,   521,   281,     8,    10,    46,    82,   222,
           160,    40,     7,   162,   204,     9,    64,    23,   741,    54,
           834,    44,    22,   215,    58,    16,    72,   205,  1741,     4,
           246,   165,     4,   645,    20,    41,  4846,  3355,     3,  5207,
            58,    14,    35,  1117,  7961,   929,     4,   175,    39,     4,
            17,     8,     2,   823,   439,   195,    49,     7,    62,    53,
           611,     7,  2099, 22241,   117,    56,    77,    22,   502,    39,
            56,     7,  2023,    56,    43,  1404,    39,    56,     7,   377,
            37,    97,    20,    21,    64,   478,     2,   223,   114,    56,
           211,     4,    51,   288,     3,    10,    43,    38,   565,    12,
             4,   462,    56,    70,    32,  2023,   490,    16,    41,  1375,
            93,  1184,    18, 53675,   462,     7,    84,     3,   499,    20,
            41,   265,     6,

In [18]:
class SimpleLSTM(nn.Module):
    """Embedding -> LSTM -> linear classifier over the final hidden state.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        n_hidden: size of the LSTM hidden state.
        n_out: number of output classes. The output is a log-softmax over
            this dimension, so ``n_out=1`` would always yield 0.
    """

    def __init__(self, vocab_size, embedding_dim, n_hidden, n_out):
        super().__init__()
        self.vocab_size, self.embedding_dim, self.n_hidden, self.n_out = vocab_size, embedding_dim, n_hidden, n_out
        self.emb = nn.Embedding(self.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(self.embedding_dim, self.n_hidden)
        self.out = nn.Linear(self.n_hidden, self.n_out)

    def forward(self, seq, length):
        """Return per-class log-probabilities for a batch of token sequences.

        Args:
            seq: integer tensor of token indices. The LSTM is created with its
                default (time-major) layout, so this is expected to be shaped
                (seq_len, batch); a (batch, seq_len) batch from a DataLoader
                must be transposed by the caller first.
            length: accepted for interface compatibility; currently unused.

        Returns:
            Tensor of shape (batch, n_out) holding log-softmax scores.
        """
        embs = self.emb(seq)
        # nn.LSTM returns (output, (h_n, c_n)). Classify from the last layer's
        # final hidden state h_n[-1], not from the cell state c_n.
        _, (h_n, _) = self.lstm(embs)

        outp = self.out(h_n[-1])
        return F.log_softmax(outp, dim=-1)
    


In [19]:
# Embedding rows = vocabulary size.
# NOTE(review): assumes every index in word2idx lies in [0, len(word2idx)) — confirm.
INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
# Two classes (isPositive is 0 or 1). The model ends in a log-softmax over
# this dimension, so a single output unit would always produce log(1) = 0.
OUTPUT_DIM = 2

model = SimpleLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)