In [1]:
import os
import spacy
import json
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import gensim
from gensim import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
nlp = spacy.load("en")
nlp.max_length = 3100000

In [2]:
negList = []
for file in os.listdir("./neg"):
    if ".txt" in file:
        f = open("./neg/"+file)
        negList.append(f.read())
print("finished loading negative data...")

finished loading negative data...


In [3]:
posList = []
for file in os.listdir("./pos"):
    if ".txt" in file:
        f = open("./pos/"+file)
        posList.append(f.read())
print("finished loading positive data...")

finished loading positive data...


In [81]:
negTokenList = []
wordDict = {}
for review in negList:
    doc = nlp(review)
    reviewTokenized = ""
    for token in doc:
        if token.pos_ != "PUNCT" and token.pos_ != "SPACE" and token.pos_ != "SYM":
            word = token.text.lower()
            reviewTokenized += " " + word
            count = wordDict.get(word,0)
            wordDict[word] = count+1
    negTokenList.append(reviewTokenized[1:])

In [82]:
posTokenList = []
for review in posList:
    doc = nlp(review)
    reviewTokenized = ""
    for token in doc:
        if token.pos_ != "PUNCT" and token.pos_ != "SPACE" and token.pos_ != "SYM":
            word = token.text.lower()
            reviewTokenized += " " + word
            count = wordDict.get(word,0)
            wordDict[word] = count+1
    posTokenList.append(reviewTokenized[1:])

In [83]:
with open("negTokenList.json","w") as fp:
    json.dump(negTokenList,fp)
with open("posTokenList.json","w") as fp:
    json.dump(posTokenList,fp)
with open("wordCount.json","w") as fp:
    json.dump(wordDict,fp)

In [84]:
negTokenList = []
posTokenList = []
wordDict = {}
with open("negTokenList.json","r") as fp:
    negTokenList = json.load(fp)
with open("posTokenList.json","r") as fp:
    posTokenList = json.load(fp)
with open("wordCount.json","r") as fp:
    wordDict = json.load(fp)

In [6]:
allReviews = negTokenList + posTokenList

In [7]:
sortedWords = sorted(wordDict.items(),key = lambda x : x[1],reverse=True)
vocabIndex={w:i+1 for i,(w,c) in enumerate(sortedWords)}

In [8]:
encodedReviews=list()
for r in allReviews:
    encodedReview=list()
    for word in r.split(" "):
         #if word is not available in vocab_to_int put 0 in that place else use the index
        encodedReview.append(vocabIndex.get(word,0))
    encodedReviews.append(encodedReview)

## Neural Net

In [15]:
class NNcustom(nn.Module):
    def __init__(self, input_dim, layer_dims, output_dim, activation):
        super(NNcustom, self).__init__()
        self.num_layer = len(layer_dims)
        self.activation = activation
        if activation is not "LINEAR":
            self.act = self.getActivation(activation)

        self.inLayer = nn.Linear(input_dim, layer_dims[0])
        self.linears = nn.ModuleList([self.act])
        for i in range(self.num_layer - 1):
            self.linears.extend([nn.Linear(layer_dims[i],layer_dims[i+1])])
        self.outLayer = nn.Linear(layer_dims[self.num_layer - 1], output_dim)
        self.lastactivation = nn.Sigmoid()

    def getActivation(self, activation):
        if(activation is "RELU"):
            return nn.ReLU()
        if(activation is "SIG"):
            return nn.Sigmoid()
        if (activation is "TANH"):
            return nn.Tanh()

    def forward(self, x):
        out = self.inLayer(x)
        for l in self.linears:
            out = l(out)
            if self.activation is not "LINEAR":
                out = self.act(out)
        out = self.outLayer(out)
        out = self.lastactivation(out)
        return out

In [16]:
def train(model,train_x_nn,train_y_nn,epoch=3,itr=1000):
    learning_rate = 0.02
    error = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    for e in range(epoch):
        for t in range(itr):
            out = model(train_x_nn)
            loss = error(out, train_y_nn)
            optimizer.zero_grad()
            loss.backward(retain_graph=True)
            optimizer.step()
        print('Epoch [%d/%d] ========== loss: %.3f' %(e + 1, epoch,loss.item()))

In [17]:
def predict(model,test_x_nn,test_y_nn):
    correct = 0
    total = 0
    with torch.no_grad():
        outputs = model(test_x_nn)
        _, predicted = torch.max(outputs, 1)
        total += test_y_nn.size(0)
        correct += (predicted == test_y_nn).sum().item()
        return(100 * correct / total)

In [18]:
def tenFoldPartition(i,x_data,y_data):
    if i < 0:
        return x_data, y_data,x_data,y_data
    test_to = int(1000 * i)
    test_from = int(1000 * (i + 1))

    train_x_1=x_data[:test_to]
    train_x_2=x_data[test_from:]
    train_x = np.concatenate((train_x_1,train_x_2))
    
    train_y_1=y_data[:test_to]
    train_y_2=y_data[test_from:]
    train_y=np.concatenate((train_y_1,train_y_2))

    test_x=x_data[test_to:test_from]
    test_y=y_data[test_to:test_from]
    return train_x, train_y, test_x, test_y

Validation

In [2]:
# 10 fold cross validation

def tenFoldTrain(x_data,y_data):
    input_dim = 300
    output_dim = 2
    layerList = [[200,100,10],[150,75,10],[100,50,10]]
    actList = ["RELU","TANH","SIG"]
    for layer_dims in layerList:
        for act in actList:
            for i in range(0,10):
                train_x,train_y,test_x,test_y = tenFoldPartition(i,x_data,y_data)
                print("Validation data " + str(i*1000) + " " + str((i+1)*1000))
                train_x_nn = torch.from_numpy(train_x).float()
                train_y_nn = Variable(torch.as_tensor(train_y,dtype=torch.long).long())
                test_x_nn = torch.from_numpy(test_x).float()
                test_y_nn = Variable(torch.as_tensor(test_y,dtype=torch.long).long())
                model = NNcustom(input_dim, layer_dims, output_dim,act)
                learning_rate = 0.02
                error = nn.CrossEntropyLoss()
                optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
                print("Training with layers size ",layer_dims, " and activation function ", act);
                train(model,train_x_nn,train_y_nn,6,5000)
                accuracy = predict(model,test_x_nn,test_y_nn)
                print("Accuracy: " + str(accuracy))

In [9]:
def runTrainingFullData(x_data,y_data):
    input_dim = 300
    output_dim = 2
    layer_dims = [150,75,10]
    act = "TANH"
    print("Training on complete data with ", act," activation function and ",layer_dims, "as size of intermediate layers")
    train_x_nn = torch.from_numpy(x_data).float()
    train_y_nn = Variable(torch.as_tensor(y_data,dtype=torch.long).long())
    test_x_nn = torch.from_numpy(x_data).float()
    test_y_nn = Variable(torch.as_tensor(y_data,dtype=torch.long).long())
    model = NNcustom(input_dim, layer_dims, output_dim,act)
    train(model,train_x_nn,train_y_nn,10,10000)
    accuracy = predict(model,test_x_nn,test_y_nn)
    print("Accuracy: " + str(accuracy))
    return model

# Unigram Features
Q3.1

In [46]:
sequence_length=300
features=np.zeros((len(encodedReviews), sequence_length), dtype=int)
for i, review in enumerate(encodedReviews):
    review_len=len(review)
    if (review_len<=sequence_length):
        zeros=list(np.zeros(sequence_length-review_len))
        new=review+zeros
    else:
        new=review[:sequence_length]
    features[i,:]=np.array(new)
z = [0]*5000
o = [1]*5000
labels = np.array(z+o)

In [47]:
tenFoldTrain(features,labels)

Validation data 0 1000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 49.4
Validation data 1000 2000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 48.9
Validation data 2000 3000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 46.6
Validation data 3000 4000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 49.0
Validation data 4000 5000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 46.2
Validation data 5000 6000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 45.3
Validation data 6000 7000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 46.6
Validation data 7000 8000
Training with layers size  [200, 50, 10]  and activation function  TANH
Accuracy: 46.8
Validation data 8000 9000
Training with layers size  [200, 50, 10]  and activation function  TANH
A

In [33]:
unigramModel = runTrainingFullData(features,labels)

Training on complete data with  TANH  activation function and  [150, 75, 10] as size of intermediate layers
Accuracy: 93.24


## Word2Vec Encodings
Q3.3

In [52]:
word2VecModel = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [53]:
w2vEncodedReviews=list()
for r in allReviews:
    w2v=list()
    for word in r.split(" "):
        vec = np.zeros(300)
        if word in word2VecModel:
            vec = vec + word2VecModel[word]
    w2vEncodedReviews.append(vec/(len(r.split(" "))))
w2vfeatures=np.array(w2vEncodedReviews)

z = [0]*5000
o = [1]*5000
labels = np.array(z+o)

In [None]:
tenFoldTrain(w2vfeatures,labels)

In [59]:
word2VecModel = runTrainingFullData(w2vfeatures,labels)

Training on complete data with  TANH  activation function and  [150, 75, 10] as size of intermediate layers
Accuracy: 60.4


# TF-IDF SVD
Q3.4

In [60]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(allReviews)

In [7]:
def tenFoldTrainSVD(x_data,y_data):
    n_components = [300,200,50,10]
    output_dim = 2
    layer_dims = [150,75,10]
    actList = ["RELU"]
    for n in n_components:
        svd = TruncatedSVD(n_components=n, n_iter=7, random_state=42)
        svdOut = svd.fit_transform(x_data)
        input_dim = n
        for act in actList:
            for i in range(0,10):
                train_x,train_y,test_x,test_y = tenFoldPartition(i,svdOut,y_data)
                print("Validation data " + str(i*1000) + " " + str((i+1)*1000))
                train_x_nn = torch.from_numpy(train_x).float()
                train_y_nn = Variable(torch.as_tensor(train_y,dtype=torch.long))
                test_x_nn = torch.from_numpy(test_x).float()
                test_y_nn = Variable(torch.as_tensor(test_y,dtype=torch.long).long())
                model = NNcustom(input_dim, layer_dims, output_dim,act)
                print("Training with layers size ",layer_dims, " and activation function ", n, " components");
                train(model,train_x_nn,train_y_nn,5,5000)
                accuracy = predict(model,test_x_nn,test_y_nn)
                print("Accuracy: " + str(accuracy))

In [8]:
def runTrainingFullDataSVD(x_data,y_data):
    n = 300
    input_dim = 300
    output_dim = 2
    layer_dims = [150,75,10]
    svd = TruncatedSVD(n_components=n, n_iter=7, random_state=42)
    x_data = svd.fit_transform(x_data)
    act = "RELU"
    print("Training on complete data with ",act," activation function and ",layer_dims, "as size of intermediate layers and ",n,"components")
    train_x_nn = torch.from_numpy(x_data).float()
    train_y_nn = Variable(torch.as_tensor(y_data,dtype=torch.long).long())
    test_x_nn = torch.from_numpy(x_data).float()
    test_y_nn = Variable(torch.as_tensor(y_data,dtype=torch.long).long())
    model = NNcustom(input_dim, layer_dims, output_dim,act)
    train(model,train_x_nn,train_y_nn,7,10000)
    accuracy = predict(model,test_x_nn,test_y_nn)
    print("Accuracy: " + str(accuracy))
    return model

In [None]:
tenFoldTrainSVD(X,labels)

In [127]:
svdModel = runTrainingFullDataSVD(X,labels)

Training on complete data with  RELU  activation function and  [150, 75, 10] as size of intermediate layers and  300 components
Accuracy: 97.07


In [119]:
print("Top 5 topics with most important words")
for j in range(0,5):
    print("Topic ",j+1)
    for i,index in enumerate(svd.components_[j].argsort()[:20]):
        print(i+1,vectorizer.get_feature_names()[index])

Top 5 topics with most important words
Topic  1
1 surtout
2 beaucoup
3 bruyant
4 recibir
5 dijeron
6 nuevo
7 diferente
8 pronto
9 unas
10 ea
11 arrepentir
12 propio
13 suceda
14 algo
15 tuvimos
16 pidió
17 empleadas
18 ordenar
19 riendo
20 buenísimo
Topic  2
1 is
2 you
3 are
4 place
5 great
6 love
7 this
8 always
9 best
10 your
11 have
12 their
13 here
14 they
15 staff
16 friendly
17 ve
18 amazing
19 can
20 if
Topic  3
1 to
2 me
3 they
4 my
5 you
6 she
7 he
8 do
9 that
10 have
11 car
12 them
13 told
14 her
15 get
16 if
17 your
18 up
19 call
20 when
Topic  4
1 was
2 the
3 it
4 not
5 but
6 chicken
7 me
8 that
9 like
10 of
11 good
12 my
13 sauce
14 tasted
15 ordered
16 buffet
17 just
18 meat
19 rice
20 sandwich
Topic  5
1 we
2 you
3 the
4 they
5 not
6 of
7 if
8 us
9 our
10 do
11 but
12 your
13 it
14 were
15 order
16 like
17 that
18 are
19 there
20 or


In [128]:
fileList = []
testDataList = []
for file in os.listdir("./test"):
    if ".txt" in file:
        f = open("./test/"+file)
        fileList.append(file)
        testDataList.append(f.read())
print("finished loading test data...")

finished loading test data...


In [129]:
testReviews = []
for review in testDataList: 
    doc = nlp(review)
    reviewTokenized = ""
    for token in doc:
        if token.pos_ != "PUNCT" and token.pos_ != "SPACE" and token.pos_ != "SYM":
            word = token.text.lower()
            reviewTokenized += " " + word
    testReviews.append(reviewTokenized[1:])

In [14]:
vectorizer = TfidfVectorizer()
testTFIDF = vectorizer.fit_transform(allReviews)
svd = TruncatedSVD(n_components=300, n_iter=7, random_state=42)
x_data = svd.fit_transform(testTFIDF)
test_x_nn = torch.from_numpy(x_data).float()

In [131]:
with torch.no_grad():
    outputs = svdModel(test_x_nn)
    _, predicted = torch.max(outputs, 1)
    y_pred = predicted.tolist()
result = zip(fileList,y_pred)

In [132]:
neg = ""
pos = ""
for tup in result:
    if tup[1] == 0:
        neg = neg + "\n" + tup[0] 
    else :
        pos = pos + "\n" + tup[0]

In [133]:
with open("pos.txt","w") as fp:
    fp.write(pos)
with open("neg.txt","w") as fn:
    fn.write(neg)