In [1]:
import tensorflow as tf
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

In [2]:
#initializing the lemmatizer object (shared by create_lexicon and sample_handling)
lemmatizer= WordNetLemmatizer()
#maximum number of lines read from each input file
hm_lines= 10000

In [3]:
#function to create a meaningful lexicon from a large text corpus
def create_lexicon(pos,neg,min_count=50,max_count=1000):
    """Build a lexicon of mid-frequency lemmas from two text files.

    The first ``hm_lines`` lines of each file are lower-cased, tokenized
    with ``word_tokenize`` and lemmatized (e.g. plural forms are reduced to
    their singular). Only lemmas whose total count over both files lies
    strictly between ``min_count`` and ``max_count`` are kept.

    Args:
        pos: path of the first text file.
        neg: path of the second text file.
        min_count: exclusive lower bound on a lemma's occurrence count.
        max_count: exclusive upper bound on a lemma's occurrence count.

    Returns:
        A list of the kept lemmas, in order of first appearance.
    """
    #w_count maps each lemma to its number of occurrences, e.g. {'hello':2107,'hi':2136}
    w_count= Counter()
    for fi in [pos,neg]:
        with open(fi,'r') as f:
            #contents is a list of lines; only the first hm_lines are used
            contents= f.readlines()
            for l in contents[:hm_lines]:
                #count the lemmas line by line instead of first collecting every token in one big list
                w_count.update(lemmatizer.lemmatize(i) for i in word_tokenize(l.lower()))

    #drop the very frequent words (they carry little signal) and the very rare ones
    return [w for w,count in w_count.items() if max_count>count>min_count]
#end of function
#the returned list holds only the selected words, e.g. ['all','the','``']

In [4]:
#function to convert each line of a sample file into a word-count vector paired with its class label
def sample_handling(sample,lexicon,classification):
    """Turn each line of a text file into a [word-count vector, label] pair.

    The first ``hm_lines`` lines of ``sample`` are lower-cased, tokenized and
    lemmatized. For every line a vector of ``len(lexicon)`` zeros is created
    and the entry at a word's lexicon position is incremented each time that
    word occurs in the line; words missing from the lexicon are ignored.

    Args:
        sample: path of the text file to read.
        lexicon: list of words; a word's position is its feature index.
        classification: label stored with every line (the same object is
            referenced by every returned pair, it is not copied).

    Returns:
        A list shaped like [[features, classification], ...] where
        ``features`` is a list of ``len(lexicon)`` counts.
    """
    #map each word to its first position in the lexicon once, so the per-word
    #lookup below no longer scans the whole list (same result as lexicon.index)
    word_to_index={}
    for position,entry in enumerate(lexicon):
        word_to_index.setdefault(entry,position)

    featureset=[]
    with open(sample,'r') as f:
        contents= f.readlines()
        for l in contents[:hm_lines]:
            #current_word is the tokenized and lemmatized line, e.g. ['i','am','a','boy','.']
            current_word= word_tokenize(l.lower())
            current_word= [lemmatizer.lemmatize(i) for i in current_word]
            #one counter per lexicon entry, all starting at zero
            features= np.zeros(len(lexicon))
            for word in current_word:
                index_value= word_to_index.get(word.lower())
                if index_value is not None:
                    features[index_value]+=1
            featureset.append([list(features),classification])
    return featureset
#end of function


In [5]:
def create_feature_sets_and_labels(pos,neg,test_size = 0.1):
    """Build shuffled train/test feature vectors and labels from two files.

    A lexicon is created from both files, every line of ``pos`` is labelled
    [1,0] and every line of ``neg`` is labelled [0,1], the samples are
    shuffled with ``random.shuffle`` and the last ``test_size`` fraction is
    held out for testing.

    Args:
        pos: path of the file whose lines get the label [1,0].
        neg: path of the file whose lines get the label [0,1].
        test_size: fraction of all samples placed in the test split.

    Returns:
        A tuple (train_x, train_y, test_x, test_y) of lists; the x lists hold
        word-count vectors and the y lists hold the matching labels.
    """
    lexicon = create_lexicon(pos,neg)
    #samples looks like [[features,[1,0]],[features,[0,1]],...]
    samples = []
    samples += sample_handling(pos,lexicon,[1,0])
    samples += sample_handling(neg,lexicon,[0,1])
    random.shuffle(samples)

    #split the Python list directly: packing the ragged [features,label]
    #pairs into np.array() raises ValueError on NumPy >= 1.24
    testing_size = min(max(int(test_size*len(samples)),0),len(samples))
    #an explicit split index keeps every sample in training when testing_size
    #is 0 (slicing with [:-0] would give an empty training set instead)
    split_at = len(samples)-testing_size
    training,testing = samples[:split_at],samples[split_at:]

    train_x = [features for features,_ in training]
    train_y = [label for _,label in training]
    test_x = [features for features,_ in testing]
    test_y = [label for _,label in testing]

    return train_x,train_y,test_x,test_y

In [6]:
#build the train/test split from the two sample files (uses the default test_size of 0.1)
train_x,train_y,test_x,test_y = create_feature_sets_and_labels('pos.txt','neg.txt')
#optional: uncomment the two lines below to save the processed split to disk
#with open('sentiment_set.pickle','wb') as f:
#    pickle.dump([train_x,train_y,test_x,test_y],f)

In [7]:

#end of preprocessing



In [8]:
#neural net begins

In [12]:
#number of nodes in each of the three hidden layers
n_nodes_hl1 = 1500
n_nodes_hl2 = 1500
n_nodes_hl3 = 1500

#binary classification: positive is [1,0], negative is [0,1]
n_classes = 2
#number of training samples fed per optimizer step
batch_size = 100
#number of full passes over the training data
hm_epochs = 20

#usage of placeholder
#'placeholder(dtype,shape,name)'. To evaluate a tensor that depends on it, data must be fed, e.g.
#x = tf.placeholder(tf.float32, shape=(1024, 1024))
#y = tf.matmul(x, x)
#rand_array = np.random.rand(1024, 1024) 
#print(sess.run(y, feed_dict={x: rand_array}))
#end
#x receives the feature vectors and y the labels; no shape is given, so any shape can be fed
x = tf.placeholder('float')
y = tf.placeholder('float')

#one dictionary per layer holding the weight and bias variables used by neural_network_model
#NOTE(review): the 'f_fum' entries are not read anywhere in the visible code

#tf.random_normal(shape,mean=0.0,stddev=1.0) outputs random values from a normal distribution
#the first weight matrix has one row per input feature (the length of a training vector)
hidden_1_layer = {'f_fum':n_nodes_hl1,
                  'weight':tf.Variable(tf.random_normal([len(train_x[0]), n_nodes_hl1])),
                  'bias':tf.Variable(tf.random_normal([n_nodes_hl1]))}

hidden_2_layer = {'f_fum':n_nodes_hl2,
                  'weight':tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                  'bias':tf.Variable(tf.random_normal([n_nodes_hl2]))}

hidden_3_layer = {'f_fum':n_nodes_hl3,
                  'weight':tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                  'bias':tf.Variable(tf.random_normal([n_nodes_hl3]))}

#the output layer maps the last hidden layer to one score per class
output_layer = {'f_fum':None,
                'weight':tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])),
                'bias':tf.Variable(tf.random_normal([n_classes]))}

#defining the neural net model
def neural_network_model(data):
    """Feed `data` through three ReLU hidden layers and a linear output layer.

    Each hidden layer computes relu(input x weight + bias) using the
    module-level hidden_1_layer, hidden_2_layer and hidden_3_layer
    dictionaries; the output layer applies its weight and bias without an
    activation.

    Returns the output tensor (one raw score per class, no softmax applied).
    """
    activation = data
    #the three hidden layers differ only in their parameters, so apply them in a loop
    for layer in (hidden_1_layer, hidden_2_layer, hidden_3_layer):
        #weighted sum of the previous layer's output plus the layer bias
        weighted_sum = tf.add(tf.matmul(activation, layer['weight']), layer['bias'])
        #rectified linear activation
        activation = tf.nn.relu(weighted_sum)

    #output layer: linear only
    return tf.matmul(activation, output_layer['weight']) + output_layer['bias']
#end of neural net model

In [10]:
#training our neural net
def train_neural_network(x):
    """Train the network on train_x/train_y and print the test accuracy.

    Builds the model on placeholder `x`, minimizes the mean softmax
    cross-entropy against the module-level placeholder `y` with Adam
    (learning rate 0.001), and runs `hm_epochs` passes over the training
    data in consecutive slices of `batch_size` samples. The summed batch
    cost is printed after every epoch, and the accuracy on test_x/test_y is
    printed at the end.
    """
    logits = neural_network_model(x)
    cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y) )
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            epoch_loss = 0
            #walk over the training data in consecutive batches
            for start in range(0, len(train_x), batch_size):
                stop = start + batch_size
                feed = {x: np.array(train_x[start:stop]),
                        y: np.array(train_y[start:stop])}
                _, batch_cost = sess.run([optimizer, cost], feed_dict=feed)
                epoch_loss += batch_cost

            print('Epoch', epoch+1, 'completed out of',hm_epochs,'loss:',epoch_loss)

        #a prediction is a hit when the highest-scoring class matches the label
        hits = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(hits, 'float'))

        print('Accuracy:',accuracy.eval({x:test_x, y:test_y})*100,'%')


In [11]:
#build the model on placeholder x, train it and print the accuracy on the test split
train_neural_network(x)

Epoch 1 completed out of 20 loss: 1466141.18311
Epoch 2 completed out of 20 loss: 500205.926025
Epoch 3 completed out of 20 loss: 236399.635376
Epoch 4 completed out of 20 loss: 233396.099854
Epoch 5 completed out of 20 loss: 333945.280121
Epoch 6 completed out of 20 loss: 164637.504208
Epoch 7 completed out of 20 loss: 91485.6360321
Epoch 8 completed out of 20 loss: 111690.989552
Epoch 9 completed out of 20 loss: 108725.947052
Epoch 10 completed out of 20 loss: 115102.818727
Epoch 11 completed out of 20 loss: 197805.220808
Epoch 12 completed out of 20 loss: 202860.90612
Epoch 13 completed out of 20 loss: 19275.609046
Epoch 14 completed out of 20 loss: 8761.91754064
Epoch 15 completed out of 20 loss: 8056.98958445
Epoch 16 completed out of 20 loss: 6712.42345309
Epoch 17 completed out of 20 loss: 6775.75487953
Epoch 18 completed out of 20 loss: 7841.22249734
Epoch 19 completed out of 20 loss: 8107.0376575
Epoch 20 completed out of 20 loss: 7311.37510335
Accuracy: 64.0712916851 %
