In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter
from os import listdir
from os.path import isfile, join
import tensorflow as tf

In [2]:
lemmatizer = WordNetLemmatizer()
hm_lines = 1000000

In [3]:
def create_lexicon(posloc, negloc):
    lexicon = []
    count_of_files = 0
    for fl in [posloc, negloc]:
        onlyfiles = [f for f in listdir(fl) if isfile(join(fl, f))]
        for filename in onlyfiles:
            count_of_files += 1
            with open(join(fl, filename)) as f:
                contents = f.readlines()
                for l in contents[:hm_lines]:
                    all_words = word_tokenize(l.lower())
                    lexicon += list(all_words)
    print('number of files read', count_of_files)
    lexicon = [lemmatizer.lemmatize(i.lower()) for i in lexicon]
    w_counts = Counter(lexicon)
    
    l2 = []
    for w in w_counts:
        if 1000 > w_counts[w] > 50:
            l2.append(w.lower())
            
    return l2

In [4]:
def sample_handling(sample_list: list, lexicon, classification):
    featureset = []
    for sample in sample_list:
        print('Trying to read file', sample)
        with open(sample) as f:
            contents = f.readlines()
            for l in contents[:hm_lines]:
                current_words = word_tokenize(l.lower())
                current_words = [lemmatizer.lemmatize(i) for i in current_words]
                features = np.zeros(len(lexicon))
                for word in current_words:
                    for word in lexicon:
                        index_value = lexicon.index(word.lower())
                        features[index_value] += 1
                features = list(features)
                featureset.append([features, classification])
    return featureset

In [5]:
def create_feature_sets_and_labels(posloc, negloc, test_size=0.1):
    lexicon = create_lexicon(posloc, negloc)
    posfiles = [join(posloc, f) for f in listdir(posloc) if isfile(join(posloc, f))]
    features = sample_handling(posfiles[:30], lexicon, [1, 0])
    print('len of features', len(features))
    negfiles = [join(negloc, f) for f in listdir(negloc) if isfile(join(negloc, f))]
    features += sample_handling(negfiles[:30], lexicon, [0, 1])
    print('len of features', len(features))
    random.shuffle(features)
    
    testing_size = int(test_size*len(features))
    features = np.array(features)
    train_x = list(features[:, 0][:-testing_size])
    train_y = list(features[:, 1][:-testing_size])
    
    
    test_x = list(features[:, 0][-testing_size:])
    test_y = list(features[:, 1][-testing_size:])
    
    return train_x, train_y, test_x, test_y

In [6]:
train_x, train_y, test_x, test_y = create_feature_sets_and_labels('txt_sentoken/pos/', 'txt_sentoken/neg/')

number of files read 2000
Trying to read file txt_sentoken/pos/cv839_21467.txt
Trying to read file txt_sentoken/pos/cv034_29647.txt
Trying to read file txt_sentoken/pos/cv908_16009.txt
Trying to read file txt_sentoken/pos/cv748_12786.txt
Trying to read file txt_sentoken/pos/cv253_10077.txt
Trying to read file txt_sentoken/pos/cv147_21193.txt
Trying to read file txt_sentoken/pos/cv962_9803.txt
Trying to read file txt_sentoken/pos/cv686_13900.txt
Trying to read file txt_sentoken/pos/cv410_24266.txt
Trying to read file txt_sentoken/pos/cv913_29252.txt
Trying to read file txt_sentoken/pos/cv695_21108.txt
Trying to read file txt_sentoken/pos/cv601_23453.txt
Trying to read file txt_sentoken/pos/cv490_17872.txt
Trying to read file txt_sentoken/pos/cv518_13331.txt
Trying to read file txt_sentoken/pos/cv157_29372.txt
Trying to read file txt_sentoken/pos/cv570_29082.txt
Trying to read file txt_sentoken/pos/cv289_6463.txt
Trying to read file txt_sentoken/pos/cv098_15435.txt
Trying to read file tx

In [7]:
len(train_x[0])

2316

In [8]:
n_nodes_hl1 = 500
n_nodes_hl2 = 500
n_nodes_hl3 = 500

n_classes = 2
batch_size = 100

x = tf.placeholder('float', [None, len(train_x[0])])
y = tf.placeholder('float')

In [9]:
def neural_network(data):
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([len(train_x[0]), n_nodes_hl1])), 
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))} 
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2 ])), 
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])), 
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}
    output_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl3, n_classes])), 
                    'biases': tf.Variable(tf.random_normal([n_classes]))}
    
    layer1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    layer1 = tf.nn.relu(layer1)
    
    layer2 = tf.add(tf.matmul(layer1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    layer2 = tf.nn.relu(layer2)
    
    layer3 = tf.add(tf.matmul(layer2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    layer3 = tf.nn.relu(layer3)
    
    output = tf.matmul(layer3, output_layer['weights']) + output_layer['biases']
    
    return output

In [12]:
def train_neural_network(x):
    prediction = neural_network(x)
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=prediction, labels=y))
    
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    
    how_many_epochs = 100
    
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for epoch in range(how_many_epochs):
            epoch_loss = 0
            i = 0
            while i < len(train_x):
                start = i
                end = i + batch_size
                
                batch_x = np.array(train_x[start:end])
                batch_y = np.array(train_y[start:end])
                
                _, c = sess.run([optimizer, cost], feed_dict= {x: batch_x, y: batch_y})
            
                epoch_loss += c
                i = end
            
            if epoch % 5 == 0:
                print('Epoch', epoch, 'completed out of', how_many_epochs, 'loss', epoch_loss)
            
        correct = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
        
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy', accuracy.eval({x: test_x, y: test_y}))

In [13]:
train_neural_network(x)

Epoch 0 completed out of 100 loss 15944706.52734375
Epoch 5 completed out of 100 loss 4950909.814453125
Epoch 10 completed out of 100 loss 4302392.155761719
Epoch 15 completed out of 100 loss 4318885.634521484
Epoch 20 completed out of 100 loss 8332908.4609375
Epoch 25 completed out of 100 loss 2356580.990234375
Epoch 30 completed out of 100 loss 3251262.5595703125
Epoch 35 completed out of 100 loss 2304494.6352539062
Epoch 40 completed out of 100 loss 1665516.2199707031
Epoch 45 completed out of 100 loss 1684218.154296875
Epoch 50 completed out of 100 loss 1462201.861328125
Epoch 55 completed out of 100 loss 1403230.7802734375
Epoch 60 completed out of 100 loss 1501104.6137695312
Epoch 65 completed out of 100 loss 1811004.75390625
Epoch 70 completed out of 100 loss 2315435.5290527344
Epoch 75 completed out of 100 loss 918950.328125
Epoch 80 completed out of 100 loss 2490603.805908203
Epoch 85 completed out of 100 loss 3452183.6724243164
Epoch 90 completed out of 100 loss 2596698.22192