### Create lexicon, featureset, and labels

In [7]:
# create_sentiment_featuresets for use in nn_tf_1 file

import nltk
# download the NLTK data packages (e.g. the tokenizer models and WordNet) if not already present
# nltk.download()
# word_tokenize splits a string into individual word tokens
from nltk.tokenize import word_tokenize 
# the lemmatizer maps inflected forms of a word (e.g. plurals) to one common base form
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter

# single lemmatizer instance shared by create_lexicon and sample_handling
lemmatizer = WordNetLemmatizer()
# upper bound on how many lines are read from each input file
hm_lines = 10000000  # using less data could help with MemoryErrors

def create_lexicon(pos, neg, min_count=50, max_count=1000):
    """Build the vocabulary (lexicon) from a positive and a negative sample file.

    Each file is read line by line (at most ``hm_lines`` lines per file,
    ``hm_lines`` is assumed to be non-negative). Every line is lower-cased,
    tokenised with ``word_tokenize`` and each token is lemmatized so that
    different forms of a word are counted together.
    See https://www.geeksforgeeks.org/python-lemmatization-with-nltk/

    Args:
        pos: path of the text file holding the positive examples.
        neg: path of the text file holding the negative examples.
        min_count: exclusive lower bound on a word's total frequency; words
            seen this many times or fewer are discarded as too rare.
        max_count: exclusive upper bound on a word's total frequency; words
            seen this many times or more are discarded as too common.

    Returns:
        A list of the lemmatized words whose frequency lies strictly between
        ``min_count`` and ``max_count``, in order of first appearance.
    """
    from itertools import islice  # local import keeps this block self-contained

    # count as we go, e.g. w_counts = {'the': 597, 'and': 293}, instead of
    # first materialising every token of both files in one huge list
    w_counts = Counter()
    for fi in [pos, neg]:
        with open(fi, 'r') as f:
            # stream the file; islice stops after hm_lines lines
            for line in islice(f, hm_lines):
                # word_tokenize distinguishes between cases, so lower-case first
                w_counts.update(
                    lemmatizer.lemmatize(token)
                    for token in word_tokenize(line.lower())
                )

    # keep the lexicon efficient by discarding the most common and the
    # least common words in the text
    return [w for w, count in w_counts.items() if min_count < count < max_count]

def sample_handling(sample, lexicon, classification):
    """Turn every line of a sample file into a bag-of-words feature vector.

    Each line (at most ``hm_lines`` lines, ``hm_lines`` is assumed to be
    non-negative) is lower-cased, tokenised and lemmatized. For every
    resulting word that occurs in ``lexicon`` the count at that word's
    lexicon position is incremented.

    Args:
        sample: path of the text file to convert.
        lexicon: list of known words; a word's position in this list is its
            position in the feature vector.
        classification: label attached unchanged to every line of the file.

    Returns:
        A list with one ``[features, classification]`` entry per line, where
        ``features`` is a list of ``len(lexicon)`` counts.
    """
    from itertools import islice  # local import keeps this block self-contained

    # Build the word -> position table once. A dict lookup replaces the two
    # linear scans (``in`` and ``.index``) the list needed for every token.
    # setdefault keeps the FIRST position of a duplicated word, which is what
    # list.index would return.
    word_to_index = {}
    for position, lexicon_word in enumerate(lexicon):
        word_to_index.setdefault(lexicon_word, position)

    featureset = []

    with open(sample, 'r') as f:
        # stream the file; islice stops after hm_lines lines
        for line in islice(f, hm_lines):
            current_words = word_tokenize(line.lower())
            current_words = [lemmatizer.lemmatize(i) for i in current_words]

            # zeros array as long as the lexicon; add one at the index of
            # every word that can be found in the lexicon
            features = np.zeros(len(lexicon))
            for word in current_words:
                index_value = word_to_index.get(word.lower())
                if index_value is not None:
                    features[index_value] += 1
            features = list(features)
            featureset.append([features, classification])

    return featureset

def create_feature_sets_and_labels(pos, neg, test_size=0.1):
    """Create shuffled train/test feature vectors and labels from two files.

    The lexicon is built from both files, every line of ``pos`` is labelled
    ``[1, 0]`` and every line of ``neg`` is labelled ``[0, 1]``. The combined
    samples are shuffled and the last ``int(test_size * n)`` of them are held
    out as the test set.

    Args:
        pos: path of the text file holding the positive examples.
        neg: path of the text file holding the negative examples.
        test_size: fraction of all samples to hold out for testing.

    Returns:
        ``(train_x, train_y, test_x, test_y)`` where the ``*_x`` values are
        lists of feature vectors and the ``*_y`` values are lists of labels.
    """
    lexicon = create_lexicon(pos, neg)
    samples = []
    samples += sample_handling(pos, lexicon, [1, 0])
    samples += sample_handling(neg, lexicon, [0, 1])
    random.shuffle(samples)

    # samples now looks like this: [[[0 1 0 1 1 0], [0, 1]], [features, label]]

    # Clamp the hold-out count to [0, len(samples)] and split at an explicit
    # index. Slicing with a negative count breaks when the count is 0:
    # [:-0] is empty and [-0:] is everything, which would leave the training
    # set empty and put every sample in the test set.
    testing_size = int(test_size * len(samples))
    testing_size = max(0, min(testing_size, len(samples)))
    split_at = len(samples) - testing_size

    train, test = samples[:split_at], samples[split_at:]

    # first element of each sample is the feature vector, second is the label
    train_x = [sample[0] for sample in train]
    train_y = [sample[1] for sample in train]
    test_x = [sample[0] for sample in test]
    test_y = [sample[1] for sample in test]

    return train_x, train_y, test_x, test_y
    
# When run as a script: build the train/test split from 'pos.txt' and
# 'neg.txt' and save it for later use.
if __name__ == '__main__':
    train_x, train_y, test_x, test_y = create_feature_sets_and_labels('pos.txt', 'neg.txt')
    # the four lists are pickled together as one list, in this order
    with open('sentiment_set.pickle', 'wb') as f:
        pickle.dump([train_x, train_y, test_x, test_y], f)