In [1]:
# preprocessing
import string
from glob import glob

data_files = glob('./data/*_labelled.txt')
data = dict()

printable = set(string.printable)

def return_lines_from(file):
    with open(filename) as f:
        text = f.read()
    text = ''.join(list(filter(lambda x: x in printable, text)))
    return text.splitlines()

for filename in data_files:    
    data[filename] = return_lines_from(filename)


In [20]:
from cleaners.sentence_cleaner import SentenceCleaner
from collections import Counter

# Split into training and testing data
cleaner = SentenceCleaner()

testing = { 'data':[], 'labels': [] }
training = { 'data':[], 'labels': [] }

all_words = []
train_words = []

for fname in data.keys():
    positive = []
    negative = []
    for i in range(len(data[fname])):
        observation = data[fname][i]
        sentence, label = observation.split('\t')
        
        if int(label) == 1:
            positive.append(sentence)
        else:
            negative.append(sentence)
        
        all_words += cleaned
    
    train = positive[:400] + negative[:400]
    training['data'] += [Counter(cleaner.clean_sentence(sent)) for sent in train]
    
    test = positive[-100:] + negative[-100:]
    testing['data'] += [Counter(cleaner.clean_sentence(sent)) for sent in test]
    
    training['labels'] += [1]*400+[0]*400
    testing['labels'] += [1]*100 +[0]*100


In [21]:
import numpy as np
import numpy.linalg as lg

training_words = []
for observation in training['data']:
    training_words += observation
    
unique_words = list(set(training_words))
index_map = {word: unique_words.index(word) for word in unique_words}
X_train = np.zeros( [len(training['data']),len(unique_words)] )

for i in range(len(training['data'])):
    features = training['data'][i]
    for key, value in features.items():
        index = index_map[key]
        X_train[i][index] = value

print ("First Vector")
print (X_train[0]) #vector 1
print ("Second Vector")
print (X_train[1]) #vector 2

# Use the l-2 normalization
X_train = np.apply_along_axis(lambda x: x/lg.norm(x), 0, X_train)

First Vector
[ 0.  0.  0. ...,  0.  0.  0.]
Second Vector
[ 0.  0.  0. ...,  0.  0.  0.]


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

log_classifier = LogisticRegression()
log_classifier.fit(X_train, training['labels'])

bayes_classifier = BernoulliNB()
bayes_classifier.fit(X_train, training['labels'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [25]:
X_test = np.zeros( [len(testing['data']),len(unique_words)] )

for i in range(len(testing['data'])):
    features = testing['data'][i]
    for key, value in features.items():
        if key in index_map.keys():
            index = index_map[key] 
            X_test[i][index] = value

print("Logistic Regression: ", log_classifier.score(X_test, testing['labels'])*100)
print("Bayes Classifier: ", bayes_classifier.score(X_test, testing['labels'])*100)

print("\nThe Bayes Classifier does slightly better!")

Logistic Regression:  80.0
Bayes Classifier:  80.8333333333

The Bayes Classifier does slightly better!
