# Create the Bag of Words


In [1]:
# preprocessing
import string
from glob import glob

data_files = glob('./data/*_labelled.txt')
data = dict()

printable = set(string.printable)

def return_lines_from(file):
    with open(filename) as f:
        text = f.read()
    text = ''.join(list(filter(lambda x: x in printable, text)))
    return text.splitlines()

for filename in data_files:    
    data[filename] = return_lines_from(filename)


In [2]:
from cleaners.sentence_cleaner import SentenceCleaner
from collections import Counter

# Split into training and testing data
cleaner = SentenceCleaner()

testing = { 'data':[], 'labels': [] }
training = { 'data':[], 'labels': [] }

train_words = []

for fname in data.keys():
    positive = []
    negative = []
    for i in range(len(data[fname])):
        observation = data[fname][i]
        sentence, label = observation.split('\t')
        
        if int(label) == 1:
            positive.append(sentence)
        else:
            negative.append(sentence)
    
    training['data'] += positive[:400] + negative[:400]
    testing['data'] +=  positive[-100:] + negative[-100:] 
    
    training['labels'] += [1]*400+[0]*400
    testing['labels'] += [1]*100 +[0]*100


In [3]:
import numpy as np
import numpy.linalg as lg
from functools import reduce

training_cleaned = [Counter(cleaner.clean_sentence(sent)) for sent in training['data']]
training_words = reduce(lambda x, y: x+y,[list(obs) for obs in training_cleaned])
unique_words = list(set(training_words))
index_map = {word: unique_words.index(word) for word in unique_words}

X_train = np.zeros( [len(training['data']),len(unique_words)] )

for i in range(len(training_cleaned)):
    features = training_cleaned[i]
    for key, value in features.items():
        index = index_map[key]
        X_train[i][index] = value

print ("First Vector")
print (X_train[0]) #vector 1
print ("Second Vector")
print (X_train[1]) #vector 2

# Use the l-2 normalization
X_train = np.apply_along_axis(lambda x: x/lg.norm(x), 0, X_train)

First Vector
[ 0.  0.  0. ...,  0.  0.  0.]
Second Vector
[ 0.  0.  0. ...,  0.  0.  0.]


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

log_classifier = LogisticRegression()
log_classifier.fit(X_train, training['labels'])

bayes_classifier = BernoulliNB()
bayes_classifier.fit(X_train, training['labels'])

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [5]:
X_test = np.zeros( [len(testing['data']),len(unique_words)] )

cleaned_testing = [Counter(cleaner.clean_sentence(sent)) for sent in testing['data']]

for i in range(len(cleaned_testing)):    
    features = cleaned_testing[i]
    for key, value in features.items():
        if key in index_map.keys():
            index = index_map[key] 
            X_test[i][index] = value

print("Logistic Regression: ", log_classifier.score(X_test, testing['labels'])*100, '%')
print("Bayes Classifier: ", bayes_classifier.score(X_test, testing['labels'])*100, '%')

print("\nThe Bayes Classifier does slightly better!")

Logistic Regression:  80.0 %
Bayes Classifier:  80.8333333333 %

The Bayes Classifier does slightly better!


# Perform again with bigrams

In [6]:
bigrams = [Counter(cleaner.get_bigrams(review)) for review in training['data']]
unique_bigrams = list(set(reduce(lambda x, y: x+y, [list(bigram) for bigram in bigrams])))
index_map = {bigram: unique_bigrams.index(bigram) for bigram in unique_bigrams}

X_train2 = np.zeros( [len(bigrams),len(unique_bigrams)] )
for i in range(len(bigrams)):
    features = bigrams[i]
    for key, value in features.items():
        index = index_map[key]
        X_train2[i][index] = value

test_bigrams =[Counter(cleaner.get_bigrams(review)) for review in testing['data']]
X_test2 = np.zeros( [len(test_bigrams),len(unique_bigrams)] )

for i in range(len(test_bigrams)):    
    features = test_bigrams[i]
    for key, value in features.items():
        if key in index_map.keys():
            index = index_map[key] 
            X_test2[i][index] = value

In [7]:
log_classifier.fit(X_train2, training['labels'])
bayes_classifier.fit(X_train2, training['labels'])
print("Logistic Regression: ", log_classifier.score(X_test2, testing['labels'])*100, '%')
print("Bayes Classifier: ", bayes_classifier.score(X_test2, testing['labels'])*100, '%')

print("\nThe Bayes Classifier does slightly better!")

Logistic Regression:  73.8333333333 %
Bayes Classifier:  77.1666666667 %

The Bayes Classifier does slightly better!


# Perform PCA

In [26]:
import numpy as np
from numpy.linalg import svd, eigh

class PCA:
    def __init__(self, X):
        self.mean = X.mean(axis=0)
        self.centered = X - self.mean
        self.cov = np.cov(self.centered.T)
        self.eigenvalues, self.eigenvectors = self.get_sorted_eigenvectors()
    
    def approx(self, X2, rank=10):
        centered_new = X2 - self.mean
        return pca2.eigenvectors[:rank].dot(centered_new.T).T
    
    def get_sorted_eigenvectors(self):
        eigenvalues, eigenvectors = eigh(self.cov)
        order = eigenvalues.argsort()[::-1]   
        eigenvalues = eigenvalues[order]
        eigenvectors = eigenvectors[:,order]
        
        eigenvectors = eigenvectors/np.linalg.norm(eigenvectors, axis=0)
        return eigenvalues, eigenvectors

centered:  (2400, 3336)
cov:  (3336, 3336)
reconstruction:  (600, 10)


In [32]:
dimensions = [10, 50, 100, 500, 1000, 2000, 3000]

print("Bag of Words\n")
pca2 = PCA(X_train)
for n in dimensions:
    new_train = pca2.approx(X_train, rank=n)
    new_test = pca2.approx(X_test, rank=n)
    
    print("\n{}-order approximation\n".format(n))
    log_classifier.fit(new_train, training['labels'])
    bayes_classifier.fit(new_train, training['labels'])
    
    log_score = log_classifier.score(new_test, testing['labels'])*100
    print("Logistic Regression: ", log_score,'%')
    bayes_score = bayes_classifier.score(new_test, testing['labels'])*100
    print("Bayes Classifier: ", bayes_score, '%')

    
print("Bigrams\n")
pca2 = PCA(X_train2)
for n in dimensions:
    new_train = pca2.approx(X_train2, rank=n)
    new_test = pca2.approx(X_test2, rank=n)

    print("\n{}-order approximation\n".format(n))
    log_classifier.fit(new_train, training['labels'])
    bayes_classifier.fit(new_train, training['labels'])
    
    log_score = log_classifier.score(new_test, testing['labels'])*100
    print("Logistic Regression: ", log_score,'%')
    bayes_score = bayes_classifier.score(new_test, testing['labels'])*100
    print("Bayes Classifier: ", bayes_score, '%')

Bag of Words

centered:  (2400, 3336)
cov:  (3336, 3336)

10-order approximation

Logistic Regression:  49.5 %
Bayes Classifier:  48.3333333333 %

50-order approximation

Logistic Regression:  57.1666666667 %
Bayes Classifier:  58.0 %

100-order approximation

Logistic Regression:  61.5 %
Bayes Classifier:  65.5 %

500-order approximation

Logistic Regression:  70.1666666667 %
Bayes Classifier:  66.1666666667 %

1000-order approximation

Logistic Regression:  71.3333333333 %
Bayes Classifier:  67.5 %

2000-order approximation

Logistic Regression:  77.6666666667 %
Bayes Classifier:  69.5 %

3000-order approximation

Logistic Regression:  79.3333333333 %
Bayes Classifier:  69.6666666667 %
Bigrams

centered:  (2400, 17480)
cov:  (17480, 17480)

10-order approximation

Logistic Regression:  54.1666666667 %
Bayes Classifier:  50.6666666667 %

50-order approximation

Logistic Regression:  52.5 %
Bayes Classifier:  53.6666666667 %

100-order approximation

Logistic Regression:  54.1666666667