In [0]:
import json
import numpy as np
import sys
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
# np.set_printoptions(threshold=sys.maxsize)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# A: Dataset Analysis
Download Sentiment Labelled Sentences Data Set. There are three data files under the root
folder. yelp_labelled.txt, amazon_cells_labelled.txt and imdb_labelled.txt. Parse each file
with the specifications in readme.txt. Are the labels balanced? If not, what’s the ratio between
the two labels? Explain how you process these files.


In [44]:
path_prefix = 'drive/My Drive/CS 5785/HW3/sentiment labelled sentences'
data_sets = ["amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt"]

def parse_data_set(filename):
    positives = []
    negatives = []
    print("Parsing dataset: {}".format(filename))
    with open('{}/{}'.format(path_prefix, filename)) as f:
        count = 0
        pos_count = 0
        neg_count = 0
        lines = f.readlines()
        for line in lines:
            data = line.strip().split('\t')
            value = data[1]
            review = data[0]
            if value == '1':
                positives.append((value, review))
                pos_count += 1
            else:
                negatives.append((value, review))
                neg_count += 1
            count = count + 1
        print("Total lines: {}".format(count))

    print("Positive count: {}".format(len(positives)))
    print("Negative count: {}".format(len(negatives)))
    return positives, negatives

amazon_positives, amazon_negatives = parse_data_set("amazon_cells_labelled.txt")
imdb_positives, imdb_negatives = parse_data_set("imdb_labelled.txt")
yelp_positives, yelp_negatives = parse_data_set("yelp_labelled.txt")

Parsing dataset: amazon_cells_labelled.txt
Total lines: 1000
Positive count: 500
Negative count: 500
Parsing dataset: imdb_labelled.txt
Total lines: 1000
Positive count: 500
Negative count: 500
Parsing dataset: yelp_labelled.txt
Total lines: 1000
Positive count: 500
Negative count: 500


# B: Preprocessing Strategy
Lowercase all words: All caps can be indicative for both directions. Lowercase improved accuracy.

Lemmatization (running, runs, run all same): NO -> Made accuracy worse
**TODO:** Find better stemmer

Strip punctuation: Kinda, need exclamation marks and such, but periods can be stripped. Punctuation should be a separate bucket in the feature vector and not part of the word it's next to.  Yeah, would probaby be better off stripping punctuation because there are many malformed sentences that hurt BoW word counts
Yes, ended up stripping punctuation

Strip stop words: Yes, did this.

Initially, started out without pre processing. Then, stripped punctuation, and stop words.  Initially had lemmatiziation, but accuracy was worse.  Then, lowercased all words, which helped as well.


In [45]:
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')

# Wordnet stemmer
nltk.download("wordnet")
wnl = WordNetLemmatizer()
l = nltk.stem.snowball.EnglishStemmer()

# Lemmatization of the data using NLTK English Snowball Stemmer
def lemmatize(w):
    sentence = w.split(' ')
    new_sentence = []
    for s in sentence:
        tmp = wnl.lemmatize(s)
        new_sentence.append(l.stem(tmp))
    return " ".join(new_sentence)

def remove_stop_words(s):
    word_tokens = word_tokenize(s)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return " ".join(filtered_sentence)

# Remove punctuation
def remove_punctuation(s):
    # print("Pre punctuation removal: {}".format(s[1]))
    # print(s[1])
    clean = re.sub(r"[,.;@#?!&$]+\ *", " ", s)
    # print("Post punctuation removal: {}".format(clean))
    return clean

# Almost need to replace punctuation with space b/c of bad typing also
s = 'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!! Can\'t stop won\'t stop!'
remove_punctuation(amazon_negatives[1][1])
print(remove_punctuation("Then, as if i hadn't wasted enough of my time, they"))

print(remove_stop_words("testing this as a stop word remover that should take out things like The"))

print(lemmatize("Running with this"))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Then as if i hadn't wasted enough of my time they
testing stop word remover take things like The
run with this


In [0]:
# Perform the pre processing on the dataset
all_data = amazon_negatives + amazon_positives + imdb_negatives + imdb_positives + yelp_negatives + yelp_positives

# Remove punctuation
# print(all_data[1])
# list(map(remove_punctuation, all_data))
# Amazon pre processing
for idx, d in enumerate(amazon_negatives):
    w = d[1]
    # print(w)
    w = w.lower()
    # print(w)
    w = remove_punctuation(w)
    # print(w)
    w = remove_stop_words(w)
    # print(w)
    w = lemmatize(w)
    # print(w)
    amazon_negatives[idx] = (amazon_negatives[idx][0], w)
for idx, d in enumerate(amazon_positives):
    w = d[1]
    # print(w)
    w = remove_punctuation(w)
    # print(w)
    w = remove_stop_words(w)
    # print(w)
    w = lemmatize(w)
    # print(w)
    # w = w.lower()
    amazon_positives[idx] = (amazon_positives[idx][0], w)


# IMDB Preprocessing
for idx, d in enumerate(imdb_negatives):
    w = d[1]
    # print(w)
    w = remove_punctuation(w)
    # print(w)
    w = remove_stop_words(w)
    # print(w)
    w = lemmatize(w)
    # print(w)
    # w = w.lower()
    imdb_negatives[idx] = (imdb_negatives[idx][0], w)
for idx, d in enumerate(imdb_positives):
    w = d[1]
    # print(w)
    w = remove_punctuation(w)
    # print(w)
    w = remove_stop_words(w)
    # print(w)
    w = lemmatize(w)
    # print(w)
    # w = w.lower()
    imdb_positives[idx] = (imdb_positives[idx][0], w)


# Yelp preprocessing
for idx, d in enumerate(yelp_negatives):
    w = d[1]
    # print(w)
    w = remove_punctuation(w)
    # print(w)
    w = remove_stop_words(w)
    # print(w)
    w = lemmatize(w)
    # print(w)
    # w = w.lower()
    yelp_negatives[idx] = (yelp_negatives[idx][0], w)
for idx, d in enumerate(yelp_positives):
    w = d[1]
    # print(w)
    w = remove_punctuation(w)
    # print(w)
    w = remove_stop_words(w)
    # print(w)
    w = lemmatize(w)
    # print(w)
    # w = w.lower()
    yelp_positives[idx] = (yelp_positives[idx][0], w)


# C: Separate Training and Testing Data
In this assignment, for each file, please use the first 800 instances for each label as the training set and the remaining 200 instances as testing set. In
total, there are 2400 reviews for training and 600 reviews for testing.

In [47]:
train = amazon_positives[:400] + amazon_negatives[:400] + imdb_positives[:400] + imdb_negatives[:400] + yelp_positives[:400] + yelp_negatives[:400]
print(len(train))
test = amazon_positives[400:500] + amazon_negatives[400:500] + imdb_positives[400:500] + imdb_negatives[400:500] + yelp_positives[400:500] + yelp_negatives[400:500]
print(len(test))

# train = positives[:400] + negatives[:400]
# test = positives[400:500] + negatives[400:500]
X_train = [x[1] for x in train]
Y_train = [int(x[0]) for x in train]
X_test = [x[1] for x in test]
Y_test = [int(x[0]) for x in test]
print(X_train[0])
print(Y_train[0])
print(X_test[0])
print(Y_test[0])

2400
600
good case excel valu
1
this great deal
1


# D: Bag of words model
Extract features and then represent each review using bag of words
model, i.e., every word in the review becomes its own element in a feature vector. In order to
do this, first, make one pass through all the reviews in the training set (Explain why we can’t
use testing set at this point) and build a dictionary of unique words. Then, make another pass
through the review in both the training set and testing set and count up the occurrences of
each word in your dictionary. The ith element of a review’s feature vector is the number of
occurrences of the ith dictionary word in the review. Implement the bag of words model and
report feature vectors of any two reviews in the training set.

In [48]:
# Extract features and represent as bag of words model
# TODO: Make this into a function to use for N-Gram part 

# Dictionary of unique words in our training set
all_words = {}
for r in X_train:
    # Split up words in this review
    words = r.split(' ')
    # Initialize feature vector dict with just the words and 0 count
    for w in words:
        if w not in all_words:
            all_words[w] = 0

unique_words_list = list(all_words.keys())
print("Unique words({}): {}".format(len(unique_words_list), unique_words_list))

test = ["this phone is a good phone"]
# Create a feature matrix of all review's bag of words feature vectors
feature_matrix = []
all_reviews = X_train + X_test
# all_reviews = test
for r in all_reviews:
    # print("Looking at review: {}".format(r))
    split_review = r.split(' ')
    feature_vector = [0] * len(unique_words_list)
    for s in split_review:
        if s in unique_words_list:
            idx = unique_words_list.index(s)
            feature_vector[idx] += 1
    feature_matrix.append(feature_vector)

print(len(feature_matrix))
print(all_reviews[-1])
print(feature_matrix[-1])
print(all_reviews[17])
print(feature_matrix[17])
# print(feature_matrix[17][unique_words_list.index("situations:1.)")])

# Save a couple feature vectors to output file for submission
# np.savetxt("feature_vectors.csv", y_pred, delimiter(","))
values = [17, -1]
with open('feature_vectors.txt', 'w') as f:
    for i in values:
        f.write("{}\n".format(all_reviews[i]))
        print("Writing sentence: {}".format(all_reviews[i]))
        for idx, item in enumerate(feature_matrix[i]):
            f.write("{}, ".format(item))
        f.write("\n")

Unique words(3644): ['good', 'case', 'excel', 'valu', 'great', 'jawbon', 'the', 'mic', 'if', 'razr', 'owner', 'must', 'and', 'sound', 'qualiti', 'he', 'impress', 'go', 'origin', 'batteri', 'extend', 'veri', 'though', 'high', 'recommend', 'one', 'blue', 'tooth', 'phone', 'so', 'far', 'work', 'i', 'bought', 'use', 'kindl', 'fire', 'absolut', 'love', 'yet', 'run', 'new', 'two', 'bar', "'s", 'three', 'day', 'without', 'charg', 'pocket', 'pc', '/', 'combin', 've', 'own', '7', 'month', 'say', 'best', 'mobil', 'this', 'product', 'ideal', 'peopl', 'like', 'whose', 'ear', 'sensit', 'car', 'charger', 'well', 'ac', 'includ', 'make', 'sure', 'never', 'juic', 'highi', 'it', 'kept', 'fine', '680', 'camera', 'that', '2mp', 'pic', 'nice', 'clear', 'pictur', 'headset', 'price', 'right', 'bluetooth', 'featur', 'want', 'seem', 'made', 'protect', 'bulki', 'a', 'usabl', 'keyboard', 'actual', 'turn', 'pda', 'real-world', 'machin', 'instead', 'neat', 'gadget', 'pretti', 'sturdi', 'larg', 'problem', 'thing', 

# E: Post Processing
Since the vast majority of English words will not appear in
most of the reviews, most of the feature vector elements will be 0. This suggests that we need
a postprocessing or normalization strategy that combats the huge variance of the elements
in the feature vector. You may want to use one of the following strategies. Whatever choices
you make, explain why you made the decision.

• log-normalization f (x) =log (x +1).

• l1 normalization. x =x / | x |

• l2 normalization. x = x / ||x||

• Standardize the data by subtracting the mean and dividing by the variance.



In [0]:
# Log normalize the data
# feature_matrix = np.log(np.array(feature_matrix), where=feature_matrix>0)

from math import log

def log_norm(feature_matrix):
    feature_matrix_post = []
    for idx, feature in enumerate(feature_matrix):
        # print(feature)
        for jdx, f in enumerate(feature):
            l = log(1+f)
            # print(l)
            feature[jdx] = l
        # feature_matrix[idx] = feature
        feature_matrix_post.append(feature)
    return feature_matrix_post

feature_matrix_post = log_norm(feature_matrix)

# F: Sentiment Prediction
Train a logistic regression model (you can use existing packages here)
on the training set and test on the testing set. Report the classification accuracy and confusion matrix. Inspecting the weight vector of the logistic regression, what are the words that
play the most important roles in deciding the sentiment of the reviews? Repeat this with a
Naive Bayes classifier and compare performance.

In [50]:
# Logistic Regression prediction
lr_bow = LogisticRegression()

# Convert to np arrays
# print(X_train)
X_train_bow = np.array(feature_matrix_post[:2400]) # First 2400 values of feature matrix are training data (positive and negative)
lr_bow.fit(X_train_bow, Y_train) # Original Y_train is already set

X_test_bow = np.array(feature_matrix_post[2400:])
print(len(X_test_bow))

y_pred = lr_bow.predict(X_test_bow)
y_score = lr_bow.score(X_test_bow, Y_test)
# print(y_pred)
print(y_score)
# Generate confusion matrix from logistic regression
c_matrix = confusion_matrix(Y_test, y_pred)
print(c_matrix)

# Gaussian Naive Bayes 
gnb = GaussianNB()
# y_pred = gnb.fit(X_train_phrases, Y_train).predict(X_train_phrases)
gnb.fit(X_train_bow, Y_train)
gnb_score = gnb.score(X_test_bow, Y_test)
print("Gaussian Naive Bayes Score: {}".format(gnb_score))

# Bernoulli Naive Bayes
bnb = BernoulliNB()
# y_pred = bnb.fit(X_train_phrases, Y_train).predict(X_train_phrases)
bnb.fit(X_train_bow, Y_train)
bnb_score = bnb.score(X_test_bow, Y_test)
bnb_pred = bnb.predict(X_test_bow)
bnb_matrix = confusion_matrix(Y_test, bnb_pred)
print(bnb_matrix)
print("Bernoulli Naive Bayes Score: {}".format(bnb_score))

# Multinomial Naive Bayes
mnb = MultinomialNB()
mnb.fit(X_train_bow, Y_train)
mnb_score = mnb.score(X_test_bow, Y_test)
mnb_pred = mnb.predict(X_test_bow)
print("Multinomial Naive Bayes Score: {}".format(mnb_score))
mnb_matrix = confusion_matrix(Y_test, mnb_pred)
print(mnb_matrix)

print(mnb.coef_[0])

print(bnb.coef_[0])



600
0.8233333333333334
[[262  38]
 [ 68 232]]
Gaussian Naive Bayes Score: 0.65
[[257  43]
 [ 65 235]]
Bernoulli Naive Bayes Score: 0.82
Multinomial Naive Bayes Score: 0.8133333333333334
[[249  51]
 [ 61 239]]
[-4.61059354 -6.63127927 -5.72949555 ... -9.17943471 -9.17943471
 -9.17943471]
[-2.19390232 -4.20137036 -3.30755248 ... -7.09174212 -7.09174212
 -7.09174212]


# G: N-Gram Model
N-gram model. Similar to the bag of words model, but now you build up a dictionary of ngrams, which are contiguous sequences of words. For example, “Alice fell down the rabbit
hole” would then map to the 2-grams sequence: ["Alice fell", "fell down", "down the", "the
rabbit", "rabbit hole"], and all five of those symbols would be members of the n-gram dictionary. Try n = 2, repeat (d)-(g) and report your results.

In [51]:
# Create N-gram model from sentence string

s = "Alice fell down the rabbit hole twice"
n = 2

def generate_n_gram(s, n):
    split = s.split()
    n_gram = [None] * (len(split) - 1)
    for i in range(len(n_gram)):
        n_gram[i] = " ".join(split[i:i+2])

    return n_gram
    # print(n_gram)

generate_n_gram(s, n)

['Alice fell',
 'fell down',
 'down the',
 'the rabbit',
 'rabbit hole',
 'hole twice']

# Bag of Phrases Model


In [52]:
# Gram size of 2
n = 2

# Generate a dictionary of grams from training set
all_grams = {}
for r in X_train:
    # Generate an n gram
    phrases = generate_n_gram(r, n)
    # print(phrases)
    # Initialize feature vector dict with just the phrases and 0 count
    for p in phrases:
        if p not in all_grams:
            all_grams[p] = 0

unique_phrases_list = list(all_grams.keys())
# print("Unique phrases({}): {}".format(len(unique_phrases_list), unique_phrases_list))
# unique_words_list = list(all_words.keys())
# print("Unique words({}): {}".format(len(unique_words_list), unique_words_list))

# Generate Bag of Phrases
test = ["this phone is a good phone"]
# Create a feature matrix of all review's bag of words feature vectors
feature_matrix_phrases = []
all_reviews = X_train + X_test
# all_reviews = test
for r in all_reviews:
    # print("Looking at review: {}".format(r))
    review_phrases = generate_n_gram(r, n)
    feature_vector_phrases = [0] * len(unique_phrases_list)
    for p in review_phrases:
        if p in unique_phrases_list:
            # print("Phrase '{}' found in unique list".format(p))
            idx = unique_phrases_list.index(p)
            feature_vector_phrases[idx] += 1
    # print(feature_vector_phrases)
    feature_matrix_phrases.append(feature_vector_phrases)

print(len(feature_matrix_phrases))
print(len(feature_matrix_phrases[0]))


3000
12663


# Bag of Phrases Post Processing

In [0]:

#****************************
# TODO: Post Processing
#****************************

feature_matrix_phrases_post = log_norm(feature_matrix_phrases)

# Bag of Phrases Sentiment Prediction

In [54]:
#***************************************
# TODO: Sentiment Prediction for Phrases
#***************************************
# Logistic Regression prediction
lr_phrases = LogisticRegression(solver="lbfgs")

# Convert to np arrays
X_train_phrases = np.array(feature_matrix_phrases_post[:2400]) # First 2400 values of feature matrix are training data (positive and negative)
lr_phrases.fit(X_train_phrases, Y_train) # Original Y_train is already set
print(X_train_phrases.shape)

X_test_phrases = np.array(feature_matrix_phrases_post[2400:])
print(len(X_test_phrases))

y_pred = lr_phrases.predict(X_test_phrases)
y_score = lr_phrases.score(X_test_phrases, Y_test)
# np.savetxt("whats_cooking_results.csv", y_pred, delimiter(","))

lr_phrases_matrix = confusion_matrix(Y_test, y_pred)
print(lr_phrases_matrix)
print(y_score)

# Bernoulli Naive Bayes
bnb = BernoulliNB()
# y_pred = bnb.fit(X_train_phrases, Y_train).predict(X_train_phrases)
bnb.fit(X_train_phrases, Y_train)
bnb_score = bnb.score(X_test_phrases, Y_test)
bnb_pred_phrases = bnb.predict(X_test_phrases)
bnb_phrases_matrix = confusion_matrix(Y_test, bnb_pred_phrases)
print("Bernoulli Naive Bayes Score: {}".format(bnb_score))
print(bnb_phrases_matrix)
# print(cross_val_score(bnb, X_train_phrases, Y_train, cv=3))

# Multinomial Naive Bayes
mnb_phrases = MultinomialNB()
mnb_phrases.fit(X_train_phrases, Y_train)
mnb_phrases_score = mnb_phrases.score(X_test_phrases, Y_test)
mnb_phrases_pred = mnb_phrases.predict(X_test_phrases)
print("Multinomial Naive Bayes Score: {}".format(mnb_phrases_score))
mnb_phrases_matrix = confusion_matrix(Y_test, mnb_phrases_pred)
print(mnb_phrases_matrix)

(2400, 12663)
600
[[258  42]
 [159 141]]
0.665
Bernoulli Naive Bayes Score: 0.6816666666666666
[[260  40]
 [151 149]]
Multinomial Naive Bayes Score: 0.6766666666666666
[[253  47]
 [147 153]]


# H: PCA for Bag of Words Model
PCA for bag of words model. The features in the bag of words model have large redundancy.
Implement PCA to reduce the dimension of features calculated in (e) to 10, 50 and 100 respectively. Using these lower-dimensional feature vectors and repeat (f ), (g). Report corresponding clustering and classification results. (Note: You should implement PCA yourself,
but you can use numpy.svd or some other SVD package. Feel free to double-check your PCA
implementation against an existing one)

In [0]:
def pca_with_svd(matrix, num_features):
    # print("*******************")
    # print("*PCA With SVD IPML*")
    # print("*******************")
    matrix_mean = np.mean(matrix.T, axis=1)
    # print("Mean: {}".format(matrix_mean))
    centered = matrix - matrix_mean
    # print("Centered shape: {}".format(centered.shape))

    # TODO: Don't need to center data if we use SVD?
    # U, Sigma, V_T = np.linalg.svd(matrix)
    U, Sigma, V_T = np.linalg.svd(centered)

    top_evals = Sigma[:num_features]
    # print("Eigenvalues shape: {}".format(top_evals.shape))
    top_evectors = V_T[:num_features] # Use transpose of V_T
    # print("Evectors shape: {}".format(top_evectors.shape))

    # Project onto new feature space
    # Y = centered.T.dot(top_evectors.T)
    # result = centered.dot(top_evectors.T)
    result = np.matmul(centered, top_evectors.T)
    # print("result Shape: {}".format(result.shape))
    # return Y.T
    return result

# result_train = pca_with_svd(X_train_bow, 10)
# result_test = pca_with_svd(X_test_bow, 10)

# Perform logistic regression on BoW model after PCA on train and test data
# lr = LogisticRegression(solver="lbfgs")
# lr.fit(result_train, Y_train)
# y_pred = lr.predict(result_test)
# y_score = lr.score(result_test, Y_test)
# print("BOW Logistic Regression Score: {}".format(y_score))

In [0]:
# Fit PCA to features calculated after post processing step
# X_train_bow; X_test_bow; Y_train; Y_test

# *********************************************
# *** TESTING PCA IMPLEMENTATION VS SKLEARN ***
# *********************************************
# Perform PCA on 10, 50, and 100 features
num_features = [10, 50, 100]
# num_features = [10]
use_my_pca = True
for f in num_features:
    print("####################################")
    print("Number of PCA Features: {}".format(f))
    print("####################################")
    print("Using My PCA? {}".format(use_my_pca))
    if use_my_pca:
        X_train_bow_pca = pca_with_svd(X_train_bow, f)
        X_test_bow_pca = pca_with_svd(X_test_bow, f)
        X_train_phrases_pca = pca_with_svd(X_train_phrases, f)
        X_test_phrases_pca = pca_with_svd(X_test_phrases, f)
    else:
        pca = PCA(n_components=f)
        pca.fit(X_train_bow)
        print("Number of PCA components: {}".format(pca.n_components_))
        X_train_bow_pca = pca.transform(X_train_bow)
        X_test_bow_pca = pca.transform(X_test_bow)

    # Perform logistic regression on BoW model after PCA on train and test data
    lr = LogisticRegression(solver="lbfgs")
    lr.fit(X_train_bow_pca, Y_train)
    y_pred = lr.predict(X_test_bow_pca)
    y_score = lr.score(X_test_bow_pca, Y_test)
    lr_bow_matrix = confusion_matrix(Y_test, y_pred)
    print("BOW Logistic Regression Score: {}".format(y_score))
    print("LR Confusion BOW: {}".format(lr_bow_matrix))

    # Bernoulli Naive Bayes
    bnb_bow = BernoulliNB()
    bnb_bow.fit(X_train_bow_pca, Y_train)
    bnb_bow_score = bnb_bow.score(X_test_bow_pca, Y_test)
    bnb_bow_pred = bnb_bow.predict(X_test_bow_pca)
    print("BOW Bernoulli Naive Bayes Score: {}".format(bnb_bow_score))
    bnb_bow_matrix = confusion_matrix(Y_test, bnb_bow_pred)
    print("BNB BoW Matrix: {}".format(bnb_bow_matrix))

    # PCA For Bag of Phrases
    # pca = PCA(n_components=f)
    # pca.fit(X_train_phrases)
    # X_train_phrases_pca = pca.transform(X_train_phrases)
    # X_test_phrases_pca = pca.transform(X_test_phrases)

    # Phrases Logistic Regression 
    lr = LogisticRegression(solver="lbfgs")
    lr.fit(X_train_phrases_pca, Y_train) # Original Y_train is already set
    y_pred = lr.predict(X_test_phrases_pca)
    y_score = lr.score(X_test_phrases_pca, Y_test)
    print("Phrases Logistic Regression Score: {}".format(y_score))
    lr_phrases_matrix = confusion_matrix(Y_test, y_pred)
    print("Phrases LR Phrases Matrix: {}".format(lr_phrases_matrix))

    # Bernoulli Naive Bayes
    bnb_phrases = BernoulliNB()
    bnb_phrases.fit(X_train_phrases_pca, Y_train)
    bnb_phrases_score = bnb_phrases.score(X_test_phrases_pca, Y_test)
    bnb_phrases_pred = bnb_phrases.predict(X_test_phrases_pca)
    print("Phrases Bernoulli Naive Bayes Score: {}".format(bnb_phrases_score))
    bnb_phrases_matrix = confusion_matrix(Y_test, bnb_phrases_pred)
    print("Phrases BNB Phrases Matrix: {}".format(bnb_phrases_matrix))

####################################
Number of PCA Features: 10
####################################
Using My PCA? True


# I: Algorithms Comparison and Analysis
Algorithms comparison and analysis. According to the above results, compare the performances of bag of words, 2-gram and PCA for bag of words. Which method performs best in
the prediction task and why? What do you learn about the language that people use in online reviews (e.g., expressions that will make the posts positive/negative)? Hint: Inspect the
clustering results and the weights learned from logistic regression.

In [56]:
# Look at Logistic Regression Weights
# Map the LR feature weights to the feature matrix indices, which will tell us the
# most discriminative words

# print(lr_bow.coef_.shape)
# print(lr_bow.coef_[0])

NUMBER_OF_FEATURES = 10
print("Looking at {} top features from Logistic Regression Weights".format(NUMBER_OF_FEATURES))

# Get max coefficients
lr_bow_sorted = sorted(lr_bow.coef_[0], key=abs, reverse=True)
# print(lr_bow_sorted)
# Top N values
top_features = lr_bow_sorted[:NUMBER_OF_FEATURES]
# print(top_features)

indicies = []
# Find indicies of top 10 values
for v in top_features:
    idx = np.where(lr_bow.coef_[0] == v)
    # print("Index of {} is {}".format(v, idx[0][0]))
    indicies.append(idx[0][0])

# print(lr_bow.coef_[0][4])

# Map these indicies into our original feature vector for getting the discriminative words
for i in indicies:
    word = unique_words_list[i]
    print("Disciminative word at {} is '{}'".format(i, word))





Looking at 10 top features from Logistic Regression Weights
Disciminative word at 4 is 'great'
Disciminative word at 858 is 'bad'
Disciminative word at 38 is 'love'
Disciminative word at 849 is 'poor'
Disciminative word at 2 is 'excel'
Disciminative word at 2999 is 'delici'
Disciminative word at 86 is 'nice'
Disciminative word at 978 is 'worst'
Disciminative word at 554 is 'amaz'
Disciminative word at 288 is 'fantast'


In [57]:
# Look at weights and words for Multinomial Naive Bayes

NUMBER_OF_FEATURES = 10
print("Looking at {} top features from Multinomial Naive Bayes Weights".format(NUMBER_OF_FEATURES))

# Get max coefficients
nb_bow_sorted = sorted(mnb.coef_[0], reverse=True)
# print(lr_bow_sorted)
# Top N values
top_features = nb_bow_sorted[:NUMBER_OF_FEATURES]
print(top_features)

indicies = []
# Find indicies of top 10 values
for v in top_features:
    idx = np.where(mnb.coef_[0] == v)
    # print("Index of {} is {}".format(v, idx[0][0]))
    indicies.append(idx[0][0])

# print(lr_bow.coef_[0][4])

# Map these indicies into our original feature vector for getting the discriminative words
for i in indicies:
    word = unique_words_list[i]
    print("Disciminative word at {} is '{}'".format(i, word))

Looking at 10 top features from Multinomial Naive Bayes Weights
[-3.7189655614746924, -4.3450077554921505, -4.436750281773733, -4.610593543954772, -4.904374566510776, -5.045830700402312, -5.085370563015636, -5.121787520222974, -5.177101603132868, -5.198353031431271]
Disciminative word at 32 is 'i'
Disciminative word at 6 is 'the'
Disciminative word at 4 is 'great'
Disciminative word at 0 is 'good'
Disciminative word at 44 is ''s'
Disciminative word at 60 is 'this'
Disciminative word at 1369 is 'film'
Disciminative word at 78 is 'it'
Disciminative word at 28 is 'phone'
Disciminative word at 1346 is 'movi'


In [58]:
# Look at Logistic Regression weights for Bag of Phrases Model

# print(lr_phrases.coef_.shape)
# print(lr_phrases.coef_[0])

NUMBER_OF_FEATURES = 10
print("Looking at {} top features from Logistic Regression Weights".format(NUMBER_OF_FEATURES))
# Get max coefficients
lr_phrases_sorted = sorted(lr_phrases.coef_[0], key=abs, reverse=True)
# print(lr_phrases_sorted)
# Top N values
top_features = lr_phrases_sorted[:NUMBER_OF_FEATURES]
# print(top_features)

indicies = []
# Find indicies of top 10 values
for v in top_features:
    idx = np.where(lr_phrases.coef_[0] == v)
    # print("Index of {} is {}".format(v, idx[0][0]))
    indicies.append(idx[0][0])

# print(lr_bow.coef_[0][4])

# Map these indicies into our original feature vector for getting the discriminative words
for i in indicies:
    word = unique_phrases_list[i]
    print("Disciminative phrase at {} is '{}'".format(i, word))

Looking at 10 top features from Logistic Regression Weights
Disciminative phrase at 133 is 'i love'
Disciminative phrase at 29 is 'work great'
Disciminative phrase at 1355 is 'i like'
Disciminative phrase at 21 is 'high recommend'
Disciminative phrase at 245 is 'one best'
Disciminative phrase at 2110 is 'wast time'
Disciminative phrase at 480 is 'great phone'
Disciminative phrase at 9946 is 'the servic'
Disciminative phrase at 317 is 'great product'
Disciminative phrase at 156 is 'i realli'


In [59]:
# Look at Naive Bayes  weights for Bag of Phrases Model

# print(lr_phrases.coef_.shape)
# print(lr_phrases.coef_[0])

NUMBER_OF_FEATURES = 60
print("Looking at {} top features from Naive Bayes Phrases Weights".format(NUMBER_OF_FEATURES))
# Get max coefficients
nb_phrases_sorted = sorted(mnb_phrases.coef_[0], reverse=True)
# print(lr_phrases_sorted)
# Top N values
top_features = nb_phrases_sorted[:NUMBER_OF_FEATURES]
# print(top_features)

indicies = []
# Find indicies of top 10 values
for v in top_features:
    idx = np.where(mnb_phrases.coef_[0] == v)
    # print("Index of {} is {}".format(v, idx[0][0]))
    if idx[0][0] not in indicies:
        indicies.append(idx[0][0])

# print(lr_bow.coef_[0][4])

# Map these indicies into our original feature vector for getting the discriminative words
for i in indicies:
    word = unique_phrases_list[i]
    print("Disciminative phrase at {} is '{}'".format(i, word))

Looking at 60 top features from Naive Bayes Phrases Weights
Disciminative phrase at 133 is 'i love'
Disciminative phrase at 52 is 'i ve'
Disciminative phrase at 29 is 'work great'
Disciminative phrase at 612 is 'it 's'
Disciminative phrase at 1355 is 'i like'
Disciminative phrase at 169 is 'i think'
Disciminative phrase at 21 is 'high recommend'
Disciminative phrase at 388 is 'i 'm'
Disciminative phrase at 245 is 'one best'
Disciminative phrase at 156 is 'i realli'
Disciminative phrase at 10 is 'sound qualiti'
Disciminative phrase at 1090 is 'i n't'
Disciminative phrase at 798 is 'time i'
Disciminative phrase at 317 is 'great product'
Disciminative phrase at 18 is 'veri good'
Disciminative phrase at 19 is 'good qualiti'
