In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#######################
# Notebook metadata: a version string, a date string, and the name recorded
# under __modified_by__. Nothing else in the visible notebook reads these.
__version__ = "1.0"
__date__ = "2016-04-19"
__modified_by__ = "Aditya Tanikanti"
####################################

In [2]:
import os
import numpy as np
import pandas as pd
from utils import load_sparse_csr

In [3]:
from sklearn.cross_validation import KFold
from sklearn import metrics
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import *
from sklearn import preprocessing
from sklearn.preprocessing import *

In [4]:
# Notebook-wide configuration: seed value, working directory, data subset
# suffix, and the three data directories located under <WORK_DIR>/data.
SEED_VAL = 200
WORK_DIR = os.getcwd()
data_subset = "_0_1Percent"
(YELP_DATA_CSV_DIR,
 YELP_DATA_SPARSE_MATRIX_DIR,
 YELP_DATA_WORD_2_VEC_MODEL_DIR) = (
    os.path.join(WORK_DIR, "data", leaf)
    for leaf in ("csv", "sparse_matrix", "word2vec_model")
)

In [5]:
# Load the CSV named business_review_user<data_subset>.csv from the CSV
# data directory into a DataFrame.
read_filename = os.path.join(
    YELP_DATA_CSV_DIR,
    "".join(['business_review_user', data_subset, ".csv"])
)
df_data = pd.read_csv(read_filename, encoding='utf-8', engine='c')

In [6]:
def myNB(bow_feature_matrix_train, y_train, bow_feature_matrix_test, y_test):
    """Fit a Bernoulli naive Bayes classifier and score it on two splits.

    Parameters
    ----------
    bow_feature_matrix_train, bow_feature_matrix_test
        Feature matrices handed to ``BernoulliNB.fit`` / ``predict``.
    y_train, y_test
        True labels matching the rows of the corresponding matrix.

    Returns
    -------
    list
        ``[train_accuracy, test_accuracy]``, each computed with
        ``metrics.accuracy_score``.
    """
    # NOTE(review): the classifier is built with default settings, so any
    # thresholding BernoulliNB applies to non-binary inputs is in effect;
    # confirm that is intended for the dense matrices passed by callers.
    model = BernoulliNB()
    model.fit(bow_feature_matrix_train, y_train)

    def _accuracy(features, labels):
        # Accuracy of the fitted model's predictions against the true labels.
        return metrics.accuracy_score(labels, model.predict(features))

    # Train split is scored first, then the test split.
    return [
        _accuracy(bow_feature_matrix_train, y_train),
        _accuracy(bow_feature_matrix_test, y_test),
    ]

def myKFoldNB(X, y, n_fold):
    """Cross-validate ``myNB`` over ``n_fold`` folds and print the accuracies.

    Parameters
    ----------
    X
        Feature matrix. Must expose ``shape[0]`` and support row selection
        with an integer index array (``X[train_index]``).
    y
        Labels, one per row of ``X``. Converted with ``np.asarray`` so the
        fold indices select rows by position.
    n_fold : int
        Number of folds passed to ``KFold`` and used as the divisor for the
        average accuracies.

    Returns
    -------
    None
        The per-fold and average train/test accuracies are printed.
    """
    # KFold yields row *positions*. Indexing a pandas Series with an integer
    # array performs a label lookup instead, which only coincides with
    # positions when the index is exactly 0..n-1. Working on a plain ndarray
    # makes the selection positional regardless of the Series index.
    labels = np.asarray(y)

    res_nb_train = []
    res_nb_test = []

    for train_index, test_index in KFold(X.shape[0], n_folds=n_fold):
        train_accuracy, test_accuracy = myNB(
            X[train_index], labels[train_index],
            X[test_index], labels[test_index],
        )
        res_nb_train.append(train_accuracy)
        res_nb_test.append(test_accuracy)

    res_nb_train_avg = sum(float(acc) for acc in res_nb_train) / n_fold
    res_nb_test_avg = sum(float(acc) for acc in res_nb_test) / n_fold

    # Single-argument print(...) emits the same text under Python 2 (where the
    # parentheses are just grouping) and also runs under Python 3.
    print("NB training data accuracy: " + str(res_nb_train))
    print("NB training data average accuracy: " + str(res_nb_train_avg))
    print("NB test data accuracy:" + str(res_nb_test))
    print("NB test data average accuracy: " + str(res_nb_test_avg))

# Target labels: the review_stars column of the loaded data.
y = df_data["review_stars"]

# Bag of Words

In [7]:
# Path (without extension) of the stored bag-of-words matrix, then load the
# corresponding .npz file through the project's load_sparse_csr helper.
spare_matrix_file = os.path.join(
    YELP_DATA_SPARSE_MATRIX_DIR,
    "".join(["bagWords", data_subset])
)
feature_matrix_bag_of_words = load_sparse_csr("".join([spare_matrix_file, ".npz"]))

In [8]:
# 5-fold evaluation on the bag-of-words features.
myKFoldNB(X=feature_matrix_bag_of_words, y=y, n_fold=5)

NB training data accuracy: [0.61011235955056176, 0.61685393258426968, 0.60786516853932582, 0.60617977528089884, 0.6151685393258427]
NB training data average accuracy: 0.611235955056
NB test data accuracy:[0.4606741573033708, 0.48764044943820223, 0.46966292134831461, 0.49662921348314609, 0.48988764044943822]
NB test data average accuracy: 0.480898876404


# Bag of Words + Hand-crafted Features

In [17]:
# Path (without extension) of the stored "bagWords_feat_add" matrix, then
# load the corresponding .npz file through load_sparse_csr.
spare_matrix_file = os.path.join(
    YELP_DATA_SPARSE_MATRIX_DIR,
    "".join(["bagWords_feat_add", data_subset])
)
feature_matrix_bag_of_words_and_hand_craft_features = load_sparse_csr(
    "".join([spare_matrix_file, ".npz"])
)

In [18]:
# 5-fold evaluation on the bag-of-words + hand-crafted feature matrix.
myKFoldNB(X=feature_matrix_bag_of_words_and_hand_craft_features, y=y, n_fold=5)

NB training data accuracy: [0.61292134831460676, 0.60617977528089884, 0.60842696629213489, 0.61235955056179781, 0.60561797752808988]
NB training data average accuracy: 0.609101123596
NB test data accuracy:[0.46741573033707867, 0.46516853932584268, 0.42022471910112358, 0.45168539325842699, 0.44269662921348313]
NB test data average accuracy: 0.449438202247


# Word embedding

In [9]:
# Read the comma-separated word2vec feature matrix for the selected subset.
word2vec_feature_matrix_file = os.path.join(
    YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "".join(["word2vec_feature_matrix", data_subset, ".csv"])
)
feature_matrix_word2vec = np.genfromtxt(
    word2vec_feature_matrix_file,
    delimiter=','
)

In [10]:
# 5-fold evaluation on the word2vec feature matrix.
myKFoldNB(X=feature_matrix_word2vec, y=y, n_fold=5)

NB training data accuracy: [0.38595505617977527, 0.39101123595505616, 0.39438202247191012, 0.38370786516853933, 0.40168539325842695]
NB training data average accuracy: 0.391348314607
NB test data accuracy:[0.39101123595505616, 0.35056179775280899, 0.36179775280898874, 0.38202247191011235, 0.3707865168539326]
NB test data average accuracy: 0.371235955056


# Word2Vec + Hand-crafted Features

In [23]:
# Read the comma-separated "word2vec_add_feature_matrix" file for the
# selected subset.
word2vec_feature_matrix_file = os.path.join(
    YELP_DATA_WORD_2_VEC_MODEL_DIR,
    "".join(["word2vec_add_feature_matrix", data_subset, ".csv"])
)
feature_matrix_word2vec_and_hand_craft_features = np.genfromtxt(
    word2vec_feature_matrix_file,
    delimiter=','
)

In [24]:
# 5-fold evaluation on the word2vec + hand-crafted feature matrix.
myKFoldNB(X=feature_matrix_word2vec_and_hand_craft_features, y=y, n_fold=5)

NB training data accuracy: [0.45955056179775283, 0.45842696629213481, 0.46292134831460674, 0.46011235955056179, 0.46741573033707867]
NB training data average accuracy: 0.461685393258
NB test data accuracy:[0.47415730337078654, 0.48089887640449436, 0.43820224719101125, 0.47191011235955055, 0.43820224719101125]
NB test data average accuracy: 0.460674157303


# Hand-crafted Features

In [25]:
# Keep every row, columns 100 through 103 of the combined matrix.
# NOTE(review): presumably these four columns are the hand-crafted features
# appended after the word2vec dimensions -- confirm against the file layout.
feature_matrix_hand_craft_features = (
    feature_matrix_word2vec_and_hand_craft_features[:, slice(100, 104)]
)

In [26]:
# 5-fold evaluation on the hand-crafted feature columns only.
myKFoldNB(X=feature_matrix_hand_craft_features, y=y, n_fold=5)

NB training data accuracy: [0.45449438202247189, 0.45280898876404496, 0.46179775280898877, 0.45449438202247189, 0.46179775280898877]
NB training data average accuracy: 0.457078651685
NB test data accuracy:[0.46741573033707867, 0.47415730337078654, 0.43820224719101125, 0.46741573033707867, 0.43820224719101125]
NB test data average accuracy: 0.457078651685
