In [16]:
import numpy as np
import collections as collections
from sklearn import datasets
import graphviz
import sklearn.tree
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score, cross_val_predict, KFold
from sklearn.naive_bayes import BernoulliNB
from sklearn import metrics
from StringIO import StringIO
from pymongo import mongo_client
import pickle

In [47]:
#Connect to mongodb
client = mongo_client.MongoClient('localhost:27017')
yelp_db = client.get_database('yelp')
business_coll = yelp_db.get_collection('business_projected')

#Define column names and indices
# Maps each document field to the position it takes in a row.
cols={ 
    "BikeParking":0, "BusinessAcceptsBitcoin":1, "BusinessAcceptsCreditCards":2, "BusinessParking_Garage":3, 
    "BusinessParking_Lot":4, "BusinessParking_Street":5, "BusinessParking_Valet":6, "BusinessParking_validated":7, 
    "DogsAllowed":8, "GoodForKids":9, "HasTV":10, "NoiseLevel":11, "Open24Hours":12, "OutdoorSeating":13, "Smoking":14, 
    "WheelchairAccessible":15, "WiFi":16, "City":17, "Hours_Friday":18, "Hours_Monday":19, "Hours_Saturday":20, 
    "Hours_Sunday":21, "Hours_Thursday":22, "Hours_Tuesday":23, "Hours_Wednesday":24, "PostalCode":25, "ReviewCount":26, 
    "State":27, "Stars":28 }

#Store target column reference, i.e. Stars
# Looked up from cols (instead of repeating the literal 28) so the index cannot
# drift if the column layout above is ever edited.
targetCol = cols["Stars"]

#Get all the records from the collection and store them in the list of rows
rows = []
n_cols = len(cols)
for doc in business_coll.find():
    # Every slot starts as None, which stays in place for fields the document lacks
    row = [None] * n_cols
    for name, idx in cols.items():
        row[idx] = doc.get(name)
    rows.append(row)

In [48]:
#Store rows in a numpy array
# dtype=object is forced so that mixed values (None, strings, numbers) are kept
# as-is. Left to inference, NumPy may coerce everything to a fixed-width string
# dtype when no None is present, and the in-place write of integer codes done by
# the label-encoding step could then be silently truncated.
np_records = np.asarray(rows, dtype=object)

In [49]:
#Perform label encoding on the dataset
from sklearn import preprocessing

# One encoder per column, keyed by column index, so the original labels of any
# column can still be recovered after encoding (a single shared encoder only
# remembers the classes of whichever column it was fitted on last).
encoders = {}
for col_idx in range(len(cols)):
    col_encoder = preprocessing.LabelEncoder()
    column = np_records[:, col_idx]  # basic slice: a view, so the write below updates np_records
    column[:] = col_encoder.fit_transform(column)
    encoders[col_idx] = col_encoder

# Later cells read the target's class labels from `le`; bind it explicitly to
# the target column's encoder rather than relying on that column being encoded last.
le = encoders[targetCol]

# All cells now hold integer codes, so the array can be given an integer dtype
np_records = np_records.astype(long)

In [50]:
#Retrieve the labels for the target column
# `le` is the encoder fitted on the target column, so le.classes_ lists the
# original star ratings in encoded order (position k is the label for code k).
# The classes are read directly; decoding the whole column first is unnecessary.
targs = [str(label) for label in le.classes_]
print(targs)

['1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0']


In [52]:
# Label vector: the encoded target column
y = np_records[:, targetCol]

# Predictor matrix: every column to the left of the target column
x_Matrix = np_records[:, :targetCol]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
(x_train, x_test,
 y_train, y_test) = train_test_split(x_Matrix, y, random_state=0, test_size=0.2)

In [53]:
### Initiate classifier
# Decision-tree baseline: 5-fold cross-validation on the training split,
# followed by an evaluation on the held-out test split.
# random_state pins the tree's internal randomness so re-running the cell
# prints the same scores.
# NOTE(review): the variable name says "infoGain", but DecisionTreeClassifier
# uses criterion='gini' unless told otherwise - pass criterion='entropy' if
# information gain was the intent.
infoGain_clf = sklearn.tree.DecisionTreeClassifier(random_state=0)

#K folds score
cross_validator = KFold(x_train.shape[0], n_folds=5, shuffle=True, random_state=33)
print(cross_val_score(infoGain_clf, x_train, y_train, cv=cross_validator))


#Fit and evaluate against test set
infoGain_clf.fit(x_train, y_train)
pred = infoGain_clf.predict(x_test)
# Accuracy is computed from the predictions already made; score() would run
# predict on x_test a second time to produce the same number.
print(metrics.accuracy_score(y_test, pred))


#print the classification report
print(metrics.classification_report(y_test, pred, target_names=targs))

[ 0.20508319  0.20668742  0.20840316  0.20792435  0.20437315]
0.208184371808
             precision    recall  f1-score   support

        1.0       0.06      0.08      0.07       544
        1.5       0.04      0.05      0.04       677
        2.0       0.08      0.09      0.09      1683
        2.5       0.14      0.14      0.14      2835
        3.0       0.19      0.19      0.19      4412
        3.5       0.22      0.22      0.22      5620
        4.0       0.26      0.24      0.25      6428
        4.5       0.21      0.20      0.20      4458
        5.0       0.30      0.28      0.29      4671

avg / total       0.21      0.21      0.21     31328



In [54]:
### Initiate classifier
# BernoulliNB is already imported in the notebook's import cell, so it is not
# imported again here.
# NOTE(review): with its default binarize=0.0, BernoulliNB thresholds every
# feature to 0/1, so each label-encoded column is reduced to "code 0 vs. any
# other code" - confirm this loss of information is intended.
nbc = BernoulliNB()

#K folds score
cross_validator_2 = KFold(x_train.shape[0], n_folds=5, shuffle=True, random_state=33)
print(cross_val_score(nbc, x_train, y_train, cv=cross_validator_2))

#Fit and evaluate against test set
nbc.fit(x_train, y_train)
pred_2 = nbc.predict(x_test)
print(metrics.classification_report(y_test, pred_2, target_names=targs))

[ 0.2261102   0.22795467  0.23226399  0.22819408  0.22811428]
             precision    recall  f1-score   support

        1.0       0.06      0.47      0.11       544
        1.5       0.00      0.00      0.00       677
        2.0       0.12      0.00      0.00      1683
        2.5       0.17      0.04      0.06      2835
        3.0       0.22      0.19      0.20      4412
        3.5       0.22      0.04      0.06      5620
        4.0       0.27      0.53      0.36      6428
        4.5       0.00      0.00      0.00      4458
        5.0       0.27      0.52      0.35      4671

avg / total       0.19      0.23      0.17     31328



  'precision', 'predicted', average, warn_for)


In [60]:
#K-nearest neighbors
from sklearn import neighbors

# Uniformly weighted k-NN; n_neighbors is the k being tuned by hand
knearest = neighbors.KNeighborsClassifier(weights='uniform', n_neighbors=40)

#K folds score
# Same 5-fold shuffled scheme (seed 33) as the other classifiers in this notebook
cross_validator_knn = KFold(x_train.shape[0], n_folds=5, random_state=33, shuffle=True)
knn_fold_scores = cross_val_score(knearest, x_train, y_train, cv=cross_validator_knn)
print(knn_fold_scores)

# Fit on the full training split; kept as the last expression so the cell
# still displays the fitted estimator
knearest.fit(x_train, y_train)

[ 0.23688305  0.24275796  0.24168063  0.23872795  0.2404437 ]


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=40, p=2,
           weights='uniform')

In [58]:
# Predict labels for the held-out rows with the fitted k-NN model.
# NOTE(review): this cell's execution count (In [58]) is lower than that of the
# cell that fits `knearest` (In [60]), so the saved predictions may come from
# an earlier fit - re-run the notebook top to bottom to confirm.
knnPred = knearest.predict(x_test)

In [59]:
# Per-class precision / recall / F1 of the k-NN predictions on the test split
knn_report = metrics.classification_report(y_test, knnPred, target_names=targs)
print(knn_report)

             precision    recall  f1-score   support

        1.0       0.11      0.02      0.03       544
        1.5       0.03      0.00      0.00       677
        2.0       0.10      0.02      0.04      1683
        2.5       0.14      0.06      0.09      2835
        3.0       0.20      0.21      0.21      4412
        3.5       0.22      0.29      0.25      5620
        4.0       0.26      0.32      0.29      6428
        4.5       0.22      0.12      0.15      4458
        5.0       0.32      0.50      0.39      4671

avg / total       0.22      0.25      0.22     31328

