# Capstone Project

In [442]:
import requests
import random
import pandas as pd
import numpy as np
import sqlite3 as lite
from pandas.io.json import json_normalize
import matplotlib.pyplot as plt
import time
import datetime
from dateutil.parser import parse
import collections
import json
import ijson
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold

## Iteratively load json's line by line

- Download the dataset from the Yelp challenge site:
https://www.yelp.com/dataset_challenge/dataset

## Create dataset from first 25,000 reviews:

In [443]:
sample_size = 50000

In [444]:
data = []
data_test = []
y = []
X = []
with open('yelp_academic_dataset_review.json','rU') as reviews:
    i = 0
    while i < sample_size:
        first_line = json.loads(reviews.readline())
        if first_line['stars'] != 3:
            if first_line['stars'] < 3:
                first_line['stars'] = 0
            else:
                first_line['stars'] = 1
            data.append([first_line['stars'], first_line['text']])
            y.append(first_line['stars'])
            X.append(first_line['text'])
        i += 1
X = np.array(X)
y = np.array(y)

## Sort reviews by star number, make dataframes:

In [445]:
df = pd.DataFrame(data, columns = ['stars','text']).sort('stars')
df.reset_index(drop=True, inplace=True)

In [446]:
df.head()

Unnamed: 0,stars,text
0,0,"I have this place one star, because it can't g..."
1,0,I was quoted a price for a new pair of glasses...
2,0,There is no doubt that the ambiance is better ...
3,0,I wouldn't even give them one star at this loc...
4,0,I went into Kane and Company for the first tim...


## Clean text for Bag-of-Words:

In [447]:
example1 = BeautifulSoup(X[0])
example1.get_text()

u'Mr Hoagie is an institution. Walking in, it does seem like a throwback to 30 years ago, old fashioned menu board, booths out of the 70s, and a large selection of food. Their speciality is the Italian Hoagie, and it is voted the best in the area year after year. I usually order the burger, while the patties are obviously cooked from frozen, all of the other ingredients are very fresh. Overall, its a good alternative to Subway, which is down the road.'

In [448]:
# example1 = BeautifulSoup(df['text'][75])
# example1.get_text()

In [449]:
def review_to_words(raw_review):
    #     1) Remove HTML:
    review_text = BeautifulSoup(raw_review).get_text()
    #     2) Remove non-letters:
    letters_only = re.sub('[^a-zA-Z]',' ', review_text)
    #     3) Convert to lower case, split into words:
    words = letters_only.lower().split()
    #     4) Convert stopwords to set:
    stops = set(stopwords.words('english'))
    #     5) Remove stopwords:
    meaningful_words = [w for w in words if not w in stops]
    #     6) Join words back into one string separated by space:
    return( ' '.join(meaningful_words))

In [450]:
clean_example = review_to_words(X[0])
clean_example

u'mr hoagie institution walking seem like throwback years ago old fashioned menu board booths large selection food speciality italian hoagie voted best area year year usually order burger patties obviously cooked frozen ingredients fresh overall good alternative subway road'

In [451]:
num_reviews = X.size
num_reviews

42209

In [452]:
print "Cleaning and parsing the Yelp reviews...\n"
clean_reviews = []
for i in xrange(0, num_reviews):
    if (i+1)%1000 == 0:
        print "Review %d of %d\n" % (i+1,num_reviews)
    clean_reviews.append(review_to_words(X[i]))

Cleaning and parsing the Yelp reviews...

Review 1000 of 42209

Review 2000 of 42209

Review 3000 of 42209

Review 4000 of 42209

Review 5000 of 42209

Review 6000 of 42209

Review 7000 of 42209

Review 8000 of 42209

Review 9000 of 42209

Review 10000 of 42209

Review 11000 of 42209

Review 12000 of 42209

Review 13000 of 42209

Review 14000 of 42209

Review 15000 of 42209

Review 16000 of 42209

Review 17000 of 42209

Review 18000 of 42209

Review 19000 of 42209

Review 20000 of 42209

Review 21000 of 42209

Review 22000 of 42209

Review 23000 of 42209

Review 24000 of 42209

Review 25000 of 42209

Review 26000 of 42209

Review 27000 of 42209

Review 28000 of 42209

Review 29000 of 42209

Review 30000 of 42209

Review 31000 of 42209

Review 32000 of 42209

Review 33000 of 42209

Review 34000 of 42209

Review 35000 of 42209

Review 36000 of 42209

Review 37000 of 42209

Review 38000 of 42209

Review 39000 of 42209

Review 40000 of 42209

Review 41000 of 42209

Review 42000 of 42209



In [453]:
clean_reviews[0]

u'mr hoagie institution walking seem like throwback years ago old fashioned menu board booths large selection food speciality italian hoagie voted best area year year usually order burger patties obviously cooked frozen ingredients fresh overall good alternative subway road'

## Vectorize word counts, train on training set:

In [454]:
print 'Creating the bag of words...\n'
vectorizer = CountVectorizer(analyzer='word',
                             tokenizer=None,
                             preprocessor=None,
                             stop_words=None,
                             max_features=5000)

data_features = vectorizer.fit_transform(clean_reviews)
data_features = data_features.toarray()

Creating the bag of words...



## Vectorize testing set words:

In [455]:
data_features.shape

(42209, 5000)

In [456]:
data_features.shape

(42209, 5000)

In [457]:
vocab = vectorizer.get_feature_names()
vocab[1:20]

[u'able',
 u'absolute',
 u'absolutely',
 u'ac',
 u'accept',
 u'acceptable',
 u'accepted',
 u'access',
 u'accessible',
 u'accessories',
 u'accident',
 u'accidentally',
 u'accommodate',
 u'accommodating',
 u'accompanied',
 u'accompanying',
 u'according',
 u'account',
 u'accurate']

In [458]:
dist = np.sum(data_features, axis=0)
for tag, count in zip(vocab[1:20], dist):
    print count, tag

87 able
1726 absolute
262 absolutely
1625 ac
127 accept
181 acceptable
134 accepted
82 access
275 accessible
86 accessories
75 accident
105 accidentally
62 accommodate
156 accommodating
355 accompanied
81 accompanying
48 according
108 account
197 accurate


## Partition the data into folds

In [459]:
skf = StratifiedKFold(y, n_folds=10)

In [460]:
# for train, test in skf:
# #     print type(train)
# #     print type(test)
#     print 'TRAIN:', train, 'TEST:', test
#     X_train, X_test = X[train], X[test]
#     y_train, y_test = y[train], y[test]

## Generate random forest model

In [461]:
print 'Training the random forest...'

forest = RandomForestClassifier(n_estimators=100)

Training the random forest...


In [462]:
for train, test in skf:
    print 'Training the random forest...'
    forest = forest.fit(data_features[train],
                    y[train])
#     print data_features[train]

Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...
Training the random forest...


In [463]:
precision_list = []
recall_list = []
output_list = []
f1_score_list = []
confusion_list = []
for train, test in skf:
    print 'Testing using the random forest...'
    result = forest.predict(data_features[test])
    precision = metrics.precision_score(y[test],result)
    precision_list.append(precision)
    recall = metrics.recall_score(y[test],result)
    recall_list.append(recall)
    f1_score = metrics.f1_score(y[test],result)
    f1_score_list.append(f1_score)
    confusion = metrics.confusion_matrix(y[test],result)
    confusion_list.append(confusion)
    output = pd.DataFrame(data={'text':X[test],'stars_pred':result})
    output_list.append(output)
    print 'PRECISION:', precision, 'RECALL:', recall, 'F1 SCORE:', f1_score, 'CONFUSION MATRIX:', confusion

Testing using the random forest...
PRECISION: 1.0 RECALL: 1.0 F1 SCORE: 1.0 CONFUSION MATRIX: [[1078    0]
 [   0 3144]]
Testing using the random forest...
PRECISION: 1.0 RECALL: 1.0 F1 SCORE: 1.0 CONFUSION MATRIX: [[1078    0]
 [   0 3144]]
Testing using the random forest...
PRECISION: 0.999682034976 RECALL: 1.0 F1 SCORE: 0.999840992209 CONFUSION MATRIX: [[1077    1]
 [   0 3144]]
Testing using the random forest...
PRECISION: 1.0 RECALL: 1.0 F1 SCORE: 1.0 CONFUSION MATRIX: [[1078    0]
 [   0 3143]]
Testing using the random forest...
PRECISION: 0.999681832644 RECALL: 0.999681832644 F1 SCORE: 0.999681832644 CONFUSION MATRIX: [[1077    1]
 [   1 3142]]
Testing using the random forest...
PRECISION: 1.0 RECALL: 1.0 F1 SCORE: 1.0 CONFUSION MATRIX: [[1078    0]
 [   0 3143]]
Testing using the random forest...
PRECISION: 1.0 RECALL: 1.0 F1 SCORE: 1.0 CONFUSION MATRIX: [[1077    0]
 [   0 3143]]
Testing using the random forest...
PRECISION: 1.0 RECALL: 0.999363665288 F1 SCORE: 0.999681731381 

## Aggregate statistics on each fold

In [464]:
mean_precision = np.mean(precision_list)
mean_recall = np.mean(recall_list)
mean_f1 = np.mean(f1_score_list)
std_precision = np.std(precision_list)
std_recall = np.std(recall_list)
std_f1 = np.std(f1_score_list)
print 'MEAN PRECISION:', mean_precision
print 'STD PRECISION:', std_precision
print 'MEAN RECALL:', mean_recall
print 'STD RECALL:', std_recall
print 'MEAN F1 SCORE:', mean_f1
print 'STD F1 SCORE', std_f1
print 'CONFUSION MATRIX LIST:', confusion_list

MEAN PRECISION: 0.989199113503
STD PRECISION: 0.0321908597979
MEAN RECALL: 0.996977410118
STD RECALL: 0.00875191592932
MEAN F1 SCORE: 0.99292457085
STD F1 SCORE 0.0209615152336
CONFUSION MATRIX LIST: [array([[1078,    0],
       [   0, 3144]]), array([[1078,    0],
       [   0, 3144]]), array([[1077,    1],
       [   0, 3144]]), array([[1078,    0],
       [   0, 3143]]), array([[1077,    1],
       [   1, 3142]]), array([[1078,    0],
       [   0, 3143]]), array([[1077,    0],
       [   0, 3143]]), array([[1077,    0],
       [   2, 3141]]), array([[1077,    0],
       [   0, 3143]]), array([[ 710,  367],
       [  92, 3051]])]


## Evaluate model predictions:

- Mean, STD of Sensitivity and specificity analysis, check bias