In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Grab and process the raw data.
data_path = ("yelp_labelled.txt")
review_raw = pd.read_csv(data_path, delimiter= '\t', header=None)
review_raw.columns = ['review', 'good_rating']

# Questions to answer:
- Do any of your classifiers seem to overfit?
The two classifiers that appear to overfit are: 
- File #2, where I only used features that have a strong correlation with food (words like tasty, flavorful, or delicious).  With 20% Holdout: 0.565, Testing on Sample: 0.529
File #4, interestingly. I only trained on the words 'good' and 'great' (which can also be turned into 'not good' or 'not great', which I did in file #5... w/ a very slight difference. With 20% Holdout: 0.59. Testing on Sample: 0.586). 

- Which seem to perform the best? Why?
The best performance did come from my very first set, the classifiers listed below. With 20% Holdout: 0.64
Testing on Sample: 0.655.

- Which features seemed to be most impactful to performance?
The greatest impact I could make on the classifier was by removing, jointly, the words 'best','perfect', and 'loved'. Together this resulted in a .62/.649 result. 


In [4]:
review_raw

Unnamed: 0,review,good_rating
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
5,Now I am getting angry and I want my damn pho.,0
6,Honeslty it didn't taste THAT fresh.),0
7,The potatoes were like rubber and you could te...,0
8,The fries were great too.,1
9,A great touch.,1


# Note: I took out spaces, as well as using all_caps as an identifier. all_caps is more often associated with negative reviews.

In [5]:
# Enumerate our spammy keywords.
keywords = ['great' , 'amazing', 'delicious','loved','recommend','perfect','tasty','great','thumbs up','friendly',
            'incredible','fantastic','satisfied','inviting','welcome','special','flavourful','flavorful','outstanding',
           'exceptional','hit','perfect','fantastic','best','happy','perfect','5 stars']


for key in keywords:
    review_raw[str(key)] = review_raw.review.str.contains(str(key), case=True)

In [6]:
review_raw[str(key)].value_counts()

False    997
True       3
Name: 5 stars, dtype: int64

In [7]:
review_raw['good_rating'].value_counts()

1    500
0    500
Name: good_rating, dtype: int64

In [8]:
review_raw.head()

Unnamed: 0,review,good_rating,great,amazing,delicious,loved,recommend,perfect,tasty,thumbs up,...,welcome,special,flavourful,flavorful,outstanding,exceptional,hit,best,happy,5 stars
0,Wow... Loved this place.,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,Crust is not good.,0,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Not tasty and the texture was just nasty.,0,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,Stopped by during the late May bank holiday of...,1,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The selection on the menu was great and so wer...,1,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# sms_raw['good_rating'] = (sms_raw['good_rating'] == 1)
# review_raw['rating'] = (review_raw['rating'] == 'rating')
data = review_raw[keywords]
target = review_raw['good_rating']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)

In [10]:
type(y_pred)  # (y_pred == 1)
# y_pred.sum(y_pred == 1)
y_pred[0], y_pred[1], y_pred[2], y_pred[3] 
print("Percentage Good Ratings Predicted: ",(y_pred == 1).mean())
print("Percentage Bad Ratings Predicted: ",(y_pred == 0).mean())

Percentage Good Ratings Predicted:  0.175
Percentage Bad Ratings Predicted:  0.825


In [11]:
print("Target Values: ")
target.value_counts()


Target Values: 


1    500
0    500
Name: good_rating, dtype: int64

In [12]:
type(target)

pandas.core.series.Series

In [13]:
target.value_counts(sort=True)

1    500
0    500
Name: good_rating, dtype: int64

In [14]:
Actual_Outcomes = target.value_counts(normalize=False)

In [15]:
print(Actual_Outcomes)

1    500
0    500
Name: good_rating, dtype: int64


In [16]:
type(Actual_Outcomes)

pandas.core.series.Series

In [17]:
Actual_Outcomes[0]

500

In [18]:
Actual_Outcomes[1]

500

In [19]:
type(y_pred)

numpy.ndarray

In [20]:
# For an ndarray... need to count
pred_true = (y_pred == True).sum()
print(pred_true)

175


In [21]:
# For an ndarray... need to count
pred_false = (y_pred == False).sum()
print(pred_false)

825


In [22]:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[490,  10],
       [335, 165]])

In [23]:

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 345


In [24]:
print((target != y_pred).sum())

345


In [25]:
(target != y_pred).sum()/(pred_true+pred_false)

0.345

In [26]:
1 - (target != y_pred).sum()/(pred_true+pred_false)

0.655

In [27]:
# Where the target and the predictor were both true
(target == y_pred).sum()

655

In [28]:
# Where the target was not equal to the predictor (where the predictor was wrong)
(target != y_pred).sum()

345

# Stopping here to find variables and predictor for my analysis

In [29]:
good_ratings_sum = Actual_Outcomes[0]
bad_ratings_sum = Actual_Outcomes[1]

print("Real Good Ratings: ", good_ratings_sum)
print("Real Bad Ratings: ", bad_ratings_sum)

Real Good Ratings:  500
Real Bad Ratings:  500


In [30]:
print("Predicted Correctly: ", pred_true)
print("Predicted Incorrectly: ", pred_false)

Predicted Correctly:  175
Predicted Incorrectly:  825


In [31]:
# Number of outcomes where model failed
wrong_prediction = (target != y_pred).sum()
print("wrong prediction: ",wrong_prediction)

wrong prediction:  345


In [32]:
inaccuracy = wrong_prediction/num_messages
accuracy = 1 - wrong_prediction/num_messages

print("% Accuracy =: %", accuracy*100)

NameError: name 'num_messages' is not defined

# Just hold on to the code below

In [None]:
# Total # of Spams - Correctly Identified Spams
spam_true = (sms_raw['spam'] == True).sum()
print(spam_true)

In [33]:

# Display our results.
print("Number of correctly labeled points out of a total {} points : {}".format(
    data.shape[0],
    (target == y_pred).sum()
))

Number of correctly labeled points out of a total 1000 points : 655


In [34]:
((y_pred == True) == (Actual_Outcomes[1] == True)).sum()

825

In [35]:
# Back to the confusion matrix:
from sklearn.metrics import confusion_matrix
confusion_matrix(target, y_pred)

array([[490,  10],
       [335, 165]])

# Hold on to the code below

In [36]:
# 604 - 549 = 55
# total wrong predictions - correctly identified spam = incorrectly identified as spam
# total wrong predictions are (target != y_pred).sum() 
# correctly identified spam: target is true when you have spam
# y_pred is true when predicted correctly
# so: when target == true and y_pred == true, you have correctly identified spam
total_errors = (target != y_pred).sum() 
print(total_errors)

correctly_spam = (target == y_pred).sum()
print("Correctly identified: ", correctly_spam)
# and 
print("False y_pred, wrong model: ", (y_pred == False).sum())
print("True y_pred, correct model: ", (y_pred == True).sum())



345
Correctly identified:  655
False y_pred, wrong model:  825
True y_pred, correct model:  175


In [37]:
# Actual Errors
((target == True) != (y_pred == True)).sum()

# Errors that were originally spam


345

In [38]:
# Actually spam and correctly identified as such
pos_correctly_identified = ((target == True) & (y_pred == True)).sum()
print(pos_correctly_identified)

165


In [39]:
# Actually spam and misidentified as ham
false_neg = ((target == True) & (y_pred == False)).sum()
print(false_neg)

335


In [40]:
# Not spam, and correctly not identified as such
neg_correctly_identified = ((target == False) & (y_pred == False)).sum()
print(neg_correctly_identified)

490


In [41]:
# Not spam, and identified as such
false_pos = ((target == False) & (y_pred == True)).sum()
print(false_pos)

10


In [42]:
target.count()

1000

In [43]:
from IPython.display import display

In [44]:
confused_df = {'col1': [neg_correctly_identified, false_neg], 'col2': [false_pos, pos_correctly_identified ]}
df = pd.DataFrame(confused_df)

# Interpreting the confusion matrix:
- 490: Bad ratings correctly identified
-  10: Bad ratings incorrectly identified as good
-  335: Good ratings incorrectly identified as bad
- 165: Good ratings correctly identified as good

In [45]:
display(df)

Unnamed: 0,col1,col2
0,490,10
1,335,165


# Now, creating the holdout groups

In [46]:
# Test your model with different holdout groups.
# Took out random seed of 20

from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.665
Testing on Sample: 0.655


# Now, looking at cross validation:
 The array that cross_val_score returns is a series of accuracy scores with a different hold out group each time. If our model is overfitting at a variable amount, those scores will fluctuate. Instead, ours are relatively consistent.

In [47]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.65, 0.6 , 0.68, 0.62, 0.7 , 0.64, 0.66, 0.63, 0.6 , 0.66])

# Need to code up my own cross validation here:
