In [45]:
#Initializing
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

***
# First Run (Yelp Reviews)

In [46]:
from sklearn.naive_bayes import BernoulliNB

# Import the raw data.
# The file is read as tab-separated with no header row; the two columns are
# named "review" (the sentence) and "posneg" (a numeric label) below.
# NOTE(review): absolute, machine-specific path -- adjust it to wherever your
# copy of the dataset lives.
df = pd.read_csv('/Users/Kevin/Files/Thinkful/Data Files/sentiment labelled sentences/yelp_labelled.txt', sep="\t", header=None)
df.columns = ["review", "posneg"]

# creating pos and neg columns (boolean flags derived from the numeric label)
df['pos'] = (df['posneg'] == 1)
df['neg'] = (df['posneg'] == 0)

# making reviews lowercase to create keyword features columns.
# Series.str.lower() returns a new Series, so the result must be assigned back;
# calling it on a line of its own leaves the column unchanged.
df['review'] = df['review'].str.lower()

#Adding keyword features
# other keywords: ('wow', 'correct', 'love', 'best', 'great', 'recommend', 'excellent')
pos_keywords = ['good', 'amazing']

#creating each keyword feature: True when the review contains the keyword as a substring
for key in pos_keywords:
    # regex=False matches the keyword as literal text instead of a regular expression.
    df[key] = df['review'].str.contains(key, case=False, regex=False)

#Variables for model
data = df[pos_keywords]
target = df['pos']

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()
# Fit our model to the data.
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample predictions).
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 432


# Evaluation

In [56]:
# Evaluate the classifier fitted in the preceding run cell.
# NOTE: this cell reads `bnb`, `data`, `target` and `y_pred` from the notebook's
# global state without defining them, so it reports on whichever run cell was
# executed last. Run it immediately after the run it is meant to describe.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# 10-fold cross-validation of the same estimator on the same features and labels.
cvs = cross_val_score(bnb, data, target, cv=10)
print('Cross Validation Scores = {}'.format(cvs))

print()

# Confusion matrix of the labels in `target` against the predictions in `y_pred`.
cf_mx = confusion_matrix(target, y_pred)

print('Confusion Matrix = \n {}'.format(cf_mx))

print()

# Element-wise masks shared by the counts below.
actual_pos = (target == True)
actual_neg = (target == False)
pred_pos = (y_pred == True)
pred_neg = (y_pred == False)

#total number of rows evaluated
total = data.shape[0]
print('Total = {}'.format(total))

#"Total Yes": predictions that agree with the label (TP + TN)
total_yes = (target == y_pred).sum()
print('Total Yes = {}'.format(total_yes))

#"Total No": predictions that disagree with the label (FP + FN)
total_no = (target != y_pred).sum()
print('Total No = {}'.format(total_no))

print()

# Total Actual - Yes (rows labelled positive)
actual_yes = actual_pos.sum()
print('Actual Yes = {}'.format(actual_yes))

# Total Actual - No (rows labelled negative)
actual_no = actual_neg.sum()
print('Actual No = {}'.format(actual_no))

# Total Prediction - Yes (rows predicted positive)
predicted_yes = pred_pos.sum()
print('Predicted Yes = {}'.format(predicted_yes))

# Total Prediction - No (rows predicted negative)
predicted_no = pred_neg.sum()
print('Predicted No = {}'.format(predicted_no))

print()

#true positives (TP): we predicted positive and the review is labelled positive.
tp = np.logical_and(actual_pos, pred_pos).sum()
print('TP = {}'.format(tp))

#true negatives (TN): we predicted negative and the review is labelled negative.
tn = np.logical_and(actual_neg, pred_neg).sum()
print('TN = {}'.format(tn))

#false positives (FP): we predicted positive but the review is labelled negative. (Also known as a "Type I error.")
fp = np.logical_and(actual_neg, pred_pos).sum()
print('FP = {}'.format(fp))

#false negatives (FN): we predicted negative but the review is labelled positive. (Also known as a "Type II error.")
fn = np.logical_and(actual_pos, pred_neg).sum()
print('FN = {}'.format(fn))

print()

# The rates below are printed with the '.2' format spec, i.e. two significant digits.

#Accuracy: Overall, how often is the classifier correct?
#(TP+TN)/total
accuracy = (tp + tn) / total
print('Accuracy = {:.{prec}}'.format(accuracy, prec=2))

#Misclassification Rate ("Error Rate"): Overall, how often is it wrong? (equivalent to 1 minus Accuracy)
#(FP+FN)/total
error_rate = (fp + fn) / total
print('Error Rate = {:.{prec}}'.format(error_rate, prec=2))

#True Positive Rate ("Sensitivity"): When it's actually yes, how often does it predict yes? (also known as "Recall")
#TP/actual yes
sensitivity = tp / actual_yes
print('Sensitivity = {:.{prec}}'.format(sensitivity, prec=2))

#False Positive Rate: When it's actually no, how often does it predict yes?
#FP/actual no
false_positive = fp / actual_no
print('False Positive Rate = {:.{prec}}'.format(false_positive, prec=2))

#True Negative Rate ("Specificity"): When it's actually no, how often does it predict no? (equivalent to 1 minus False Positive Rate)
#TN/actual no
specificity = tn / actual_no
print('Specificity = {:.{prec}}'.format(specificity, prec=2))

#Precision: When it predicts yes, how often is it correct?
#TP/predicted yes
precision = tp / predicted_yes
print('Precision = {:.{prec}}'.format(precision, prec=2))

#Prevalence: How often does the yes condition actually occur in our sample?
#actual yes/total
prevalence = actual_yes / total
print('Prevalence = {:.{prec}}'.format(prevalence, prec=2))

Cross Validation Scores = [0.64 0.69 0.69 0.73 0.62 0.61 0.67 0.66 0.65 0.69]

Confusion Matrix = 
 [[458  42]
 [289 211]]

Total = 1000
Total Yes = 669
Total No = 331

Actual Yes = 500
Actual No = 500
Predicted Yes = 253
Predicted No = 747

TP = 211
TN = 458
FP = 42
FN = 289

Accuracy = 0.67
Error Rate = 0.33
Sensitivity = 0.42
False Positive Rate = 0.084
Specificity = 0.92
Precision = 0.83
Prevalance = 0.5


***
# Second Run (Yelp Reviews)

Adding 'great' and 'recommend'

In [48]:
from sklearn.naive_bayes import BernoulliNB

# Import the raw data.
# The file is read as tab-separated with no header row; the two columns are
# named "review" (the sentence) and "posneg" (a numeric label) below.
# NOTE(review): absolute, machine-specific path -- adjust it to wherever your
# copy of the dataset lives.
df = pd.read_csv('/Users/Kevin/Files/Thinkful/Data Files/sentiment labelled sentences/yelp_labelled.txt', sep="\t", header=None)
df.columns = ["review", "posneg"]

# creating pos and neg columns (boolean flags derived from the numeric label)
df['pos'] = (df['posneg'] == 1)
df['neg'] = (df['posneg'] == 0)

# making reviews lowercase to create keyword features columns.
# Series.str.lower() returns a new Series, so the result must be assigned back;
# calling it on a line of its own leaves the column unchanged.
df['review'] = df['review'].str.lower()

#Adding keyword features
# other keywords: ('wow', 'correct', 'love', 'best', 'excellent')
pos_keywords = ['good', 'amazing', 'great', 'recommend']

#creating each keyword feature: True when the review contains the keyword as a substring
for key in pos_keywords:
    # regex=False matches the keyword as literal text instead of a regular expression.
    df[key] = df['review'].str.contains(key, case=False, regex=False)

#Variables for model
data = df[pos_keywords]
target = df['pos']

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()
# Fit our model to the data.
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample predictions).
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 370


# Evaluation

In [57]:
# Evaluate the classifier fitted in the preceding run cell.
# NOTE: this cell reads `bnb`, `data`, `target` and `y_pred` from the notebook's
# global state without defining them, so it reports on whichever run cell was
# executed last. Run it immediately after the run it is meant to describe.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# 10-fold cross-validation of the same estimator on the same features and labels.
cvs = cross_val_score(bnb, data, target, cv=10)
print('Cross Validation Scores = {}'.format(cvs))

print()

# Confusion matrix of the labels in `target` against the predictions in `y_pred`.
cf_mx = confusion_matrix(target, y_pred)

print('Confusion Matrix = \n {}'.format(cf_mx))

print()

# Element-wise masks shared by the counts below.
actual_pos = (target == True)
actual_neg = (target == False)
pred_pos = (y_pred == True)
pred_neg = (y_pred == False)

#total number of rows evaluated
total = data.shape[0]
print('Total = {}'.format(total))

#"Total Yes": predictions that agree with the label (TP + TN)
total_yes = (target == y_pred).sum()
print('Total Yes = {}'.format(total_yes))

#"Total No": predictions that disagree with the label (FP + FN)
total_no = (target != y_pred).sum()
print('Total No = {}'.format(total_no))

print()

# Total Actual - Yes (rows labelled positive)
actual_yes = actual_pos.sum()
print('Actual Yes = {}'.format(actual_yes))

# Total Actual - No (rows labelled negative)
actual_no = actual_neg.sum()
print('Actual No = {}'.format(actual_no))

# Total Prediction - Yes (rows predicted positive)
predicted_yes = pred_pos.sum()
print('Predicted Yes = {}'.format(predicted_yes))

# Total Prediction - No (rows predicted negative)
predicted_no = pred_neg.sum()
print('Predicted No = {}'.format(predicted_no))

print()

#true positives (TP): we predicted positive and the review is labelled positive.
tp = np.logical_and(actual_pos, pred_pos).sum()
print('TP = {}'.format(tp))

#true negatives (TN): we predicted negative and the review is labelled negative.
tn = np.logical_and(actual_neg, pred_neg).sum()
print('TN = {}'.format(tn))

#false positives (FP): we predicted positive but the review is labelled negative. (Also known as a "Type I error.")
fp = np.logical_and(actual_neg, pred_pos).sum()
print('FP = {}'.format(fp))

#false negatives (FN): we predicted negative but the review is labelled positive. (Also known as a "Type II error.")
fn = np.logical_and(actual_pos, pred_neg).sum()
print('FN = {}'.format(fn))

print()

# The rates below are printed with the '.2' format spec, i.e. two significant digits.

#Accuracy: Overall, how often is the classifier correct?
#(TP+TN)/total
accuracy = (tp + tn) / total
print('Accuracy = {:.{prec}}'.format(accuracy, prec=2))

#Misclassification Rate ("Error Rate"): Overall, how often is it wrong? (equivalent to 1 minus Accuracy)
#(FP+FN)/total
error_rate = (fp + fn) / total
print('Error Rate = {:.{prec}}'.format(error_rate, prec=2))

#True Positive Rate ("Sensitivity"): When it's actually yes, how often does it predict yes? (also known as "Recall")
#TP/actual yes
sensitivity = tp / actual_yes
print('Sensitivity = {:.{prec}}'.format(sensitivity, prec=2))

#False Positive Rate: When it's actually no, how often does it predict yes?
#FP/actual no
false_positive = fp / actual_no
print('False Positive Rate = {:.{prec}}'.format(false_positive, prec=2))

#True Negative Rate ("Specificity"): When it's actually no, how often does it predict no? (equivalent to 1 minus False Positive Rate)
#TN/actual no
specificity = tn / actual_no
print('Specificity = {:.{prec}}'.format(specificity, prec=2))

#Precision: When it predicts yes, how often is it correct?
#TP/predicted yes
precision = tp / predicted_yes
print('Precision = {:.{prec}}'.format(precision, prec=2))

#Prevalence: How often does the yes condition actually occur in our sample?
#actual yes/total
prevalence = actual_yes / total
print('Prevalence = {:.{prec}}'.format(prevalence, prec=2))

Cross Validation Scores = [0.64 0.69 0.69 0.73 0.62 0.61 0.67 0.66 0.65 0.69]

Confusion Matrix = 
 [[458  42]
 [289 211]]

Total = 1000
Total Yes = 669
Total No = 331

Actual Yes = 500
Actual No = 500
Predicted Yes = 253
Predicted No = 747

TP = 211
TN = 458
FP = 42
FN = 289

Accuracy = 0.67
Error Rate = 0.33
Sensitivity = 0.42
False Positive Rate = 0.084
Specificity = 0.92
Precision = 0.83
Prevalance = 0.5


***
# Third Run (Yelp Reviews)

Adding 'best' and 'excellent'

In [50]:
from sklearn.naive_bayes import BernoulliNB

# Import the raw data.
# The file is read as tab-separated with no header row; the two columns are
# named "review" (the sentence) and "posneg" (a numeric label) below.
# NOTE(review): absolute, machine-specific path -- adjust it to wherever your
# copy of the dataset lives.
df = pd.read_csv('/Users/Kevin/Files/Thinkful/Data Files/sentiment labelled sentences/yelp_labelled.txt', sep="\t", header=None)
df.columns = ["review", "posneg"]

# creating pos and neg columns (boolean flags derived from the numeric label)
df['pos'] = (df['posneg'] == 1)
df['neg'] = (df['posneg'] == 0)

# making reviews lowercase to create keyword features columns.
# Series.str.lower() returns a new Series, so the result must be assigned back;
# calling it on a line of its own leaves the column unchanged.
df['review'] = df['review'].str.lower()

#Adding keyword features
# other keywords: ('wow', 'correct', 'love')
pos_keywords = ['good', 'amazing', 'great', 'recommend', 'best', 'excellent']

#creating each keyword feature: True when the review contains the keyword as a substring
for key in pos_keywords:
    # regex=False matches the keyword as literal text instead of a regular expression.
    df[key] = df['review'].str.contains(key, case=False, regex=False)

#Variables for model
data = df[pos_keywords]
target = df['pos']

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()
# Fit our model to the data.
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample predictions).
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 353


# Evaluation

In [58]:
# Evaluate the classifier fitted in the preceding run cell.
# NOTE: this cell reads `bnb`, `data`, `target` and `y_pred` from the notebook's
# global state without defining them, so it reports on whichever run cell was
# executed last. Run it immediately after the run it is meant to describe.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# 10-fold cross-validation of the same estimator on the same features and labels.
cvs = cross_val_score(bnb, data, target, cv=10)
print('Cross Validation Scores = {}'.format(cvs))

print()

# Confusion matrix of the labels in `target` against the predictions in `y_pred`.
cf_mx = confusion_matrix(target, y_pred)

print('Confusion Matrix = \n {}'.format(cf_mx))

print()

# Element-wise masks shared by the counts below.
actual_pos = (target == True)
actual_neg = (target == False)
pred_pos = (y_pred == True)
pred_neg = (y_pred == False)

#total number of rows evaluated
total = data.shape[0]
print('Total = {}'.format(total))

#"Total Yes": predictions that agree with the label (TP + TN)
total_yes = (target == y_pred).sum()
print('Total Yes = {}'.format(total_yes))

#"Total No": predictions that disagree with the label (FP + FN)
total_no = (target != y_pred).sum()
print('Total No = {}'.format(total_no))

print()

# Total Actual - Yes (rows labelled positive)
actual_yes = actual_pos.sum()
print('Actual Yes = {}'.format(actual_yes))

# Total Actual - No (rows labelled negative)
actual_no = actual_neg.sum()
print('Actual No = {}'.format(actual_no))

# Total Prediction - Yes (rows predicted positive)
predicted_yes = pred_pos.sum()
print('Predicted Yes = {}'.format(predicted_yes))

# Total Prediction - No (rows predicted negative)
predicted_no = pred_neg.sum()
print('Predicted No = {}'.format(predicted_no))

print()

#true positives (TP): we predicted positive and the review is labelled positive.
tp = np.logical_and(actual_pos, pred_pos).sum()
print('TP = {}'.format(tp))

#true negatives (TN): we predicted negative and the review is labelled negative.
tn = np.logical_and(actual_neg, pred_neg).sum()
print('TN = {}'.format(tn))

#false positives (FP): we predicted positive but the review is labelled negative. (Also known as a "Type I error.")
fp = np.logical_and(actual_neg, pred_pos).sum()
print('FP = {}'.format(fp))

#false negatives (FN): we predicted negative but the review is labelled positive. (Also known as a "Type II error.")
fn = np.logical_and(actual_pos, pred_neg).sum()
print('FN = {}'.format(fn))

print()

# The rates below are printed with the '.2' format spec, i.e. two significant digits.

#Accuracy: Overall, how often is the classifier correct?
#(TP+TN)/total
accuracy = (tp + tn) / total
print('Accuracy = {:.{prec}}'.format(accuracy, prec=2))

#Misclassification Rate ("Error Rate"): Overall, how often is it wrong? (equivalent to 1 minus Accuracy)
#(FP+FN)/total
error_rate = (fp + fn) / total
print('Error Rate = {:.{prec}}'.format(error_rate, prec=2))

#True Positive Rate ("Sensitivity"): When it's actually yes, how often does it predict yes? (also known as "Recall")
#TP/actual yes
sensitivity = tp / actual_yes
print('Sensitivity = {:.{prec}}'.format(sensitivity, prec=2))

#False Positive Rate: When it's actually no, how often does it predict yes?
#FP/actual no
false_positive = fp / actual_no
print('False Positive Rate = {:.{prec}}'.format(false_positive, prec=2))

#True Negative Rate ("Specificity"): When it's actually no, how often does it predict no? (equivalent to 1 minus False Positive Rate)
#TN/actual no
specificity = tn / actual_no
print('Specificity = {:.{prec}}'.format(specificity, prec=2))

#Precision: When it predicts yes, how often is it correct?
#TP/predicted yes
precision = tp / predicted_yes
print('Precision = {:.{prec}}'.format(precision, prec=2))

#Prevalence: How often does the yes condition actually occur in our sample?
#actual yes/total
prevalence = actual_yes / total
print('Prevalence = {:.{prec}}'.format(prevalence, prec=2))

Cross Validation Scores = [0.64 0.69 0.69 0.73 0.62 0.61 0.67 0.66 0.65 0.69]

Confusion Matrix = 
 [[458  42]
 [289 211]]

Total = 1000
Total Yes = 669
Total No = 331

Actual Yes = 500
Actual No = 500
Predicted Yes = 253
Predicted No = 747

TP = 211
TN = 458
FP = 42
FN = 289

Accuracy = 0.67
Error Rate = 0.33
Sensitivity = 0.42
False Positive Rate = 0.084
Specificity = 0.92
Precision = 0.83
Prevalance = 0.5


***
# Fourth Run (Yelp Reviews)

Adding 'correct' and 'yes'

In [52]:
from sklearn.naive_bayes import BernoulliNB

# Import the raw data.
# The file is read as tab-separated with no header row; the two columns are
# named "review" (the sentence) and "posneg" (a numeric label) below.
# NOTE(review): absolute, machine-specific path -- adjust it to wherever your
# copy of the dataset lives.
df = pd.read_csv('/Users/Kevin/Files/Thinkful/Data Files/sentiment labelled sentences/yelp_labelled.txt', sep="\t", header=None)
df.columns = ["review", "posneg"]

# creating pos and neg columns (boolean flags derived from the numeric label)
df['pos'] = (df['posneg'] == 1)
df['neg'] = (df['posneg'] == 0)

# making reviews lowercase to create keyword features columns.
# Series.str.lower() returns a new Series, so the result must be assigned back;
# calling it on a line of its own leaves the column unchanged.
df['review'] = df['review'].str.lower()

#Adding keyword features
# other keywords: ('wow', 'love')
pos_keywords = ['good', 'amazing', 'great', 'recommend', 'best', 'excellent', 'correct', 'yes']

#creating each keyword feature: True when the review contains the keyword as a substring
for key in pos_keywords:
    # regex=False matches the keyword as literal text instead of a regular expression.
    df[key] = df['review'].str.contains(key, case=False, regex=False)

#Variables for model
data = df[pos_keywords]
target = df['pos']

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()
# Fit our model to the data.
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample predictions).
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 353


# Evaluation

In [59]:
# Evaluate the classifier fitted in the preceding run cell.
# NOTE: this cell reads `bnb`, `data`, `target` and `y_pred` from the notebook's
# global state without defining them, so it reports on whichever run cell was
# executed last. Run it immediately after the run it is meant to describe.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# 10-fold cross-validation of the same estimator on the same features and labels.
cvs = cross_val_score(bnb, data, target, cv=10)
print('Cross Validation Scores = {}'.format(cvs))

print()

# Confusion matrix of the labels in `target` against the predictions in `y_pred`.
cf_mx = confusion_matrix(target, y_pred)

print('Confusion Matrix = \n {}'.format(cf_mx))

print()

# Element-wise masks shared by the counts below.
actual_pos = (target == True)
actual_neg = (target == False)
pred_pos = (y_pred == True)
pred_neg = (y_pred == False)

#total number of rows evaluated
total = data.shape[0]
print('Total = {}'.format(total))

#"Total Yes": predictions that agree with the label (TP + TN)
total_yes = (target == y_pred).sum()
print('Total Yes = {}'.format(total_yes))

#"Total No": predictions that disagree with the label (FP + FN)
total_no = (target != y_pred).sum()
print('Total No = {}'.format(total_no))

print()

# Total Actual - Yes (rows labelled positive)
actual_yes = actual_pos.sum()
print('Actual Yes = {}'.format(actual_yes))

# Total Actual - No (rows labelled negative)
actual_no = actual_neg.sum()
print('Actual No = {}'.format(actual_no))

# Total Prediction - Yes (rows predicted positive)
predicted_yes = pred_pos.sum()
print('Predicted Yes = {}'.format(predicted_yes))

# Total Prediction - No (rows predicted negative)
predicted_no = pred_neg.sum()
print('Predicted No = {}'.format(predicted_no))

print()

#true positives (TP): we predicted positive and the review is labelled positive.
tp = np.logical_and(actual_pos, pred_pos).sum()
print('TP = {}'.format(tp))

#true negatives (TN): we predicted negative and the review is labelled negative.
tn = np.logical_and(actual_neg, pred_neg).sum()
print('TN = {}'.format(tn))

#false positives (FP): we predicted positive but the review is labelled negative. (Also known as a "Type I error.")
fp = np.logical_and(actual_neg, pred_pos).sum()
print('FP = {}'.format(fp))

#false negatives (FN): we predicted negative but the review is labelled positive. (Also known as a "Type II error.")
fn = np.logical_and(actual_pos, pred_neg).sum()
print('FN = {}'.format(fn))

print()

# The rates below are printed with the '.2' format spec, i.e. two significant digits.

#Accuracy: Overall, how often is the classifier correct?
#(TP+TN)/total
accuracy = (tp + tn) / total
print('Accuracy = {:.{prec}}'.format(accuracy, prec=2))

#Misclassification Rate ("Error Rate"): Overall, how often is it wrong? (equivalent to 1 minus Accuracy)
#(FP+FN)/total
error_rate = (fp + fn) / total
print('Error Rate = {:.{prec}}'.format(error_rate, prec=2))

#True Positive Rate ("Sensitivity"): When it's actually yes, how often does it predict yes? (also known as "Recall")
#TP/actual yes
sensitivity = tp / actual_yes
print('Sensitivity = {:.{prec}}'.format(sensitivity, prec=2))

#False Positive Rate: When it's actually no, how often does it predict yes?
#FP/actual no
false_positive = fp / actual_no
print('False Positive Rate = {:.{prec}}'.format(false_positive, prec=2))

#True Negative Rate ("Specificity"): When it's actually no, how often does it predict no? (equivalent to 1 minus False Positive Rate)
#TN/actual no
specificity = tn / actual_no
print('Specificity = {:.{prec}}'.format(specificity, prec=2))

#Precision: When it predicts yes, how often is it correct?
#TP/predicted yes
precision = tp / predicted_yes
print('Precision = {:.{prec}}'.format(precision, prec=2))

#Prevalence: How often does the yes condition actually occur in our sample?
#actual yes/total
prevalence = actual_yes / total
print('Prevalence = {:.{prec}}'.format(prevalence, prec=2))

Cross Validation Scores = [0.64 0.69 0.69 0.73 0.62 0.61 0.67 0.66 0.65 0.69]

Confusion Matrix = 
 [[458  42]
 [289 211]]

Total = 1000
Total Yes = 669
Total No = 331

Actual Yes = 500
Actual No = 500
Predicted Yes = 253
Predicted No = 747

TP = 211
TN = 458
FP = 42
FN = 289

Accuracy = 0.67
Error Rate = 0.33
Sensitivity = 0.42
False Positive Rate = 0.084
Specificity = 0.92
Precision = 0.83
Prevalance = 0.5


***
# Fifth Run (Yelp Reviews)

Adding 'delicious' and 'tasty'

In [54]:
from sklearn.naive_bayes import BernoulliNB

# Import the raw data.
# The file is read as tab-separated with no header row; the two columns are
# named "review" (the sentence) and "posneg" (a numeric label) below.
# NOTE(review): absolute, machine-specific path -- adjust it to wherever your
# copy of the dataset lives.
df = pd.read_csv('/Users/Kevin/Files/Thinkful/Data Files/sentiment labelled sentences/yelp_labelled.txt', sep="\t", header=None)
df.columns = ["review", "posneg"]

# creating pos and neg columns (boolean flags derived from the numeric label)
df['pos'] = (df['posneg'] == 1)
df['neg'] = (df['posneg'] == 0)

# making reviews lowercase to create keyword features columns.
# Series.str.lower() returns a new Series, so the result must be assigned back;
# calling it on a line of its own leaves the column unchanged.
df['review'] = df['review'].str.lower()

#Adding keyword features
# other keywords: ('wow', 'love')
pos_keywords = ['good', 'amazing', 'great', 'recommend', 'best', 'excellent', 'correct', 'yes', 'delicious', 'tasty']

#creating each keyword feature: True when the review contains the keyword as a substring
for key in pos_keywords:
    # regex=False matches the keyword as literal text instead of a regular expression.
    df[key] = df['review'].str.contains(key, case=False, regex=False)

#Variables for model
data = df[pos_keywords]
target = df['pos']

# Instantiate our model and store it in a new variable.
bnb = BernoulliNB()
# Fit our model to the data.
bnb.fit(data, target)

# Classify the same rows the model was fitted on (in-sample predictions).
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 331


# Evaluation

In [60]:
# Evaluate the classifier fitted in the preceding run cell.
# NOTE: this cell reads `bnb`, `data`, `target` and `y_pred` from the notebook's
# global state without defining them, so it reports on whichever run cell was
# executed last. Run it immediately after the run it is meant to describe.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

# 10-fold cross-validation of the same estimator on the same features and labels.
cvs = cross_val_score(bnb, data, target, cv=10)
print('Cross Validation Scores = {}'.format(cvs))

print()

# Confusion matrix of the labels in `target` against the predictions in `y_pred`.
cf_mx = confusion_matrix(target, y_pred)

print('Confusion Matrix = \n {}'.format(cf_mx))

print()

# Element-wise masks shared by the counts below.
actual_pos = (target == True)
actual_neg = (target == False)
pred_pos = (y_pred == True)
pred_neg = (y_pred == False)

#total number of rows evaluated
total = data.shape[0]
print('Total = {}'.format(total))

#"Total Yes": predictions that agree with the label (TP + TN)
total_yes = (target == y_pred).sum()
print('Total Yes = {}'.format(total_yes))

#"Total No": predictions that disagree with the label (FP + FN)
total_no = (target != y_pred).sum()
print('Total No = {}'.format(total_no))

print()

# Total Actual - Yes (rows labelled positive)
actual_yes = actual_pos.sum()
print('Actual Yes = {}'.format(actual_yes))

# Total Actual - No (rows labelled negative)
actual_no = actual_neg.sum()
print('Actual No = {}'.format(actual_no))

# Total Prediction - Yes (rows predicted positive)
predicted_yes = pred_pos.sum()
print('Predicted Yes = {}'.format(predicted_yes))

# Total Prediction - No (rows predicted negative)
predicted_no = pred_neg.sum()
print('Predicted No = {}'.format(predicted_no))

print()

#true positives (TP): we predicted positive and the review is labelled positive.
tp = np.logical_and(actual_pos, pred_pos).sum()
print('TP = {}'.format(tp))

#true negatives (TN): we predicted negative and the review is labelled negative.
tn = np.logical_and(actual_neg, pred_neg).sum()
print('TN = {}'.format(tn))

#false positives (FP): we predicted positive but the review is labelled negative. (Also known as a "Type I error.")
fp = np.logical_and(actual_neg, pred_pos).sum()
print('FP = {}'.format(fp))

#false negatives (FN): we predicted negative but the review is labelled positive. (Also known as a "Type II error.")
fn = np.logical_and(actual_pos, pred_neg).sum()
print('FN = {}'.format(fn))

print()

# The rates below are printed with the '.2' format spec, i.e. two significant digits.

#Accuracy: Overall, how often is the classifier correct?
#(TP+TN)/total
accuracy = (tp + tn) / total
print('Accuracy = {:.{prec}}'.format(accuracy, prec=2))

#Misclassification Rate ("Error Rate"): Overall, how often is it wrong? (equivalent to 1 minus Accuracy)
#(FP+FN)/total
error_rate = (fp + fn) / total
print('Error Rate = {:.{prec}}'.format(error_rate, prec=2))

#True Positive Rate ("Sensitivity"): When it's actually yes, how often does it predict yes? (also known as "Recall")
#TP/actual yes
sensitivity = tp / actual_yes
print('Sensitivity = {:.{prec}}'.format(sensitivity, prec=2))

#False Positive Rate: When it's actually no, how often does it predict yes?
#FP/actual no
false_positive = fp / actual_no
print('False Positive Rate = {:.{prec}}'.format(false_positive, prec=2))

#True Negative Rate ("Specificity"): When it's actually no, how often does it predict no? (equivalent to 1 minus False Positive Rate)
#TN/actual no
specificity = tn / actual_no
print('Specificity = {:.{prec}}'.format(specificity, prec=2))

#Precision: When it predicts yes, how often is it correct?
#TP/predicted yes
precision = tp / predicted_yes
print('Precision = {:.{prec}}'.format(precision, prec=2))

#Prevalence: How often does the yes condition actually occur in our sample?
#actual yes/total
prevalence = actual_yes / total
print('Prevalence = {:.{prec}}'.format(prevalence, prec=2))

Cross Validation Scores = [0.64 0.69 0.69 0.73 0.62 0.61 0.67 0.66 0.65 0.69]

Confusion Matrix = 
 [[458  42]
 [289 211]]

Total = 1000
Total Yes = 669
Total No = 331

Actual Yes = 500
Actual No = 500
Predicted Yes = 253
Predicted No = 747

TP = 211
TN = 458
FP = 42
FN = 289

Accuracy = 0.67
Error Rate = 0.33
Sensitivity = 0.42
False Positive Rate = 0.084
Specificity = 0.92
Precision = 0.83
Prevalance = 0.5


***
# Overview

With each iteration except the fourth, the accuracy and the sensitivity of the model improved. However, the specificity of the model decreased slightly over the first three iterations and then held steady.

Accuracy_1 = .57 <br>
Accuracy_2 = .63 <br>
Accuracy_3 = .65 <br>
Accuracy_4 = .65 <br>
Accuracy_5 = .67 <br>

Sensitivity_1 = .18 <br>
Sensitivity_2 = .32 <br>
Sensitivity_3 = .37 <br>
Sensitivity_4 = .37 <br>
Sensitivity_5 = .42 <br>

Specificity_1 = .95 <br>
Specificity_2 = .94 <br>
Specificity_3 = .92 <br>
Specificity_4 = .92 <br>
Specificity_5 = .92 <br>

Do any of your classifiers seem to overfit?
- I do not believe any of the classifiers overfit. Within each run, the cross-validation scores across the ten folds are generally in line with one another.

Which seem to perform the best? Why?
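- The second run performed the best in terms of improvement over the previous run, with accuracy increasing by 6 percentage points (.57 to .63) and sensitivity increasing by 14 percentage points (.18 to .32). Adding "great" and "recommend" improved the scores.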

Which features seemed to be most impactful to performance?
- Adding "great" and "recommend" improved the scores.