In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB

%matplotlib inline

In [2]:
# Load the Amazon sentiment-labelled sentences.
# The location defaults to the original local copy of the file and can be
# overridden with the AMAZON_DATA_PATH environment variable, so the notebook
# can be run on another machine without editing this cell.
data_path = os.environ.get(
    'AMAZON_DATA_PATH',
    '/Users/Lisa/Thinkful_Data_Science_Projects/Module 17/sentiment labelled sentences/amazon_cells_labelled.txt',
)

# The file is tab-separated and has no header row, so name the two columns
# (review text, 0/1 sentiment label) while reading.
amazon = pd.read_csv(data_path, sep='\t', header=None, names=['message', 'positive'])

In [3]:
# Convert the 0/1 sentiment label into a boolean column so it can be used
# for summary statistics and as the model target.
# Re-running this cell is harmless: once the column is boolean, True == 1
# still evaluates to True, so the values do not change.
amazon['positive'] = amazon['positive'].eq(1)

In [4]:
# Preview the first five rows to confirm the file loaded as expected
amazon.head(n=5)

Unnamed: 0,message,positive
0,So there is no way for me to plug it in here i...,False
1,"Good case, Excellent value.",True
2,Great for the jawbone.,True
3,Tied to charger for conversations lasting more...,False
4,The mic is great.,True


## Version 1
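8 positive keywords. Each keyword is padded with a space on both sides so that it matches whole words rather than substrings. Note that this also misses keywords at the very start or end of a message or next to punctuation (e.g. "Great for the jawbone." and "The mic is great." are not flagged below).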

In [5]:
# Build one boolean feature column per positive keyword
pos_keywords = ['great', 'good', 'excellent', 'nice', 'best', 'want', 'easy', 'bargain']

for keyword in pos_keywords:
    # Pad with a space on each side so only space-delimited words match
    # (case-insensitively); a keyword touching punctuation or the start/end
    # of the message is not matched.
    padded = ' ' + keyword + ' '
    amazon[keyword] = amazon['message'].str.contains(padded, case=False)

In [6]:
# Preview the first five rows, now including the keyword feature columns
amazon.head(n=5)

Unnamed: 0,message,positive,great,good,excellent,nice,best,want,easy,bargain
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,True,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False


In [7]:
# Model output (y, dependent variable): the boolean sentiment label.
# Model inputs (X, independent variables): the boolean keyword columns.
target = amazon['positive']
data = amazon[list(pos_keywords)]

In [8]:
# The features are binary / boolean, so use the Bernoulli Naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).
bnb = BernoulliNB()

# Fit on the full data set and classify the same rows. These are in-sample
# predictions, so the accuracy below is optimistic; the cross-validation
# scores give a better picture of out-of-sample performance.
bnb.fit(data, target)
y_pred = bnb.predict(data)

# confusion_matrix puts actual classes in rows and predicted classes in
# columns, ordered [False, True], so ravel() yields tn, fp, fn, tp.
conf_mat = confusion_matrix(target, y_pred)
tn, fp, fn, tp = conf_mat.ravel()
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

# Round once, at the end: subtracting an already-rounded error rate from 100
# can reintroduce floating-point noise (e.g. 100 - 99.9).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
sensitivity = round(tp / (tp + fn) * 100, 2)   # share of actual positives found
specificity = round(tn / (tn + fp) * 100, 2)   # share of actual negatives found
class_imb = round((tp + fn) / n_total * 100, 2)  # share of positives in the data

# Display our results
print('Number of mislabeled points out of a total {} points: {}'.format(n_total, n_mislabeled))
print('Accuracy: {}%'.format(accuracy))
print('Class Imbalance: {}%'.format(class_imb))
print('Cross Validation Score: {}'.format(cross_val_score(bnb, data, target, cv=8)))
print('Confusion matrix: {}, \n\t\t  {}'.format(conf_mat[0], conf_mat[1]))
print('Sensitivity: {}%\nSpecificity: {}%'.format(sensitivity, specificity))

Number of mislabeled points out of a total 1000 points: 409
Accuracy: 59.1%
Class Imbalance: 50.0%
Cross Validation Score: [0.61111111 0.5952381  0.5952381  0.57142857 0.58064516 0.60483871
 0.59677419 0.57258065]
Confusion matrix: [484  16], 
		  [393 107]
Sensitivity: 21.4%
Specificity: 96.8%


## Version 2
8 positive keywords. No spaces to get pattern matching rather than words.

In [9]:
# Rebuild the positive-keyword feature columns using substring matching
pos_keywords = ['great', 'good', 'excellent', 'nice', 'best', 'want', 'easy', 'bargain']

messages = amazon['message']
for keyword in pos_keywords:
    # No padding: the keyword matches anywhere in the message, ignoring case
    # (so it also matches inside longer words).
    amazon[keyword] = messages.str.contains(keyword, case=False)

In [10]:
# Preview the first five rows; the keyword columns now hold substring matches
amazon.head(n=5)

Unnamed: 0,message,positive,great,good,excellent,nice,best,want,easy,bargain
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,True,True,False,False,False,False,False
2,Great for the jawbone.,True,True,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,True,False,False,False,False,False,False,False


In [11]:
# Model output (y, dependent variable): the boolean sentiment label.
# Model inputs (X, independent variables): the boolean keyword columns.
target = amazon['positive']
data = amazon[list(pos_keywords)]

In [12]:
# The features are binary / boolean, so use the Bernoulli Naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).
bnb = BernoulliNB()

# Fit on the full data set and classify the same rows. These are in-sample
# predictions, so the accuracy below is optimistic; the cross-validation
# scores give a better picture of out-of-sample performance.
bnb.fit(data, target)
y_pred = bnb.predict(data)

# confusion_matrix puts actual classes in rows and predicted classes in
# columns, ordered [False, True], so ravel() yields tn, fp, fn, tp.
conf_mat = confusion_matrix(target, y_pred)
tn, fp, fn, tp = conf_mat.ravel()
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

# Round once, at the end: subtracting an already-rounded error rate from 100
# can reintroduce floating-point noise (e.g. 100 - 99.9).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
sensitivity = round(tp / (tp + fn) * 100, 2)   # share of actual positives found
specificity = round(tn / (tn + fp) * 100, 2)   # share of actual negatives found
class_imb = round((tp + fn) / n_total * 100, 2)  # share of positives in the data

# Display our results
print('Number of mislabeled points out of a total {} points: {}'.format(n_total, n_mislabeled))
print('Accuracy: {}%'.format(accuracy))
print('Class Imbalance: {}%'.format(class_imb))
print('Cross Validation Score: {}'.format(cross_val_score(bnb, data, target, cv=8)))
print('Confusion matrix: {}, \n\t\t  {}'.format(conf_mat[0], conf_mat[1]))
print('Sensitivity: {}%\nSpecificity: {}%'.format(sensitivity, specificity))

Number of mislabeled points out of a total 1000 points: 304
Accuracy: 69.6%
Class Imbalance: 50.0%
Cross Validation Score: [0.75396825 0.72222222 0.74603175 0.65873016 0.64516129 0.66935484
 0.7016129  0.66935484]
Confusion matrix: [474  26], 
		  [278 222]
Sensitivity: 44.4%
Specificity: 94.8%


## Version 3
19 positive keywords. No spaces to get pattern matching rather than words.

In [13]:
# Build boolean feature columns for the expanded positive-keyword list
pos_keywords = [
    'great', 'good', 'excellent', 'nice', 'ideal', 'well', 'impress', 'must',
    'recommend', 'best', 'want', 'reasonable', 'happy', 'no complaints', 'cool',
    'beautiful', 'easy', 'bargain', 'fast',
]

messages = amazon['message']
for keyword in pos_keywords:
    # No padding: the keyword matches anywhere in the message, ignoring case
    amazon[keyword] = messages.str.contains(keyword, case=False)

In [14]:
# Preview the first five rows with the additional positive-keyword columns
amazon.head(n=5)

Unnamed: 0,message,positive,great,good,excellent,nice,best,want,easy,bargain,...,well,impress,must,recommend,reasonable,happy,no complaints,cool,beautiful,fast
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# Model output (y, dependent variable): the boolean sentiment label.
# Model inputs (X, independent variables): the boolean keyword columns.
target = amazon['positive']
data = amazon[list(pos_keywords)]

In [16]:
# The features are binary / boolean, so use the Bernoulli Naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).
bnb = BernoulliNB()

# Fit on the full data set and classify the same rows. These are in-sample
# predictions, so the accuracy below is optimistic; the cross-validation
# scores give a better picture of out-of-sample performance.
bnb.fit(data, target)
y_pred = bnb.predict(data)

# confusion_matrix puts actual classes in rows and predicted classes in
# columns, ordered [False, True], so ravel() yields tn, fp, fn, tp.
conf_mat = confusion_matrix(target, y_pred)
tn, fp, fn, tp = conf_mat.ravel()
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

# Round once, at the end: subtracting an already-rounded error rate from 100
# can reintroduce floating-point noise (e.g. 100 - 99.9).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
sensitivity = round(tp / (tp + fn) * 100, 2)   # share of actual positives found
specificity = round(tn / (tn + fp) * 100, 2)   # share of actual negatives found
class_imb = round((tp + fn) / n_total * 100, 2)  # share of positives in the data

# Display our results
print('Number of mislabeled points out of a total {} points: {}'.format(n_total, n_mislabeled))
print('Accuracy: {}%'.format(accuracy))
print('Class Imbalance: {}%'.format(class_imb))
print('Cross Validation Score: {}'.format(cross_val_score(bnb, data, target, cv=8)))
print('Confusion matrix: {}, \n\t\t  {}'.format(conf_mat[0], conf_mat[1]))
print('Sensitivity: {}%\nSpecificity: {}%'.format(sensitivity, specificity))

Number of mislabeled points out of a total 1000 points: 261
Accuracy: 73.9%
Class Imbalance: 50.0%
Cross Validation Score: [0.80952381 0.73809524 0.72222222 0.73809524 0.67741935 0.69354839
 0.75       0.72580645]
Confusion matrix: [453  47], 
		  [214 286]
Sensitivity: 57.2%
Specificity: 90.6%


## Version 4
19 positive keywords. No spaces to get pattern matching rather than words.
11 negative keywords. No spaces to get pattern matching rather than words.

In [17]:
# Build boolean feature columns for the positive and negative keyword lists
pos_keywords = [
    'great', 'good', 'excellent', 'nice', 'ideal', 'well', 'impress', 'must',
    'recommend', 'best', 'want', 'reasonable', 'happy', 'no complaints', 'cool',
    'beautiful', 'easy', 'bargain', 'fast',
]
neg_keywords = [
    'bad', 'terrible', 'waste', 'unusable', 'garbage', 'worthless', 'return',
    'beware', 'broke', 'junk', 'warning',
]

# Positive columns are written first, then negative ones. No padding is used,
# so each keyword matches anywhere in the message, ignoring case.
for keyword in pos_keywords + neg_keywords:
    amazon[keyword] = amazon['message'].str.contains(keyword, case=False)

In [18]:
# Preview the first five rows with the negative-keyword columns added
amazon.head(n=5)

Unnamed: 0,message,positive,great,good,excellent,nice,best,want,easy,bargain,...,terrible,waste,unusable,garbage,worthless,return,beware,broke,junk,warning
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [19]:
# Model output (y, dependent variable): the boolean sentiment label.
# Model inputs (X, independent variables): positive then negative keyword columns.
target = amazon['positive']
data = amazon[[*pos_keywords, *neg_keywords]]

In [20]:
# The features are binary / boolean, so use the Bernoulli Naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).
bnb = BernoulliNB()

# Fit on the full data set and classify the same rows. These are in-sample
# predictions, so the accuracy below is optimistic; the cross-validation
# scores give a better picture of out-of-sample performance.
bnb.fit(data, target)
y_pred = bnb.predict(data)

# confusion_matrix puts actual classes in rows and predicted classes in
# columns, ordered [False, True], so ravel() yields tn, fp, fn, tp.
conf_mat = confusion_matrix(target, y_pred)
tn, fp, fn, tp = conf_mat.ravel()
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

# Round once, at the end: subtracting an already-rounded error rate from 100
# can reintroduce floating-point noise (e.g. 100 - 99.9).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
sensitivity = round(tp / (tp + fn) * 100, 2)   # share of actual positives found
specificity = round(tn / (tn + fp) * 100, 2)   # share of actual negatives found
class_imb = round((tp + fn) / n_total * 100, 2)  # share of positives in the data

# Display our results
print('Number of mislabeled points out of a total {} points: {}'.format(n_total, n_mislabeled))
print('Accuracy: {}%'.format(accuracy))
print('Class Imbalance: {}%'.format(class_imb))
print('Cross Validation Score: {}'.format(cross_val_score(bnb, data, target, cv=8)))
print('Confusion matrix: {}, \n\t\t  {}'.format(conf_mat[0], conf_mat[1]))
print('Sensitivity: {}%\nSpecificity: {}%'.format(sensitivity, specificity))

Number of mislabeled points out of a total 1000 points: 257
Accuracy: 74.3%
Class Imbalance: 50.0%
Cross Validation Score: [0.81746032 0.73809524 0.72222222 0.74603175 0.67741935 0.69354839
 0.75       0.72580645]
Confusion matrix: [457  43], 
		  [214 286]
Sensitivity: 57.2%
Specificity: 91.4%


## Version 5
19 positive keywords. No spaces to get pattern matching rather than words.
23 negative keywords. No spaces to get pattern matching rather than words.

In [21]:
# Build boolean feature columns for the positive and expanded negative keyword lists
pos_keywords = [
    'great', 'good', 'excellent', 'nice', 'ideal', 'well', 'impress', 'must',
    'recommend', 'best', 'want', 'reasonable', 'happy', 'no complaints', 'cool',
    'beautiful', 'easy', 'bargain', 'fast',
]
neg_keywords = [
    'bad', 'terrible', 'waste', 'problem', "doesn't", 'unusable', 'garbage',
    'poor', 'worthless', 'return', 'disappoint', 'drawback', 'do not buy', 'beware',
    'not', 'broke', 'flaw', 'hate', 'useless', 'worst', 'junk', 'mistake', 'warning',
]

# Positive columns are written first, then negative ones. No padding is used,
# so each keyword matches anywhere in the message, ignoring case.
for keyword in pos_keywords + neg_keywords:
    amazon[keyword] = amazon['message'].str.contains(keyword, case=False)

In [22]:
# Preview the first five rows with the expanded negative-keyword columns
amazon.head(n=5)

Unnamed: 0,message,positive,great,good,excellent,nice,best,want,easy,bargain,...,poor,disappoint,drawback,do not buy,not,flaw,hate,useless,worst,mistake
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:
# Model output (y, dependent variable): the boolean sentiment label.
# Model inputs (X, independent variables): positive then negative keyword columns.
target = amazon['positive']
data = amazon[[*pos_keywords, *neg_keywords]]

In [24]:
# The features are binary / boolean, so use the Bernoulli Naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).
bnb = BernoulliNB()

# Fit on the full data set and classify the same rows. These are in-sample
# predictions, so the accuracy below is optimistic; the cross-validation
# scores give a better picture of out-of-sample performance.
bnb.fit(data, target)
y_pred = bnb.predict(data)

# confusion_matrix puts actual classes in rows and predicted classes in
# columns, ordered [False, True], so ravel() yields tn, fp, fn, tp.
conf_mat = confusion_matrix(target, y_pred)
tn, fp, fn, tp = conf_mat.ravel()
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

# Round once, at the end: subtracting an already-rounded error rate from 100
# can reintroduce floating-point noise (e.g. 100 - 99.9).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
sensitivity = round(tp / (tp + fn) * 100, 2)   # share of actual positives found
specificity = round(tn / (tn + fp) * 100, 2)   # share of actual negatives found
class_imb = round((tp + fn) / n_total * 100, 2)  # share of positives in the data

# Display our results
print('Number of mislabeled points out of a total {} points: {}'.format(n_total, n_mislabeled))
print('Accuracy: {}%'.format(accuracy))
print('Class Imbalance: {}%'.format(class_imb))
print('Cross Validation Score: {}'.format(cross_val_score(bnb, data, target, cv=8)))
print('Confusion matrix: {}, \n\t\t  {}'.format(conf_mat[0], conf_mat[1]))
print('Sensitivity: {}%\nSpecificity: {}%'.format(sensitivity, specificity))

Number of mislabeled points out of a total 1000 points: 238
Accuracy: 76.2%
Class Imbalance: 50.0%
Cross Validation Score: [0.81746032 0.75396825 0.73015873 0.77777778 0.7016129  0.72580645
 0.76612903 0.73387097]
Confusion matrix: [475  25], 
		  [213 287]
Sensitivity: 57.4%
Specificity: 95.0%


## Version 6
19 positive keywords. Includes spaces to get words rather than pattern matching.
23 negative keywords. Includes spaces to get words rather than pattern matching.

In [25]:
# Rebuild the positive and negative keyword columns using space-padded matching
pos_keywords = [
    'great', 'good', 'excellent', 'nice', 'ideal', 'well', 'impress', 'must',
    'recommend', 'best', 'want', 'reasonable', 'happy', 'no complaints', 'cool',
    'beautiful', 'easy', 'bargain', 'fast',
]
neg_keywords = [
    'bad', 'terrible', 'waste', 'problem', "doesn't", 'unusable', 'garbage',
    'poor', 'worthless', 'return', 'disappoint', 'drawback', 'do not buy', 'beware',
    'not', 'broke', 'flaw', 'hate', 'useless', 'worst', 'junk', 'mistake', 'warning',
]

# Positive columns are written first, then negative ones.
for keyword in pos_keywords + neg_keywords:
    # Pad with a space on each side so only space-delimited words match
    # (case-insensitively); a keyword touching punctuation or the start/end
    # of the message is not matched.
    padded = ' ' + keyword + ' '
    amazon[keyword] = amazon['message'].str.contains(padded, case=False)

In [26]:
# Preview the first five rows; the keyword columns now hold space-padded matches
amazon.head(n=5)

Unnamed: 0,message,positive,great,good,excellent,nice,best,want,easy,bargain,...,poor,disappoint,drawback,do not buy,not,flaw,hate,useless,worst,mistake
0,So there is no way for me to plug it in here i...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,"Good case, Excellent value.",True,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Great for the jawbone.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Tied to charger for conversations lasting more...,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,The mic is great.,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [27]:
# Model output (y, dependent variable): the boolean sentiment label.
# Model inputs (X, independent variables): positive then negative keyword columns.
target = amazon['positive']
data = amazon[[*pos_keywords, *neg_keywords]]

In [28]:
# The features are binary / boolean, so use the Bernoulli Naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).
bnb = BernoulliNB()

# Fit on the full data set and classify the same rows. These are in-sample
# predictions, so the accuracy below is optimistic; the cross-validation
# scores give a better picture of out-of-sample performance.
bnb.fit(data, target)
y_pred = bnb.predict(data)

# confusion_matrix puts actual classes in rows and predicted classes in
# columns, ordered [False, True], so ravel() yields tn, fp, fn, tp.
conf_mat = confusion_matrix(target, y_pred)
tn, fp, fn, tp = conf_mat.ravel()
n_total = data.shape[0]
n_mislabeled = (target != y_pred).sum()

# Round once, at the end: subtracting an already-rounded error rate from 100
# can reintroduce floating-point noise (e.g. 100 - 99.9).
accuracy = round((n_total - n_mislabeled) / n_total * 100, 2)
sensitivity = round(tp / (tp + fn) * 100, 2)   # share of actual positives found
specificity = round(tn / (tn + fp) * 100, 2)   # share of actual negatives found
class_imb = round((tp + fn) / n_total * 100, 2)  # share of positives in the data

# Display our results
print('Number of mislabeled points out of a total {} points: {}'.format(n_total, n_mislabeled))
print('Accuracy: {}%'.format(accuracy))
print('Class Imbalance: {}%'.format(class_imb))
print('Cross Validation Score: {}'.format(cross_val_score(bnb, data, target, cv=8)))
print('Confusion matrix: {}, \n\t\t  {}'.format(conf_mat[0], conf_mat[1]))
print('Sensitivity: {}%\nSpecificity: {}%'.format(sensitivity, specificity))

Number of mislabeled points out of a total 1000 points: 369
Accuracy: 63.1%
Class Imbalance: 50.0%
Cross Validation Score: [0.63492063 0.61904762 0.57142857 0.63492063 0.60483871 0.64516129
 0.63709677 0.58870968]
Confusion matrix: [145 355], 
		  [ 14 486]
Sensitivity: 97.2%
Specificity: 29.0%


## Evaluation
Once you've iterated, answer these questions to compare the performance of each:

1. Do any of your classifiers seem to overfit? Yes, versions 2-5 seem to overfit, with versions 3 and 4 being the worst (greatest variability in cross validation accuracy scores).


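2. Which seem to perform the best? Why? Model 5 appears to perform the best. Although there is some overfitting, it demonstrates the highest overall accuracy (76.2%) and the best balance between sensitivity (percentage of positives correctly identified, 57.4%) and specificity (percentage of negatives correctly identified, 95.0%). Version 6 has a higher sensitivity (97.2%) and Version 1 a higher specificity (96.8%), but each achieves this at a steep cost to the other metric.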


3. Which features seemed to be most impactful to performance? Removing the spaces from the positive keywords (substring pattern matching rather than whole words) seemed to have the most beneficial impact on accuracy and sensitivity. Adding more positive and negative keywords also had a beneficial impact on accuracy, sensitivity, and specificity; however, this impact may be largely driven by a few really good keywords (quality rather than quantity). This observation raises the possibility that a word count may be beneficial in identifying the best keywords (rather than just a cursory glance at the messages).