In [168]:
%matplotlib inline
import csv
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB


## Classifier 1 -- 2000 positive words list

In [176]:
# Load the labelled reviews: a tab-separated file with no header row.
amazon = pd.read_csv('amazon.txt', delimiter='\t', header=None)

# Create column labels.
amazon.columns = ['text', 'sentiment']

# Import the list of words associated with positive sentiments, one word per line.
# Plain file I/O is used instead of pd.read_csv(delimiter='\n'): newer pandas
# releases raise a ValueError when '\n' is given as the field delimiter, and a
# CSV parser would also apply quoting / missing-value rules to what is just a
# list of words. Surrounding whitespace is stripped and blank lines are skipped.
with open('words.txt', encoding='utf-8') as word_file:
    words = [line.strip() for line in word_file if line.strip()]

# Get rid of the first word, as it was causing problems downstream.
keywords = words[1:]




In [177]:
# Build one boolean column per keyword: True where the review text contains the
# keyword (case-insensitive substring match).
#
# regex=False makes the keyword match literally; by default str.contains treats
# the pattern as a regular expression, so a keyword containing characters such
# as '+' or '*' would be misinterpreted or raise.
review_text = amazon['text']
keyword_flags = pd.DataFrame(
    {
        str(key): review_text.str.contains(str(key), case=False, regex=False)
        for key in keywords
    },
    index=amazon.index,
)

# Attach all flag columns in a single concat instead of inserting them one at a
# time, which fragments the frame when there are thousands of keywords. Flag
# columns left over from an earlier run of this cell are dropped first so that
# re-running the cell does not create duplicate columns.
amazon = pd.concat(
    [amazon.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)

In [178]:

# Turn the sentiment label into a boolean: True where the label equals 1.
amazon['sentiment'] = amazon['sentiment'].eq(1)


In [172]:
# Feature matrix: one column per keyword. The keyword columns were created under
# the label str(key), so they are selected by that same string label; selecting
# by the raw key would fail for any keyword that is not already a string.
data = amazon[[str(key) for key in keywords]]

# Target: the sentiment column.
target = amazon['sentiment']


In [173]:
# Our data is binary / boolean, so we use the Bernoulli naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).

# Instantiate our model, store it in a new variable and fit it to the full data set.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

# Use train_test_split to create the necessary training and test groups.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Score the holdout with its own estimator, so that `bnb` stays fitted on the
# full data set and keeps matching `y_pred` instead of being overwritten and
# then refitted.
holdout_model = BernoulliNB().fit(X_train, y_train)
print('With 20% Holdout: ' + str(holdout_model.score(X_test, y_test)))

# `bnb` is still the full-data fit, so the in-sample score needs no refit.
print('Testing on Sample: ' + str(bnb.score(data, target)))

Number of mislabeled points out of a total 1000 points : 166
With 20% Holdout: 0.765
Testing on Sample: 0.834


In [174]:
# Scores from 10-fold cross-validation of the classifier on the full data set.
fold_scores = cross_val_score(bnb, data, target, cv=10)
print(fold_scores)

# Confusion matrix comparing the true labels with the predictions held in y_pred.
prediction_confusion = confusion_matrix(target, y_pred)
print(prediction_confusion)

[ 0.84  0.81  0.84  0.77  0.79  0.72  0.72  0.73  0.85  0.74]
[[455  45]
 [121 379]]


455 predicted Bad/Actual Bad (True Negative)<br>
45  predicted Good/ Actual Bad (False Positive - Type 1)<br>
121 Predicted Bad/ Actual Good (False Negative - Type 2)<br>
379 Predicted Good/ Actual Good(True Positive)<br>
<br>
76% of positives identified (379 of 500).<br>
91% of negatives identified (455 of 500).<br>
<br>
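This classifier has the highest accuracy, but it shows signs of overfitting: in-sample accuracy is 0.834 versus 0.765 on the 20% holdout, and cross-validation accuracy differs by 13 percentage points between the best fold (0.85) and the worst (0.72).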

## Classifier 2 -- Shorter focused list

In [161]:
# Short, focused keyword list used by this classifier.
keywords = [
    'great',
    'awesome',
    'like',
    'happy',
    'good',
    'quality',
    'amazing',
]

# Reload the raw reviews (tab-separated, no header row) and label the columns.
amazon = pd.read_csv('amazon.txt', sep='\t', header=None)
amazon.columns = ['text', 'sentiment']


In [162]:
# One boolean column per keyword: True where the review text contains the
# keyword (case-insensitive substring match). regex=False makes the keyword
# match literally rather than being interpreted as a regular expression.
review_text = amazon['text']
keyword_flags = pd.DataFrame(
    {
        str(key): review_text.str.contains(str(key), case=False, regex=False)
        for key in keywords
    },
    index=amazon.index,
)

# Attach every flag column at once; flags left over from an earlier run of this
# cell are dropped first so that re-running it does not duplicate columns.
amazon = pd.concat(
    [amazon.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)

# Boolean target: True where the sentiment label equals 1.
amazon['sentiment'] = (amazon['sentiment'] == 1)

# Select the features by the same str(key) labels the columns were created under.
data = amazon[[str(key) for key in keywords]]
target = amazon['sentiment']


In [163]:
# Our data is binary / boolean, so we use the Bernoulli naive Bayes classifier
# (BernoulliNB is imported with the other dependencies at the top of the notebook).

# Instantiate our model, store it in a new variable and fit it to the full data set.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

# Use train_test_split to create the necessary training and test groups.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Score the holdout with its own estimator, so that `bnb` stays fitted on the
# full data set and keeps matching `y_pred` instead of being overwritten and
# then refitted.
holdout_model = BernoulliNB().fit(X_train, y_train)
print('With 20% Holdout: ' + str(holdout_model.score(X_test, y_test)))

# `bnb` is still the full-data fit, so the in-sample score needs no refit.
print('Testing on Sample: ' + str(bnb.score(data, target)))

Number of mislabeled points out of a total 1000 points : 351
With 20% Holdout: 0.64
Testing on Sample: 0.649


In [164]:
# Compute both reports first (10-fold cross-validation scores, then the
# confusion matrix of true labels against y_pred), then print them in that order.
reports = (
    cross_val_score(bnb, data, target, cv=10),
    confusion_matrix(target, y_pred),
)
for report in reports:
    print(report)

[ 0.67  0.65  0.68  0.67  0.64  0.69  0.61  0.59  0.64  0.62]
[[448  52]
 [299 201]]


448 predicted Bad/Actual Bad (True Negative)<br>
52  predicted Good/ Actual Bad (False Positive - Type 1)<br>
299 Predicted Bad/ Actual Good (False Negative - Type 2)<br>
201 Predicted Good/ Actual Good(True Positive)<br>
<br>
40% of positives identified (201 of 500).<br>
90% of negatives identified (448 of 500).<br>
<br>
<br>
The spread between cross-validation folds is still large (0.59 to 0.69), and the number of errors more than doubles compared with classifier 1 (166 → 351). The increase comes almost entirely from type 2 errors (121 → 299).

## Classifier 3 -- Shorter word list with length feature

In [179]:
# Start again from the raw review file (tab-separated, no header row).
amazon = pd.read_csv('amazon.txt', header=None, sep='\t')

# Label the two columns.
amazon.columns = ['text', 'sentiment']

# Same short keyword list as the previous classifier.
keywords = [
    'great', 'awesome', 'like', 'happy',
    'good', 'quality', 'amazing',
]




In [165]:
# Reviews with fewer than this many characters get the 'length' flag.
SHORT_REVIEW_MAX_CHARS = 30

# One boolean column per keyword: True where the review text contains the
# keyword (case-insensitive substring match). regex=False makes the keyword
# match literally rather than being interpreted as a regular expression.
review_text = amazon['text']
keyword_flags = pd.DataFrame(
    {
        str(key): review_text.str.contains(str(key), case=False, regex=False)
        for key in keywords
    },
    index=amazon.index,
)

# Attach every flag column at once; flags left over from an earlier run of this
# cell are dropped first so that re-running it does not duplicate columns.
amazon = pd.concat(
    [amazon.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)

# Length feature: True for short reviews. The vectorised .str.len() replaces a
# per-row lambda and yields False, rather than raising, when the text is missing.
amazon['length'] = review_text.str.len() < SHORT_REVIEW_MAX_CHARS

# Boolean target: True where the sentiment label equals 1.
amazon['sentiment'] = (amazon['sentiment'] == 1)

# Features are the keyword flags (selected by the str(key) labels they were
# created under) plus the length flag.
data = amazon[[str(key) for key in keywords] + ['length']]

target = amazon['sentiment']


In [166]:
# Create the Bernoulli naive Bayes model, fit it on every row, then predict
# those same rows.
bnb = BernoulliNB()
bnb.fit(data, target)
y_pred = bnb.predict(data)

# Report how many of those predictions disagree with the true labels.
total_points = data.shape[0]
mislabeled_points = (target != y_pred).sum()
print("Number of mislabeled points out of a total {} points : {}".format(
    total_points,
    mislabeled_points
))

# Hold out 20% of the rows, refit on the remaining 80% and score the holdout.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=20,
)
holdout_score = bnb.fit(X_train, y_train).score(X_test, y_test)
print('With 20% Holdout: ' + str(holdout_score))

# Refit on the full data set and score it against the same rows.
full_sample_score = bnb.fit(data, target).score(data, target)
print('Testing on Sample: ' + str(full_sample_score))

Number of mislabeled points out of a total 1000 points : 357
With 20% Holdout: 0.635
Testing on Sample: 0.643


In [167]:
# 10-fold cross-validation scores, followed by the confusion matrix of the true
# labels against y_pred; each is printed on its own line(s).
cv_scores = cross_val_score(bnb, data, target, cv=10)
label_confusion = confusion_matrix(target, y_pred)
print(cv_scores, label_confusion, sep='\n')

[ 0.67  0.66  0.68  0.67  0.62  0.68  0.59  0.56  0.63  0.61]
[[459  41]
 [316 184]]


459 predicted Bad/Actual Bad (True Negative) <br>
41  predicted Good/ Actual Bad (False Positive - Type 1) <br>
316 Predicted Bad/ Actual Good (False Negative - Type 2)<br>
184 Predicted Good/ Actual Good(True Positive)<br>
<br>
37% of positives identified (184 of 500).<br>
92% of negatives identified (459 of 500).<br>
<br>
<br>
Introducing a length feature further increases type 2 errors (299 → 316 compared with classifier 2) and has the same overfitting issues.

## Classifier 4 -- Slimmed down word list from classifier 1

In [139]:
# Number of words drawn at random from the positive-word list.
KEYWORD_SAMPLE_SIZE = 500
# Fixed seed so the same words are drawn on every run. The outputs recorded in
# this notebook came from an unseeded shuffle, so they may not match exactly.
KEYWORD_SAMPLE_SEED = 20

# Load the labelled reviews: a tab-separated file with no header row.
amazon = pd.read_csv('amazon.txt', delimiter='\t', header=None)

# Create column labels.
amazon.columns = ['text', 'sentiment']

# Import the list of words associated with positive sentiments, one word per line.
# Plain file I/O is used instead of pd.read_csv(delimiter='\n'): newer pandas
# releases raise a ValueError when '\n' is given as the field delimiter.
# Surrounding whitespace is stripped and blank lines are skipped.
with open('words.txt', encoding='utf-8') as word_file:
    words = [line.strip() for line in word_file if line.strip()]

# Get rid of the first word, as it was causing problems, then keep a
# reproducible random subset (all words if the list is shorter than the sample size).
keywords = words[1:]
random.Random(KEYWORD_SAMPLE_SEED).shuffle(keywords)
keywords = keywords[:KEYWORD_SAMPLE_SIZE]

# One boolean column per keyword: True where the review text contains the
# keyword (case-insensitive substring match). regex=False makes the keyword
# match literally rather than being interpreted as a regular expression.
review_text = amazon['text']
keyword_flags = pd.DataFrame(
    {
        str(key): review_text.str.contains(str(key), case=False, regex=False)
        for key in keywords
    },
    index=amazon.index,
)

# Attach all flag columns in a single concat rather than hundreds of
# single-column inserts, which fragment the frame.
amazon = pd.concat(
    [amazon.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)

# Boolean target: True where the sentiment label equals 1.
amazon['sentiment'] = (amazon['sentiment'] == 1)

# Select the features by the same str(key) labels the columns were created under.
data = amazon[[str(key) for key in keywords]]

target = amazon['sentiment']


In [140]:
# Instantiate our model, store it in a new variable and fit it to the full data set.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

# Use train_test_split (already imported at the top of the notebook) to create
# the necessary training and test groups.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Score the holdout with its own estimator, so that `bnb` stays fitted on the
# full data set and keeps matching `y_pred` instead of being overwritten and
# then refitted.
holdout_model = BernoulliNB().fit(X_train, y_train)
print('With 20% Holdout: ' + str(holdout_model.score(X_test, y_test)))

# `bnb` is still the full-data fit, so the in-sample score needs no refit.
print('Testing on Sample: ' + str(bnb.score(data, target)))

Number of mislabeled points out of a total 1000 points : 328
With 20% Holdout: 0.655
Testing on Sample: 0.672


In [141]:
# Per-fold scores from 10-fold cross-validation.
per_fold_scores = cross_val_score(bnb, data, target, cv=10)
print(per_fold_scores)

# Confusion matrix of the true labels against the predictions in y_pred.
outcome_counts = confusion_matrix(target, y_pred)
print(outcome_counts)

[ 0.68  0.62  0.63  0.66  0.66  0.63  0.59  0.58  0.69  0.63]
[[462  38]
 [290 210]]


462 predicted Bad/Actual Bad (True Negative)<br>
38  predicted Good/ Actual Bad (False Positive - Type 1)<br>
290 Predicted Bad/ Actual Good (False Negative - Type 2)<br>
210 Predicted Good/ Actual Good(True Positive)<br>
<br>
42% of positives identified (210 of 500).<br>
92% of negatives identified (462 of 500).<br>
<br>
<br>
I cut the original word list down to 500 words selected at random from the initial 2000-word list. This performs better than the previous two classifiers, but not as well as the first. Compared with classifier 1, we get a slight reduction in type 1 errors (45 → 38), with a large increase in type 2 errors (121 → 290). Overfitting is still a problem.

## Classifier 5 -- 1000 words sampled from classifier 1's list

In [175]:
# Number of words drawn at random from the positive-word list.
KEYWORD_SAMPLE_SIZE = 1000
# Fixed seed so the same words are drawn on every run. The outputs recorded in
# this notebook came from an unseeded shuffle, so they may not match exactly.
KEYWORD_SAMPLE_SEED = 20

# Load the labelled reviews: a tab-separated file with no header row.
amazon = pd.read_csv('amazon.txt', delimiter='\t', header=None)

# Create column labels.
amazon.columns = ['text', 'sentiment']

# Import the list of words associated with positive sentiments, one word per line.
# Plain file I/O is used instead of pd.read_csv(delimiter='\n'): newer pandas
# releases raise a ValueError when '\n' is given as the field delimiter.
# Surrounding whitespace is stripped and blank lines are skipped.
with open('words.txt', encoding='utf-8') as word_file:
    words = [line.strip() for line in word_file if line.strip()]

# Get rid of the first word, as it was causing problems, then keep a
# reproducible random subset (all words if the list is shorter than the sample size).
keywords = words[1:]
random.Random(KEYWORD_SAMPLE_SEED).shuffle(keywords)
keywords = keywords[:KEYWORD_SAMPLE_SIZE]

# One boolean column per keyword: True where the review text contains the
# keyword (case-insensitive substring match). regex=False makes the keyword
# match literally rather than being interpreted as a regular expression.
review_text = amazon['text']
keyword_flags = pd.DataFrame(
    {
        str(key): review_text.str.contains(str(key), case=False, regex=False)
        for key in keywords
    },
    index=amazon.index,
)

# Attach all flag columns in a single concat rather than a thousand
# single-column inserts, which fragment the frame.
amazon = pd.concat(
    [amazon.drop(columns=keyword_flags.columns, errors='ignore'), keyword_flags],
    axis=1,
)

# Boolean target: True where the sentiment label equals 1.
amazon['sentiment'] = (amazon['sentiment'] == 1)

# Select the features by the same str(key) labels the columns were created under.
data = amazon[[str(key) for key in keywords]]

target = amazon['sentiment']


In [151]:
# Instantiate our model, store it in a new variable and fit it to the full data set.
bnb = BernoulliNB()
bnb.fit(data, target)

# Classify the same rows the model was fitted on, storing the result in a new variable.
y_pred = bnb.predict(data)

# Display our results.
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

# Use train_test_split (already imported at the top of the notebook) to create
# the necessary training and test groups.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=20
)

# Score the holdout with its own estimator, so that `bnb` stays fitted on the
# full data set and keeps matching `y_pred` instead of being overwritten and
# then refitted.
holdout_model = BernoulliNB().fit(X_train, y_train)
print('With 20% Holdout: ' + str(holdout_model.score(X_test, y_test)))

# `bnb` is still the full-data fit, so the in-sample score needs no refit.
print('Testing on Sample: ' + str(bnb.score(data, target)))

Number of mislabeled points out of a total 1000 points : 225
With 20% Holdout: 0.725
Testing on Sample: 0.775


In [152]:
# Compute the 10-fold cross-validation scores and the confusion matrix of the
# true labels against y_pred, then print them one after the other.
for summary in (
    cross_val_score(bnb, data, target, cv=10),
    confusion_matrix(target, y_pred),
):
    print(summary)

[ 0.83  0.77  0.77  0.77  0.76  0.76  0.69  0.64  0.81  0.66]
[[462  38]
 [187 313]]


462 predicted Bad/Actual Bad (True Negative) <br>
38  predicted Good/Actual Bad (False Positive - Type 1)<br>
187 Predicted Bad/Actual Good (False Negative - Type 2)<br>
313 Predicted Good/Actual Good (True Positive)<br>
<br>
63% of positives identified (313 of 500).<br>
92% of negatives identified (462 of 500).<br>
<br>
A slight variation of classifier 4, cutting the original word list down to 1000 randomly selected words. This performs better than all classifiers except the first. Compared with classifier 1, there is a slight reduction in type 1 errors (45 → 38), with a moderate increase in type 2 errors (121 → 187). Overfitting is still a problem.

## Conclusion

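The first classifier was the most accurate. It showed signs of overfitting (in-sample accuracy of 0.834 versus 0.765 on the 20% holdout, and a wide spread across cross-validation folds), but a similar fold-to-fold spread was present in all the other classifiers, even though some of them made roughly twice as many errors.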