In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from scipy.stats import ttest_ind
from scipy.stats import zscore
from scipy.stats.mstats import winsorize
from sqlalchemy import create_engine
import warnings

from scipy.stats import jarque_bera
from scipy.stats import normaltest
from scipy.stats import boxcox
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import power_transform
from sklearn.preprocessing import quantile_transform


warnings.filterwarnings('ignore')
sns.set(style="whitegrid")


In [2]:
data_amazon = 'https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/sentiment_labelled_sentences/amazon_cells_labelled.txt'''
amazon_raw = pd.read_csv(data_amazon, delimiter='\t', header=None)
amazon_raw.columns = ['sentence', 'sentiment']
display(amazon_raw.head(20))
amazon_raw.dtypes

Unnamed: 0,sentence,sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1
5,I have to jiggle the plug to get it to line up...,0
6,If you have several dozen or several hundred c...,0
7,If you are Razr owner...you must have this!,1
8,"Needless to say, I wasted my money.",0
9,What a waste of money and time!.,0


sentence     object
sentiment     int64
dtype: object

In [3]:
def get_average_sentence_length(text):
    new_text = text.replace("!", ".")
    new_text = text.replace("?", ".")
    sentences = new_text.split(".")
    sentences_in_text = []
    for word in sentences:
        sentences_in_text.append(word.count(" ") + 1)
    average_length = sum(sentences_in_text) / len(sentences_in_text)
    return average_length



amazon_raw['sentence_length'] = amazon_raw['sentence'].apply(lambda x: get_average_sentence_length(x))
amazon_raw.groupby('sentiment').sentence_length.median()
amazon_raw['bad_length'] = (amazon_raw.sentence_length > 5) & (amazon_raw.sentence_length < 10)

In [4]:
keywords_bad = ['bad', 'terrible', 'hated', 'worst', 'awful', 'fooled', 'not good', 'waste', 'do not' ]
keywords_good = ['great', 'good', 'fantastic', 'wonderful', 'happy']

for key in keywords_bad:
    amazon_raw[str(key)] = amazon_raw.sentence.str.contains(
        ' ' + str(key) + ' ',
        case=False
)
    
#for word in amazon_raw.sentence:
 #   upper_case = []
  #  if word == amazon_raw.sentence.str.isupper(): #help with this
   #     upper_case.append(word)
    
amazon_raw['allcaps'] = amazon_raw.sentence.str.isupper()
amazon_raw['sentiment_bad'] = (amazon_raw['sentiment'] == 0)
amazon_raw['sentiment_good'] = (amazon_raw['sentiment'] == 1)

data = amazon_raw[keywords_bad + ['allcaps', 'sentence_length', 'bad_length']]
target = amazon_raw['sentiment_bad']

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
y_pred = bnb.fit(data, target).predict(data)  

In [5]:
print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points : 448


In [6]:
from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=20)
print('With 20% Holdout: ' + str(bnb.fit(X_train, y_train).score(X_test, y_test)))
print('Testing on Sample: ' + str(bnb.fit(data, target).score(data, target)))

With 20% Holdout: 0.595
Testing on Sample: 0.552


In [7]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.59, 0.53, 0.52, 0.57, 0.54, 0.57, 0.54, 0.54, 0.59, 0.53])

In [8]:
classifier = BernoulliNB()
classifier.fit(X_train, y_train)


# Predicting the Test set results
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[74 31]
 [50 45]]


In [9]:
amazon_raw.groupby('sentiment').sentence_length.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,500.0,5.738844,3.562771,1.0,3.0,5.0,8.0,26.0
1,500.0,5.657171,3.792783,1.0,2.5,4.5,7.5,25.0


In [10]:
amazon_raw['sentence_length_bad'] = (amazon_raw['sentence_length'] >= 2.5) & (amazon_raw['sentence_length'] <= 7.5)

In [11]:

print(amazon_raw.head())


                                            sentence  sentiment  \
0  So there is no way for me to plug it in here i...          0   
1                        Good case, Excellent value.          1   
2                             Great for the jawbone.          1   
3  Tied to charger for conversations lasting more...          0   
4                                  The mic is great.          1   

   sentence_length  bad_length    bad  terrible  hated  worst  awful  fooled  \
0             11.0       False  False     False  False  False  False   False   
1              2.5       False  False     False  False  False  False   False   
2              2.5       False  False     False  False  False  False   False   
3              6.0        True  False     False  False  False  False   False   
4              2.5       False  False     False  False  False  False   False   

   not good  waste  do not  allcaps  sentiment_bad  sentiment_good  \
0     False  False   False    False           

In [12]:
keywords = ['if', 'bad', 'terrible', 'hated', 'worst', 'awful', 'fooled', 'not good', 'waste', 'do not',
           'great', 'good', 'wonderful', 'excellent', 'perfect']


for key in keywords:
    amazon_raw[str(key)] = amazon_raw.sentence.str.contains(
        ' ' + str(key) + ' ',
        case=False
)
    
#for word in amazon_raw.sentence:
 #   upper_case = []
  #  if word == amazon_raw.sentence.str.isupper(): #help with this
   #     upper_case.append(word)
    
amazon_raw['allcaps'] = amazon_raw.sentence.str.isupper()

data = amazon_raw[keywords + ['allcaps', 'sentence_length']]
target = amazon_raw['sentiment']

from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
y_pred = gaussian.fit(data, target).predict(data)  

In [13]:
#looking at a gaussian model prediction
print("Number of mislabeled points out of a total {} points (gaussian) : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points (gaussian) : 457


In [14]:
#same features with a bernoulli dist
y_pred = bnb.fit(data, target).predict(data) 
print("Number of mislabeled points out of a total {} points (Bernoulli) : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points (Bernoulli) : 436


In [15]:
good = pd.read_csv('/Users/richardlafferty/text_good_sentiment.txt', sep='\n', header=None)
bad = pd.read_csv('/Users/richardlafferty/bad_text_sentiment.txt', sep='\n', header=None)


In [16]:
#print(list(good.iloc[:,0].values))
good_list = list(good.iloc[:,0].values)
bad_list = list(bad.iloc[:,0].values)

good_bad_list = bad_list + [i for i in good_list if i not in bad_list]
good_bad_list = [word for word in good_bad_list if word.isalnum()]
print(good_bad_list)



In [17]:
#I will check to see that when using a list found from the internet of positive sentiments
#how it'll work with the model

#amazon_raw.sentence = amazon_raw.sentence.str.islower()

for key in good_bad_list:
    amazon_raw[str(key)] = amazon_raw.sentence.astype(str).str.contains(
        ' ' + str(key) + ' ',
        case=False
)


    
data = amazon_raw[good_bad_list]
target = amazon_raw.sentiment

from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()

y_pred = gaussian.fit(data, target).predict(data)

print("Number of mislabeled points out of a total {} points (Gaussian): {}".format(
    data.shape[0],
    (target != y_pred).sum()
))

Number of mislabeled points out of a total 1000 points (Gaussian): 336


In [22]:
#same as above using Bernoulli instead
for key in good_bad_list:
    amazon_raw[str(key)] = amazon_raw.sentence.astype(str).str.contains(
        ' ' + str(key) + ' ',
        case=False
)


    
data = amazon_raw[good_bad_list]
target = amazon_raw.sentiment

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

y_pred = bnb.fit(data, target).predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))


Number of mislabeled points out of a total 1000 points : 249


In [23]:
#adding my features from above to see if there's any change in performance
for key in good_bad_list:
    amazon_raw[str(key)] = amazon_raw.sentence.astype(str).str.contains(
        ' ' + str(key) + ' ',
        case=False
)


    
data = amazon_raw[good_bad_list + ['allcaps', 'sentence_length']]
target = amazon_raw.sentiment

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

y_pred = bnb.fit(data, target).predict(data)

print("Number of mislabeled points out of a total {} points : {}".format(
    data.shape[0],
    (target != y_pred).sum()
))


Number of mislabeled points out of a total 1000 points : 250


Ok. Now I think that we are seeing relatively reasonable sentiment detection. Using a massive list of good and bad words we are getting just over 75% accuracy on our data. I will now look at our confusion matrix and see where we're missing the other 25% 

In [20]:
from sklearn.model_selection import train_test_split
# Use train_test_split to create the necessary training and test groups
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)
y_pred = bnb.predict(X_test)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[88  7]
 [45 60]]


looking at this matrix we can see that we are very good at predicting true positives with very few false negatives. We are not doing as well with determining true negatives. 

In [21]:
from sklearn.model_selection import cross_val_score
cross_val_score(bnb, data, target, cv=10)

array([0.68, 0.66, 0.69, 0.68, 0.72, 0.67, 0.72, 0.67, 0.69, 0.66])

So there was progress made in making a better sentiment analysis algorithm, but there is still room for improvement. especially in making sure we can determine our true negatives