# Loading the data

In [49]:
%matplotlib inline

import sqlite3
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [50]:
con = sqlite3.connect('database.sqlite')

messages = pd.read_sql_query("""
SELECT Score, Summary
FROM Reviews
WHERE Score != 3
""", con)

In [51]:
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

In [52]:
Score = messages['Score']
Score = Score.map(partition)
Summary = messages['Summary']

# Brief Exploratory analysis

In [54]:
print(messages.head(20))

    Score                                            Summary
0       5                              Good Quality Dog Food
1       1                                  Not as Advertised
2       4                              "Delight" says it all
3       2                                     Cough Medicine
4       5                                        Great taffy
5       4                                         Nice Taffy
6       5      Great!  Just as good as the expensive brands!
7       5                             Wonderful, tasty taffy
8       5                                         Yay Barley
9       5                                   Healthy Dog Food
10      5                    The Best Hot Sauce in the World
11      5  My cats LOVE this "diet" food better than thei...
12      1               My Cats Are Not Fans of the New Food
13      4                                  fresh and greasy!
14      5                       Strawberry Twizzlers - Yummy
15      5           Lots

In [55]:
tmp = messages
tmp['Score'] = tmp['Score'].map(partition)
print(tmp.head(20))

       Score                                            Summary
0   positive                              Good Quality Dog Food
1   negative                                  Not as Advertised
2   positive                              "Delight" says it all
3   negative                                     Cough Medicine
4   positive                                        Great taffy
5   positive                                         Nice Taffy
6   positive      Great!  Just as good as the expensive brands!
7   positive                             Wonderful, tasty taffy
8   positive                                         Yay Barley
9   positive                                   Healthy Dog Food
10  positive                    The Best Hot Sauce in the World
11  positive  My cats LOVE this "diet" food better than thei...
12  negative               My Cats Are Not Fans of the New Food
13  positive                                  fresh and greasy!
14  positive                       Straw

In [56]:
review_list=[]
for review in tmp['Summary']:
    review=review.lower()
    review_list.append(review)
    #print(review)
    

In [57]:
tmp=tmp.drop('Summary',axis=1)

In [58]:
tmp['Summary']=review_list

In [59]:
tmp.head(5)

Unnamed: 0,Score,Summary
0,positive,good quality dog food
1,negative,not as advertised
2,positive,"""delight"" says it all"
3,negative,cough medicine
4,positive,great taffy


# Cleaning the data

To format our data and build the Term-doc incidence matrix, many operations will be performed on the data :

    * Stemming
    * Stop words removal
    * Lowering
    * Tokenization
    * Pruning (numbers and punctuation)



In [60]:
from nltk.corpus import stopwords
from string import punctuation
punc = set(punctuation)
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
from nltk.tokenize import wordpunct_tokenize

In [61]:
tokenized = []
for review in tmp['Summary']:
    
    tokens = nltk.word_tokenize(review)
    
    tokenized.append(tokens)

In [62]:
tokenized[1:10]

[['not', 'as', 'advertised'],
 ['``', 'delight', "''", 'says', 'it', 'all'],
 ['cough', 'medicine'],
 ['great', 'taffy'],
 ['nice', 'taffy'],
 ['great', '!', 'just', 'as', 'good', 'as', 'the', 'expensive', 'brands', '!'],
 ['wonderful', ',', 'tasty', 'taffy'],
 ['yay', 'barley'],
 ['healthy', 'dog', 'food']]

In [63]:
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}


In [64]:
stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
stopwords_punct = set(punctuation)
spe = ["''","``"]
spe = set(spe)
# Combine the stopwords. Its a lot longer so I'm not printing it out...
stop_pun = set.union(stopwords_json_en, stopwords_nltk_en, stopwords_punct,spe)

In [65]:
s = []
for ele in tokenized:
    for i in ele:
        if i in stop_pun:
            ele.remove(i)
        if type(i)==type(5) or type(i)==type(5.0):
            print(i)
            ele.remove(i)
            
            


In [66]:
tmp['Summary'] = tokenized

In [67]:
tmp.head()

Unnamed: 0,Score,Summary
0,positive,"[good, quality, dog, food]"
1,negative,"[as, advertised]"
2,positive,"[delight, says, all]"
3,negative,"[cough, medicine]"
4,positive,"[great, taffy]"


In [68]:
X_train, X_test, y_train, y_test = train_test_split(Summary, Score, test_size=0.2, random_state=42)

In [80]:
count_vect = CountVectorizer()
X_train_final = count_vect.fit_transform(X_train)  
X_test_final = count_vect.transform(X_test)



tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_final)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_final)



# Applying Multinomial Naïve Bayes learning method¶

In [82]:
prediction = dict()
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(X_train_tfidf, y_train)
prediction['Multinomial'] = model.predict(X_test_tfidf)

In [86]:
len(prediction['Multinomial'])

105163

In [87]:
len(y_test)

105163

# Before we test our classifier on the test set, we get a sense of how good it is on the validation set.

In [88]:
from sklearn.metrics import accuracy_score

print('Accuracy = {}'.format(
        accuracy_score(prediction['Multinomial'], y_test) * 100)
     )

Accuracy = 90.83993419738881
