In [2]:
%matplotlib inline

In [4]:
import sqlite3
import numpy as np
import pandas as pd
import nltk
import string
import matplotlib.pyplot as plt

In [10]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from nltk.stem.porter import PorterStemmer

In [41]:
con = sqlite3.connect('d:\database.sqlite')

In [42]:
messages = pd.read_sql_query("""SELECT Score, Summary FROM Reviews WHERE Score != 3""", con)

In [43]:
print(messages.keys())
print(messages.head(10))
print(messages.describe())

Index(['Score', 'Summary'], dtype='object')
   Score                                        Summary
0      5                          Good Quality Dog Food
1      1                              Not as Advertised
2      4                          "Delight" says it all
3      2                                 Cough Medicine
4      5                                    Great taffy
5      4                                     Nice Taffy
6      5  Great!  Just as good as the expensive brands!
7      5                         Wonderful, tasty taffy
8      5                                     Yay Barley
9      5                               Healthy Dog Food
               Score
count  525814.000000
mean        4.279148
std         1.316725
min         1.000000
25%         4.000000
50%         5.000000
75%         5.000000
max         5.000000


In [44]:
def partition(x):
    if x < 3:
        return 'negative'
    return 'positive'

In [45]:
Score = messages['Score']
Score = Score.map(partition)
Summary = messages['Summary']
X_train, X_test, y_train, y_test = train_test_split(Summary, Score, test_size=0.2, random_state=42)

In [46]:
tmp = messages
tmp['Score'] = tmp['Score'].map(partition)
print(tmp.head(20))

       Score                                            Summary
0   positive                              Good Quality Dog Food
1   negative                                  Not as Advertised
2   positive                              "Delight" says it all
3   negative                                     Cough Medicine
4   positive                                        Great taffy
5   positive                                         Nice Taffy
6   positive      Great!  Just as good as the expensive brands!
7   positive                             Wonderful, tasty taffy
8   positive                                         Yay Barley
9   positive                                   Healthy Dog Food
10  positive                    The Best Hot Sauce in the World
11  positive  My cats LOVE this "diet" food better than thei...
12  negative               My Cats Are Not Fans of the New Food
13  positive                                  fresh and greasy!
14  positive                       Straw

In [68]:
stemmer = PorterStemmer()
from nltk.corpus import stopwords

def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

def tokenize(text):
    tokens = nltk.word_tokenize(text)
    #tokens = [word for word in tokens if word not in stopwords.words('english')]
    stems = stem_tokens(tokens, stemmer)
    return ' '.join(stems)

In [48]:
intab = string.punctuation
outtab = "                                "
trantab = str.maketrans(intab, outtab)

In [49]:
corpus = []
for text in X_train:
    text = text.lower()
    text = text.translate(trantab)
    text=tokenize(text)
    corpus.append(text)

In [74]:
#print(X_train, y_train)
t = tokenize("Delight says it all")
print(t)

t = '   '.join(t)
print(t)

delight say it all
d   e   l   i   g   h   t       s   a   y       i   t       a   l   l


In [78]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(corpus)        
print(count_vect, X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)   (0, 3111)	1
  (0, 8738)	1
  (0, 970)	1
  (1, 21041)	1
  (1, 1134)	1
  (1, 7933)	1
  (1, 12968)	1
  (1, 13530)	1
  (1, 17174)	1
  (1, 20049)	2
  (1, 9203)	1
  (1, 19883)	1
  (1, 13433)	1
  (2, 18946)	1
  (2, 19835)	1
  (2, 11703)	1
  (2, 21463)	1
  (3, 19536)	1
  (3, 7879)	1
  (3, 7102)	1
  (4, 13914)	1
  (4, 7686)	1
  (4, 8738)	1
  (5, 6018)	1
  (5, 19830)	1
  :	:
  (420645, 19779)	1
  (420646, 13069)	1
  (420646, 7842)	1
  (420646, 3358)	1
  (420646, 20986)	1
  (420646, 4868)	1
  (420646, 8778)	1
  (420646, 4321)	1
  (420646, 1134)	1
  (420647, 21562)	1
  (420647, 15906)	1
  (420647, 20117)	1
  (420647, 1755)	1
  (

In [79]:
from pandas import *
df = DataFrame({'Before': X_train, 'After': corpus})
print(df.head(20))

prediction = dict()

                                                    After  \
496497                                   almond great buy   
225396  i never thought i d have to say no to more fru...   
288197                                  we love thi stuff   
88450                                  fan friggen tastic   
354669                                    great for offic   
331410                                what have they done   
456920                                      cri babi tear   
261055      good cereal with a few potenti health concern   
295869                                       a quick meal   
149914                                        good flavor   
368576                                 best bbq sauc ever   
192328                        best lemonad mix ive bought   
123408     if you like the tast of chai you will love thi   
114477               the best basic ramen noodl out there   
108795                                           huge bag   
315703                  