In [31]:
import pandas as pd
import numpy as np
import math
import string
#--turn off pandas' chained-assignment warning (SettingWithCopyWarning)
pd.options.mode.chained_assignment = None  # default='warn'

#--get data
# Read the review dataset from the current working directory.
df = pd.read_csv('amazon_baby.csv')
# NOTE(review): a bare `df` echoes the whole frame (pandas truncates the display); df.head() is usually enough.
df

Unnamed: 0,name,review,rating
0,Planetwise Flannel Wipes,"These flannel wipes are OK, but in my opinion ...",3
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5
...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5


In [32]:
# Count missing values per column (axis=0 sums down the rows).
df.isnull().sum(axis = 0)

name      318
review    829
rating      0
dtype: int64

In [33]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [34]:
# Rows rated exactly 3 are removed, leaving only ratings above or below 3.
# .copy() makes the filtered result an independent frame, so adding columns to
# it later is a plain assignment instead of a write to a slice of the original
# data (the situation pandas' chained-assignment warning is about).
df = df[df['rating'] != 3].copy()
len(df)

165679

In [35]:
# Binary label: +1 when the rating is above 3, -1 otherwise.
df['Sentiment'] = np.where(df.rating > 3, 1, -1)
# Row-wise alternative, left disabled; note that it would write a lowercase 'sentiment' column instead:
#df['sentiment'] = df['rating'].apply(lambda rating : +1 if rating > 3 else -1)
df

Unnamed: 0,name,review,rating,Sentiment
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1
...,...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5,1
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5,1
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5,1
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5,1


In [36]:
import string 
# Translation table that deletes every character in string.punctuation.
# Built once at definition time instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    Only the ASCII punctuation characters listed in ``string.punctuation`` are
    stripped; whitespace, letters, digits and non-ASCII symbols are kept.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        A new string without punctuation characters.

    Raises
    ------
    AttributeError
        If ``text`` has no ``translate`` method (e.g. a float ``NaN``).

    Notes
    -----
    Python 3 only. The previous Python 2 fallback relied on a bare ``except``
    that hid unrelated errors, so it has been dropped.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Apply remove_punctuation to each review; the resulting Series keeps df's index.
review_without_punctuation = df['review'].apply(remove_punctuation)

In [37]:
def word_count(str):
    """Tally how often each whitespace-separated token occurs in ``str``.

    Tokens are compared exactly (case-sensitive). The returned dict maps each
    token to its number of occurrences, keyed in order of first appearance.
    An empty or whitespace-only string yields an empty dict.
    """
    tally = {}
    for token in str.split():
        # A missing token starts at 0, so the first sighting stores 1.
        tally[token] = tally.get(token, 0) + 1
    return tally
# Store, for each review, a {token: count} dict computed from the punctuation-free text.
df['word_count'] = review_without_punctuation.apply(word_count)
df
# NOTE: CountVectorizer would produce these counts with less hand-written code.

Unnamed: 0,name,review,rating,Sentiment,word_count
1,Planetwise Wipe Pouch,it came early and was not disappointed. i love...,5,1,"{'it': 3, 'came': 1, 'early': 1, 'and': 3, 'wa..."
2,Annas Dream Full Quilt with 2 Shams,Very soft and comfortable and warmer than it l...,5,1,"{'Very': 1, 'soft': 1, 'and': 2, 'comfortable'..."
3,Stop Pacifier Sucking without tears with Thumb...,This is a product well worth the purchase. I ...,5,1,"{'This': 1, 'is': 4, 'a': 2, 'product': 2, 'we..."
4,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1,"{'All': 1, 'of': 1, 'my': 1, 'kids': 2, 'have'..."
5,Stop Pacifier Sucking without tears with Thumb...,"When the Binky Fairy came to our house, we did...",5,1,"{'When': 1, 'the': 5, 'Binky': 3, 'Fairy': 3, ..."
...,...,...,...,...,...
183526,Baby Teething Necklace for Mom Pretty Donut Sh...,Such a great idea! very handy to have and look...,5,1,"{'Such': 1, 'a': 1, 'great': 2, 'idea': 1, 've..."
183527,Baby Teething Necklace for Mom Pretty Donut Sh...,This product rocks! It is a great blend of fu...,5,1,"{'This': 1, 'product': 2, 'rocks': 1, 'It': 1,..."
183528,Abstract 2 PK Baby / Toddler Training Cup (Pink),This item looks great and cool for my kids.......,5,1,"{'This': 1, 'item': 1, 'looks': 1, 'great': 2,..."
183529,"Baby Food Freezer Tray - Bacteria Resistant, B...",I am extremely happy with this product. I have...,5,1,"{'I': 9, 'am': 2, 'extremely': 1, 'happy': 1, ..."


In [137]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned reviews for evaluation; random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(review_without_punctuation, df.Sentiment, test_size=0.2, random_state=0)

In [138]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn a vocabulary of unigrams and bigrams from the training text only,
# keeping terms that occur in at least 500 training documents.
vectorizer = CountVectorizer(min_df = 500, ngram_range = (1, 2)).fit(X_train)
# Encode the training reviews as a sparse document-term count matrix.
vectorized = vectorizer.transform(X_train)
# NOTE(review): this prints the stored (row, column) count entries; vectorized.shape would be a more compact sanity check.
print(vectorized)

  (0, 53)	1
  (0, 108)	1
  (0, 114)	1
  (0, 233)	1
  (0, 265)	1
  (0, 268)	1
  (0, 355)	1
  (0, 356)	1
  (0, 432)	3
  (0, 489)	1
  (0, 501)	1
  (0, 628)	1
  (0, 889)	1
  (0, 976)	2
  (0, 996)	1
  (0, 1236)	1
  (0, 1238)	1
  (0, 1271)	1
  (0, 1371)	5
  (0, 1378)	1
  (0, 1400)	1
  (0, 1421)	1
  (0, 1425)	1
  (0, 1428)	1
  (0, 1438)	3
  :	:
  (132542, 2114)	1
  (132542, 2157)	1
  (132542, 2188)	1
  (132542, 2292)	1
  (132542, 2382)	1
  (132542, 2424)	1
  (132542, 2591)	1
  (132542, 2651)	1
  (132542, 2713)	1
  (132542, 2770)	2
  (132542, 2969)	1
  (132542, 2979)	1
  (132542, 2980)	1
  (132542, 3008)	1
  (132542, 3081)	1
  (132542, 3226)	1
  (132542, 3335)	1
  (132542, 3336)	1
  (132542, 3455)	1
  (132542, 3468)	1
  (132542, 3476)	1
  (132542, 3572)	1
  (132542, 3573)	1
  (132542, 3604)	1
  (132542, 3606)	1


In [139]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with the iteration cap set to 1000
# (presumably so the solver converges on this feature set; TODO confirm).
lrg = LogisticRegression(max_iter= 1000)
# Fit on the vectorized training reviews and their +1 / -1 labels.
lrg.fit(vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [140]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Predict on the held-out reviews, encoded with the vocabulary fitted on the training split.
pred = lrg.predict(vectorizer.transform(X_test))

# Each print emits its items one per line: header, value, then (for accuracy) a divider.
print("Accuracy Score:", accuracy_score(y_test, pred), "--------------------", sep="\n")
print('Classification Stats:', classification_report(y_test, pred), sep="\n")

Accuracy Score:
0.938284645098986
--------------------
Classification Stats:
              precision    recall  f1-score   support

          -1       0.84      0.75      0.79      5222
           1       0.95      0.97      0.96     27914

    accuracy                           0.94     33136
   macro avg       0.90      0.86      0.88     33136
weighted avg       0.94      0.94      0.94     33136



In [141]:
#--Model Deployment  --optional
# joblib is imported as the standalone package: the sklearn.externals.joblib
# alias was deprecated in scikit-learn 0.21 and removed in 0.23.
import joblib
import os

#--save models to be deployed on your server
# makedirs(exist_ok=True) creates each folder only when it is missing, without
# a separate existence check.
os.makedirs('Model', exist_ok=True)
os.makedirs('Vect', exist_ok=True)

# Persist both the classifier and the fitted vectorizer: new text must be
# encoded with the same vocabulary before the classifier can score it.
joblib.dump(lrg, r'Model/model.pickle')
joblib.dump(vectorizer, r'Vect/vect.pickle')


['Vect/vect.pickle']

In [142]:
#--Prediction in Action
#--load the persisted classifier and vectorizer objects
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only load files you created or trust.
model = joblib.load(r'Model/model.pickle')
vectorify = joblib.load(r'Vect/vect.pickle')

In [164]:
def Predict(text):
    """Print the predicted sentiment of ``text`` and the winning class probability.

    The text is stripped of punctuation, encoded with the loaded vectorizer
    (``vectorify``) and scored with the loaded classifier (``model``).
    The result is printed; nothing is returned.
    """
    cleaned = remove_punctuation(text)
    features = vectorify.transform([cleaned])
    # Row 0 is the single input document; column 0 is treated as the negative
    # class and column 1 as the positive class.
    probabilities = model.predict_proba(features)[0]
    if probabilities[0] > 0.5:
        print('Sentiment type: Negative, \nProbability:', probabilities[0] * 100,'%')
        return
    print('Sentiment type: Positive, \nProbability:', probabilities[1] * 100,'%')

In [165]:
# Demo: a negatively worded review.
Predict(''' It is hard to handle and I would not suggest purchasing it.''')

Sentiment type: Negative, 
Probability: 75.88127719565728 %


In [166]:
# Demo: a negative review containing an apostrophe, which remove_punctuation strips before scoring.
Predict('''Don't waste your money on this product ''')

Sentiment type: Negative, 
Probability: 92.7022079548014 %


In [167]:
# Demo: a clearly positive review.
Predict('''Amazing Product, just loved it.''')

Sentiment type: Positive, 
Probability: 96.07875773027237 %


In [168]:
# Demo: a lukewarm review; the recorded result is only narrowly Positive (about 56%).
Predict('''It is ok, but not that good.''')

Sentiment type: Positive, 
Probability: 55.96267196037722 %
