In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.pipeline import Pipeline

from bs4 import BeautifulSoup  
import re
import nltk
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk import sent_tokenize, word_tokenize, pos_tag

# Read data

In [2]:
df2 = pd.read_json("Electronics_5.json",lines=True)

In [3]:
df2 = df2[["reviewText","overall"]]

In [4]:
df2.columns = ["Reviews","Rating"]

In [5]:
df = pd.read_csv("Amazon_Unlocked_Mobile.csv")

In [6]:
df = df[["Reviews","Rating"]]

In [7]:
df = df.dropna().reset_index(drop=True)

In [8]:
df = df.append(df2)

In [9]:
df = df.dropna().reset_index(drop=True)

In [1]:
df2 = 0

In [11]:
# Split data into training set and validation
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Rating'], \
                                                    test_size=0.1, random_state=0)

print('Load %d training examples and %d validation examples. \n' %(X_train.shape[0],X_test.shape[0]))
print('Show a review in the training set : \n', X_train.iloc[10])

Load 1892669 training examples and 210297 validation examples. 

Show a review in the training set : 
 This thing pretty much covers all the bases as an AC adapter.  Especially nice is the inclusion of the USB port, and the reversibility of the polarity to the plugs.  But what were they thinking with this collection of plugs?  They're attached to a rubber strip which is attached to... NOTHING.  All the plugs other than the one you're using WILL get lost, and probably quickly.There's a baffling little pincer of rubber at the top of the plug strip that doesn't attach to anything securely, including the wire.  It's so dumb that it's infuriating.  They should have had a slot down the side of the adapter to hold the plugs.


In [12]:
df = 0

In [3]:
def cleanText(raw_text, remove_stopwords=False, stemming=False, split_text=False, \
             ):
    '''
    Convert a raw review to a cleaned review
    '''
    text = BeautifulSoup(raw_text, 'lxml').get_text()  #remove html
    letters_only = re.sub("[^a-zA-Z]", " ", text)  # remove non-character
    words = letters_only.lower().split() # convert to lower case 
    
    if remove_stopwords: # remove stopword
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    if stemming==True: # stemming
#         stemmer = PorterStemmer()
        stemmer = SnowballStemmer('english') 
        words = [stemmer.stem(w) for w in words]
        
    if split_text==True:  # split text
        return (words)
    
    return( " ".join(words))

# Preprocessing

In [14]:
# Preprocess text data in training set and validation set
X_train_cleaned = []
X_test_cleaned = []

for d in X_train:
    X_train_cleaned.append(cleanText(d))
print('Show a cleaned review in the training set : \n',  X_train_cleaned[10])
    
for d in X_test:
    X_test_cleaned.append(cleanText(d))

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

Show a cleaned review in the training set : 
 this thing pretty much covers all the bases as an ac adapter especially nice is the inclusion of the usb port and the reversibility of the polarity to the plugs but what were they thinking with this collection of plugs they re attached to a rubber strip which is attached to nothing all the plugs other than the one you re using will get lost and probably quickly there s a baffling little pincer of rubber at the top of the plug strip that doesn t attach to anything securely including the wire it s so dumb that it s infuriating they should have had a slot down the side of the adapter to hold the plugs


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [16]:
X_train, X_test = [0,0]

# TF-IDF Transformation

In [18]:
# Fit and transform the training data to a document-term matrix using TfidfVectorizer 
tfidf = TfidfVectorizer(min_df=5) #minimum document frequency of 5
X_train_tfidf = tfidf.fit_transform(X_train_cleaned)
print("Number of features : %d \n" %len(tfidf.get_feature_names())) #1722
print("Show some feature names : \n", tfidf.get_feature_names()[::1000])

Number of features : 76760 

Show some feature names : 
 ['aa', 'ading', 'alligators', 'anticipates', 'aspecto', 'awide', 'beanbags', 'blankenshipthe', 'braille', 'buys', 'carto', 'chesapeake', 'clutter', 'compressable', 'conversed', 'crookedly', 'dcb', 'descriptioni', 'disapearing', 'doodad', 'dxva', 'emitters', 'espon', 'expressway', 'ferocious', 'flowy', 'fruity', 'getaways', 'graphgrip', 'handsfree', 'hess', 'huddle', 'implosion', 'insiders', 'irreplaceable', 'journalist', 'kokak', 'letdowns', 'loner', 'manchester', 'melodious', 'misgiving', 'mourn', 'nas', 'nofull', 'offas', 'oscillates', 'panasonic', 'performancevery', 'pixilating', 'possess', 'probarla', 'pulpit', 'ramdomly', 'reconfirmed', 'reparation', 'rhinoskin', 'sacked', 'scrool', 'shapeless', 'sinusoidal', 'snares', 'specialist', 'starsbut', 'sua', 'sweatshop', 'telecoms', 'thetop', 'tofigure', 'trice', 'ultraslim', 'unpluged', 'vallarta', 'vlz', 'webtop', 'witholympus', 'xsi']


# Modeling

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingClassifier

# RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [28]:

def modelEvaluation(predictions):
    '''
    Print model evaluation to predicted result 
    '''
    print ("\nAccuracy on validation set: {:.4f}".format(accuracy_score(y_test, predictions)))
    print("\nClassification report : \n", metrics.classification_report(y_test, predictions))
    print("\nConfusion Matrix : \n", metrics.confusion_matrix(y_test, predictions))

In [31]:
# Evaluate on the validaton set
predictions = model.predict(tfidf.transform(X_test_cleaned))
modelEvaluation(predictions)


Accuracy on validation set: 0.6566

Classification report : 
              precision    recall  f1-score   support

          1       0.73      0.47      0.57     18036
          2       0.69      0.18      0.29     10708
          3       0.56      0.17      0.26     17637
          4       0.45      0.24      0.31     40835
          5       0.68      0.93      0.79    123081

avg / total       0.63      0.66      0.61    210297


Confusion Matrix : 
 [[  8402    316    422   1209   7687]
 [  1082   1963    418   1392   5853]
 [   783    201   3024   2857  10772]
 [   502    164    788   9734  29647]
 [   682    185    780   6477 114957]]


In [32]:
import pickle
filename = 'RF-amazon.sav'
pickle.dump(model, open(filename, 'wb'))

In [25]:
pickle.dump(tfidf, open('tfidf.sav', 'wb'))


In [26]:
pickle.dump(X_train_cleaned, open('X_train_cleaned.sav', 'wb'))
pickle.dump(X_test_cleaned, open('X_test_cleaned.sav', 'wb'))
pickle.dump(y_train, open('y_train.sav', 'wb'))
pickle.dump(y_test, open('y_test.sav', 'wb'))


# Deploy prediction

In [1]:
import pickle
model = pickle.load(open('LR-amazon.sav', 'rb'))
tfidf = pickle.load(open('tfidf.sav', 'rb'))



In [13]:
da = model.predict_proba(tfidf.transform([cleanText("Alright, beautiful look and comforting fit DOES NOT excuse it quality wise. My Playstation's headset has better sound quality than these, the stereo sounds like something is blocking it like a hand, with high volume behind it and NO I don't have a defective model(just using a small example). It's just not as great as the reviews I was reading about it made it seem, I had HIGH expectations so I'm now disappointed in its outcome. This headset has absolutely RUINED my built in Mic headset experience, first hooking it up, the volume for the mic is SOOO LOW the pc could barely pick it up and my friends listening on the other end can't stand hearing me through it. This hands free mode bs has angered me to the point where I'm no longer buying built in microphone headsets, it ruins the sound quality of your pc's audio to boost the call/voice chat volume. It's entirely irritating, plus I don't know how to connect the headset via aux cable. normally with other products all i'd do is plug it in(while its connected to the headphones) using the aux chord & model BUT that does NOT work with this headset so, WHY IS THERE AN AUX CABLE?? Maybe I have to read the guide again, but all together I give it a two star, its not something too special to buy yourself if you're a producer like me looking for good quality headphones with a clear precise microphone for voice chats. I'll just use em for my phone, but other than that. I doubt it'll be helpful otherwise. Enjoy with caution.")]))

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):
