### Obtaining a subset of the data from https://nijianmo.github.io/amazon/index.html#complete-data

In [67]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm 
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

### Putting the data in a pandas dataframe. 

In [68]:
def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document
      per line.

  Yields:
    The result of ``json.loads`` for each non-blank line, in file order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle once the generator is exhausted
  # or closed; previously the file was opened and never closed.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # A blank line (e.g. a trailing newline) is not valid JSON; skip it
      # instead of letting json.loads raise.
      if l.strip():
        yield json.loads(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Args:
    path: Passed through unchanged to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per record, indexed 0, 1, 2, ...
    in the order the records were yielded.
  """
  # enumerate supplies the same 0-based integer keys the row index is built from.
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the whole review file into memory; the path is relative to the
# notebook's working directory.
df = getDF('Grocery_and_Gourmet_Food_5.json.gz')

### Subset the data to what we need and store it in dg. 



In [69]:
# Keep only the star rating and the review text. The explicit copy makes dg
# an independent frame, so the later cells that modify dg do not operate on
# a slice of df (which triggers pandas' SettingWithCopyWarning).
dg = df[['overall','reviewText']].copy()

### Cleaning the data. 


#### (a) handle NaNs. 

In [70]:
# Number of missing values in each column of the raw frame.
df.isna().sum()

overall                 0
verified                0
reviewTime              0
reviewerID              0
asin                    0
reviewerName          138
reviewText            390
summary               219
unixReviewTime          0
vote               985658
style              551774
image             1134350
dtype: int64

In [71]:
# Remove rows with a missing rating or review text. Rebinding dg instead of
# dropping in place avoids mutating a frame that was sliced out of df and
# keeps the cell safe to re-run.
dg = dg.dropna()

#### (b) Add a sentiment column: 1 if the overall rating is greater than 3, 0 if it is less than 3.

In [72]:
# Label each review: 1.0 when the rating is above 3 stars, 0.0 when it is
# below 3 stars. Rows rated exactly 3 match neither condition and get NaN.
# assign() builds a new frame, so nothing is written into a possible slice.
dg = dg.assign(
    sentiment=np.select(
        [dg['overall'] > 3.0, dg['overall'] < 3.0],
        [1.0, 0.0],
        default=np.nan,
    )
)

In [73]:
# Remove any reviews with a neutral rating of 3 stars: those rows were left
# without a sentiment value, so dropping rows that contain a NaN removes
# them. Rebinding dg avoids the in-place mutation of a derived frame.
dg = dg.dropna(axis=0, how='any')


In [74]:
# Convert to lower case to reduce complexity of bag of words matrix.
# The vectorized .str accessor replaces the per-row lambda, and assign()
# returns a new frame instead of writing into a column of a derived frame.
# NOTE(review): CountVectorizer's default `lowercase=True` probably makes
# this step redundant - confirm before removing it.
dg = dg.assign(reviewText=dg['reviewText'].str.lower())

### Final data before fitting.

In [75]:

# Preview the first rows of the cleaned frame (rating, text, sentiment).
dg.head()

Unnamed: 0,overall,reviewText,sentiment
0,5.0,no adverse comment.,1.0
1,5.0,gift for college student.,1.0
2,5.0,"if you like strong tea, this is for you. it mi...",1.0
3,5.0,love the tea. the flavor is way better than th...,1.0
4,5.0,i have searched everywhere until i browsed ama...,1.0


In [76]:
# Initialize the bag-of-words vectorizer; no arguments are passed, so every
# setting stays at the library default.
vectorizer = CountVectorizer()

In [77]:
# Features are the review texts; the target is the sentiment label.
X = dg['reviewText']
y = dg['sentiment']

### split the data in train and test sets. 

In [78]:
# Draw 5000 training rows and 1000 test rows; the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=5000,
    test_size=1000,
    random_state=42,
)

In [79]:
# Convert to bag-of-words matrices: the vectorizer is fitted on the training
# texts only, and the test texts are then encoded with that fitted vectorizer.
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)


### The data is ready for modelling. Several classifiers are fitted below and compared on the held-out test set.

### SVM linear

In [85]:
 # Fit a linear-kernel SVM on the training matrix, then report its score on
 # the held-out test matrix.
 svc_lin = svm.SVC(C=0.025, kernel="linear").fit(X_train_bow, y_train)
 svc_lin.score(X_test_bow, y_test)


0.925

### SVM non-linear


In [83]:
# SVM with the kernel left at the library default; fit on the training
# matrix and report the score on the held-out test matrix.
clf = svm.SVC(C=1, gamma='auto').fit(X_train_bow, y_train)
clf.score(X_test_bow, y_test)


0.917

### Classification Tree

In [63]:
# Decision tree with a fixed random_state; fit on the training matrix and
# report the score on the held-out test matrix.
clf = DecisionTreeClassifier(random_state=0).fit(X_train_bow, y_train)
clf.score(X_test_bow, y_test)

0.89

### Random Forest


In [66]:
# Random forest with a fixed random_state; fit on the training matrix and
# report the score on the held-out test matrix.
clf = RandomForestClassifier(random_state=0).fit(X_train_bow, y_train)
clf.score(X_test_bow, y_test)

0.95