In [1]:
import pandas as pd
from textblob import Word
from textblob import TextBlob
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import wordnet as wn


In [2]:
# Lazy way to verify certificate. Don't run unless you need to.

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [3]:
stop = stopwords.words('english')

In [4]:
nltk.download('wordnet')

food_sets = wn.synsets('food')

food_stop_words = list()

for food_set in food_sets:
    food_stop_words += list(set([w.replace('_', ' ') for s in
                            food_set.closure(lambda s: s.hyponyms())
                            for w in s.lemma_names()]))

[nltk_data] Downloading package wordnet to /Users/kevin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Import data to Pandas dataframe

df = pd.read_json('review0.json', lines=True)
df.head(5)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,0W4lkclzZThpx3V65bVgig,0,2016-05-28,0,v0i_UHJMo_hPBq9bxWvW4w,5,"Love the staff, love the meat, love the place....",0,bv2nCi5Qv5vroFiqKGopiw
1,AEx2SYEUJmTxVVB18LlCwA,0,2016-05-28,0,vkVSCC7xljjrAI4UGfnKEQ,5,Super simple place but amazing nonetheless. It...,0,bv2nCi5Qv5vroFiqKGopiw
2,VR6GpWIda3SfvPC-lg9H3w,0,2016-05-28,0,n6QzIUObkYshz4dz2QRJTw,5,Small unassuming place that changes their menu...,0,bv2nCi5Qv5vroFiqKGopiw
3,CKC0-MOWMqoeWf6s-szl8g,0,2016-05-28,0,MV3CcKScW05u5LVfF6ok0g,5,Lester's is located in a beautiful neighborhoo...,0,bv2nCi5Qv5vroFiqKGopiw
4,ACFtxLv8pGrrxMm6EgjreA,0,2016-05-28,0,IXvOzsEMYtiJI0CARmj77Q,4,Love coming here. Yes the place always needs t...,0,bv2nCi5Qv5vroFiqKGopiw


In [6]:
col = ['stars', 'text']
cleaned_df = df[col]
cleaned_df = cleaned_df[pd.notnull(cleaned_df['text'])]
cleaned_df.columns = ['rating', 'review']

# Bin the reviews into 3 classes

cleaned_df['bin'] = pd.cut(cleaned_df['rating'], [0, 2, 4, float('inf'
                           )], labels=['1', '2', '3'])

# Set Dataframe to lower

cleaned_df['review'] = cleaned_df['review'].apply(lambda x: \
        ' '.join(x.lower() for x in x.split()))

# Remove symbols

cleaned_df['review'] = cleaned_df['review'].str.replace('[^\w\s]', '')

# Lemmatize

cleaned_df['review'] = cleaned_df['review'].apply(lambda x: \
        ' '.join([Word(word).lemmatize() for word in x.split()]))

# Add Sentiment column

cleaned_df['sentiment'] = cleaned_df['review'].apply(lambda x: \
        TextBlob(x).sentiment[0])

# Remove stop words

cleaned_df['review'] = cleaned_df['review'].apply(lambda x: ' '.join(x
        for x in x.split() if x not in stop))
cleaned_df['review'] = cleaned_df['review'].apply(lambda x: ' '.join(x
        for x in x.split() if x not in food_stop_words))


In [7]:
cleaned_df.head(5)

Unnamed: 0,rating,review,bin,sentiment
0,5,love staff love love place prepare long line a...,3,0.131481
1,5,super simple place amazing nonetheless around ...,3,0.251389
2,5,small unassuming place change every often cool...,3,0.320833
3,5,lesters located beautiful neighborhood ha sinc...,3,0.45
4,4,love coming yes place always need floor swept ...,2,0.129688


In [9]:
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words=food_stop_words,
    )

# Convert the reviews to tf-idf features

features = tfidf.fit_transform(cleaned_df.review)

# Obtain the class labels 'bin'

labels = cleaned_df.bin

# Add column for sentiment

features = hstack((features, np.array(cleaned_df['sentiment'])[:,
                  None]))

# Print the shape (row, columns)

print (features.shape)


(5000, 11796)


In [13]:
# Bagged LR for Classification
seed = 7
kfold = model_selection.KFold(n_splits=20, random_state=seed)
cart = LogisticRegression()
num_trees = 5
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, features, labels, cv=kfold)
print(results.mean())

0.6854


In [14]:
model.fit(features,labels)


BaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=5, n_jobs=1, oob_score=False,
         random_state=7, verbose=0, warm_start=False)

In [23]:
input_text = input("Enter Review: ")
input_text = [input_text]
input_text = pd.DataFrame(input_text, columns=['a'])

# Set Dataframe to lower 
input_text['a'] = input_text['a'].str.lower()

# # Remove symbols
input_text['a'] = input_text['a'].str.replace('[^\w\s]','')

# # Lemmatize 
# cleaned_df['a'] = cleaned_df['a'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

# # Add Sentiment column
input_text['sentiment'] = input_text['a'].apply(lambda x: TextBlob(x).sentiment[0] )


input_transform = tfidf.transform(input_text['a'])
input_transform = hstack((input_transform, np.array(input_text['sentiment'])[:,
                  None]))



print(model.predict(input_transform)[0])

Enter Review: the food was amazing
3
