# Yelp Data Challenge - NLP

BitTiger DS501

Sep 2018

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('clean_busi_rev_joint.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488769 entries, 0 to 488768
Data columns (total 14 columns):
business_id     488769 non-null object
categories      488769 non-null object
city            488769 non-null object
name            488769 non-null object
review_count    488769 non-null int64
avg_stars       488769 non-null float64
cool            488769 non-null int64
date            488769 non-null object
funny           488769 non-null int64
review_id       488769 non-null object
stars           488769 non-null int64
text            488769 non-null object
useful          488769 non-null int64
user_id         488769 non-null object
dtypes: float64(1), int64(5), object(8)
memory usage: 52.2+ MB


### Define your feature variables, here is the text of the review

In [3]:
df.head(5)

Unnamed: 0,business_id,categories,city,name,review_count,avg_stars,cool,date,funny,review_id,stars,text,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,"Steakhouses, Restaurants, Cajun/Creole",Las Vegas,Delmonico Steakhouse,1546,4.0,0,2017-02-14,0,VETXTwMw6qxzOVDlXfe6Tg,5,went for dinner tonight. Amazing my husband ha...,0,ymlnR8UeFvB4FZL56tCZsA
1,--9e1ONYQuAa-CB_Rrw7Tw,"Steakhouses, Restaurants, Cajun/Creole",Las Vegas,Delmonico Steakhouse,1546,4.0,0,2017-12-04,0,S8-8uZ7fa5YbjnEtaW15ng,5,This was an amazing dinning experience! ORDER ...,0,9pSSL6X6lFpY3FCRLEH3og
2,--9e1ONYQuAa-CB_Rrw7Tw,"Steakhouses, Restaurants, Cajun/Creole",Las Vegas,Delmonico Steakhouse,1546,4.0,0,2016-08-22,1,1nK5w0VNfDlnR3bOz13dJQ,5,My husband and I went there for lunch on a Sat...,1,gm8nNoA3uB4In5o_Hxpq3g
3,--9e1ONYQuAa-CB_Rrw7Tw,"Steakhouses, Restaurants, Cajun/Creole",Las Vegas,Delmonico Steakhouse,1546,4.0,0,2016-09-13,0,N1Z93BthdJ7FT2p5S22jIA,3,Went for a nice anniversary dinner. Researched...,0,CEtidlXNyQzgJSdF1ubPFw
4,--9e1ONYQuAa-CB_Rrw7Tw,"Steakhouses, Restaurants, Cajun/Creole",Las Vegas,Delmonico Steakhouse,1546,4.0,0,2016-08-08,0,ir-EVhHyWna7KqYWtj660g,5,Hands down the best meal and service I have ev...,0,9_BhDyzJYf2JwTD9TyXJ4g


In [5]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents=df['text'].values
documents[10]

'10/27/2016-I had my birthday dinner here and it was worth the splurge.  I have to say something about the décor - it was too plain and sterile and I expected a little more character from this establishment.  I would have added some wood paneling or some framed mirrors.  But the food was worth it.  We ordered the New Orleans BBQ Shrimp, Butternut Squash Ravioli, Traditional New Orleans Gumbo, and of course the 16 oz Rib Eye steak.  Our side dishes included the Country Smashed Potato, Delmonico Creamed Spinach, Buttered Fresh Asparagus, and Sauteed Garlic Mushrooms.  All the dishes we ordered were delicious.  Our steak was medium rare and it was sooo good.  The meat was tender with the right amount of fatty goodness.  My favorite sides were the asparagus (cooked just right, not over or under cooked) and the mushrooms.  The creamed spinach was a little too salty for me.  They added a nice touch to my birthday by serving complimentary appetizer and dessert (mango sorbet)'

In [4]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
print(documents.isnull().sum(),len(documents))


0 488769


### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [7]:
# Make a column and take the values, save to a variable named "target"
df['target_2']=[1 if element > 4 else 0 for element in df['stars']]
df['target']=df['stars']>4
print(df['target'][0:5],df['target_2'][0:5])

0     True
1     True
2     True
3    False
4     True
Name: target, dtype: bool 0    1
1    1
2    1
3    0
4    1
Name: target_2, dtype: int64


#### You may want to look at the statistic of the target variable

In [8]:
# To be implemented
target=df['target']
print(sum(target),len(target))

237791 488769


## Let's create training dataset and test dataset

In [9]:
from sklearn.cross_validation import train_test_split



In [10]:
# Documents is your X, target is your y
# Now split the data to training set and test set
documents_train, documents_test, target_train, target_test = train_test_split(documents, target, test_size=0.3, random_state=0)

In [48]:
documents_train.shape


(342138,)

## Let's get NLP representation of the documents

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
type(vectorizer)

sklearn.feature_extraction.text.TfidfVectorizer

In [14]:
# Train the model with your training data
vectors = vectorizer.fit_transform(documents_train).toarray()
words = vectorizer.get_feature_names()

In [15]:
words[100:110]

['asian',
 'aside',
 'ask',
 'asked',
 'asking',
 'asparagus',
 'ass',
 'ate',
 'atmosphere',
 'attention']

In [16]:
# Get the vocab of your tfidf
print(vectors.shape,vectors[0:5],words[0:5])

(342138, 2000) [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.35126663 0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]] ['00', '10', '100', '11', '12']


In [17]:
# Use the trained model to transform your test data
y_test = target_test
X_test = vectorizer.transform(documents_test)

## Similar review search engine

In [18]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    pass  # To be implemented


### top 20 words in the training set

In [25]:
avg = np.sum(vectors, axis=0) 
print("top 20 by average tf-idf")
print(get_top_values(avg, 20, words))


top 20 by average tf-idf
['food', 'great', 'good', 'place', 'service', 'time', 'just', 'like', 'best', 'vegas', 'amazing', 'really', 'delicious', 'chicken', 'love', 'ordered', 'restaurant', 'definitely', 'order', 'got']


### look for top 5 reviews similar to search query

In [19]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
# Draw an arbitrary review from test (unseen in training) documents
search_query=documents_test[0]
print(search_query)

I'm not giving you 5 Stars today because of the service. It's Monday morning, Memorial Day, there's no line out here, at 9 o'clock in the morning and there are numbers of empty tables in your restaurant. We ordered our food that part of the service was wonderful but then we felt like they rushed us to get the heck off their table. Could I please finish my cup of coffee for a $50 breakfast?


In [24]:
# Transform the drawn review(s) to vector(s)
from nltk.corpus import stopwords
search_query_vectorized = vectorizer.transform([search_query]).toarray() # search_query needs to be a list format! [search_query]
search_query_vectorized
search_query_vectorized.shape

(1, 2000)

In [25]:
simi_score_search=cosine_similarity(search_query_vectorized,vectors)

In [27]:
print(simi_score_search[0:10])

[[0.09673526 0.05190409 0.11683443 ... 0.033286   0.06492736 0.07845691]]


In [28]:
#find the top 5 silimar reviews
n_rev=5
returned_reviews=get_top_values(simi_score_search[0],n_rev,documents_train.tolist())

In [29]:
returned_reviews

['Heather was wonderful. Excellent fast and hot food with the best service and smiles for is this morning. We will always be back  for breakfast here.',
 "This place is s must for your Morning start.. Today we came on a Monday and it was pleasant not to have to wait a long line..although we would do it any time because it's worth it...",
 'Breakfast- good; staff- friendly; food service-a little slow; atmosphere- relaxed, not too crowded (Monday morning).',
 "I would give this place 0 stars if I could... I'm currently fighting off food poisoning from the food I ate this morning.",
 'Had breakfast this morning and as always the food and service is great.  Maggi was attentive and very sweet, Jose cooked our food and it was delicious.  Five stars!']

In [33]:
# Calculate the similarity score(s) between vector(s) and training vectors
print('Compare "%s" \nwith "%s"'%(search_query, documents_train[0]))
print(cosine_similarity(search_query_vectorized.reshape(1, -1), vectors[1].reshape(1, -1)))
print(search_query_vectorized.reshape(1, -1))

Compare "I'm not giving you 5 Stars today because of the service. It's Monday morning, Memorial Day, there's no line out here, at 9 o'clock in the morning and there are numbers of empty tables in your restaurant. We ordered our food that part of the service was wonderful but then we felt like they rushed us to get the heck off their table. Could I please finish my cup of coffee for a $50 breakfast?" 
with "I'm jumping on the 2 star train. 

They're lucky they are even getting 2 stars from me. The 2 stars come the bartender who was nice and the corned beef hash that was delicious. I feel the buffet is a bit over priced. $33 a head for early bird breakfast and mimosas that are supposed to be bottomless. The food was average at best; nothing "grand" about this buffet. The mimosas are not brought to the table, you have to actually go to the bar and get your mimosas. We were at our table for about 20 minutes waiting for the bar to open. The hosts kept telling us the bartender was coming s

In [34]:
simi_score=list()
for index in range(len(documents_train[1:])):
    #print('"%s" compared with "%s"'%(documents_train[0], documents_train[index+1]))
    #print('cosine similarity:', cosine_similarity(vectors[0].reshape(1, -1),vectors[index+1].reshape(1, -1)))
    simi_score.append(cosine_similarity(search_query_vectorized.reshape(1, -1),vectors[index+1].reshape(1, -1)))
        

In [35]:
test=cosine_similarity(vectors[0].reshape(1, -1),vectors[1].reshape(1, -1))
vectors[1]

array([0., 0., 0., ..., 0., 0., 0.])

In [50]:
test_2=cosine_similarity(vectors[0],vectors[1])

ValueError: Expected 2D array, got 1D array instead:
array=[0. 0. 0. ... 0. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [49]:
vectors[1].reshape(1,-1)

array([[0., 0., 0., ..., 0., 0., 0.]])

In [36]:
print(simi_score[0:10], type(simi_score))

[array([[0.05190409]]), array([[0.11683443]]), array([[0.00579007]]), array([[0.]]), array([[0.01074607]]), array([[0.03475928]]), array([[0.0314713]]), array([[0.0151044]]), array([[0.01550143]]), array([[0.11866998]])] <class 'list'>


In [37]:
sorted=np.sort(simi_score,axis=None)
sorted
sorted_ind=np.argsort(simi_score,axis=None)
sorted_ind

array([269006,  83141,  83138, ..., 292396, 270285,  65694], dtype=int64)

In [38]:
# Let's find top 5 similar reviews

n = 5
sorted_5=sorted_ind[len(sorted)-6:-1]
print(documents_train[sorted_5])

['Wow. Just pretty bad . I was running errands and I stopped in since I was in that area, just because I had a coupon for a nice new salad. Well I pull in heading towards the driveway and there were cones blocking the entrance. Ok. I\'ll get out and go in to order my avocado salad with the coupon . The girl screams " we can\'t take credit cards or coupons our system is down " Ok . Mientras todos los empleados estan hablando en Español tan ruidoso de frente de los clientes que no son hispanohablantes. This can be perceived by non speaking clients as rude, I speak and understand Spanish. This isn\'t a legit Mexican place , Not a local place that caters to a Latino clientele . I know that being in the service industry myself, corporate at that, it is a No-No. English should be spoken up in the front and Spanish in the back. So anyways, the girl tells me no credit. I nodded. Then I pull out the coupon to show her the new salad I wanted, and before I can finish , she says " No coupons " I s

In [39]:
print('Our search query:')
print(search_query)
print("Vectorized query:", search_query_vectorized)

Our search query:
I'm not giving you 5 Stars today because of the service. It's Monday morning, Memorial Day, there's no line out here, at 9 o'clock in the morning and there are numbers of empty tables in your restaurant. We ordered our food that part of the service was wonderful but then we felt like they rushed us to get the heck off their table. Could I please finish my cup of coffee for a $50 breakfast?
Vectorized query: [[0. 0. 0. ... 0. 0. 0.]]


In [40]:
print('Most %s similar reviews:' % n)
print(documents_train[sorted_5])  # this give the same results as the output of last cell

Most 5 similar reviews:
['Wow. Just pretty bad . I was running errands and I stopped in since I was in that area, just because I had a coupon for a nice new salad. Well I pull in heading towards the driveway and there were cones blocking the entrance. Ok. I\'ll get out and go in to order my avocado salad with the coupon . The girl screams " we can\'t take credit cards or coupons our system is down " Ok . Mientras todos los empleados estan hablando en Español tan ruidoso de frente de los clientes que no son hispanohablantes. This can be perceived by non speaking clients as rude, I speak and understand Spanish. This isn\'t a legit Mexican place , Not a local place that caters to a Latino clientele . I know that being in the service industry myself, corporate at that, it is a No-No. English should be spoken up in the front and Spanish in the back. So anyways, the girl tells me no credit. I nodded. Then I pull out the coupon to show her the new salad I wanted, and before I can finish , she

#### Q: Does the result make sense to you?

A: Yes, the top 5 are all negative reviews like the search query

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [15]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

model.fit(vectors, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Get score for training set
model.score(vectors, target_train)

0.8075513389334128

In [17]:
# Get score for test set
model.score(X_test, y_test)

0.8074145303516992

#### Logistic Regression Classifier

In [41]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression
model_1= LogisticRegression()
model_1.fit(vectors,target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
# Get score for training set
model_1.score(vectors, target_train)

0.8342014041117911

In [19]:
# Get score for test set
model_1.score(X_test, y_test)

0.8306019873014574

In [42]:
model_1.coef_

array([[-1.04143952e+00,  1.06228843e+00,  1.26364661e+00, ...,
         1.17322515e+00, -2.89675024e+00,  7.55185028e-04]])

In [44]:
n_word = 20
get_top_values(model_1.coef_[0], n_word, words)

['amazing',
 'best',
 'incredible',
 'thank',
 'awesome',
 'phenomenal',
 'perfection',
 'heaven',
 'delicious',
 'perfect',
 'highly',
 'fantastic',
 'excellent',
 'great',
 'favorite',
 'impeccable',
 'outstanding',
 'love',
 'holy',
 'fabulous']

#### Q: What are the key features(words) that make the positive prediction?

In [45]:
# Let's find it out by ranking
n_pos = 20
rev_pos=df[df['stars']>=4]['text']
print(rev_pos[0:5])
pos_vec=vectorizer.transform(rev_pos).toarray()
avg_pos = np.sum(pos_vec>0, axis=0) 
print(len(avg_pos))
print("top 30 by positive tf-idf")
print(get_top_values(avg_pos,n_pos,words))

0    went for dinner tonight. Amazing my husband ha...
1    This was an amazing dinning experience! ORDER ...
2    My husband and I went there for lunch on a Sat...
4    Hands down the best meal and service I have ev...
5    ABSOLUTE MUST IN VEGAS! Loved everything my bo...
Name: text, dtype: object
2000
top 30 by positive tf-idf
['food', 'great', 'place', 'good', 'service', 'delicious', 'time', 'vegas', 'best', 'like', 'amazing', 'just', 'definitely', 'really', 'friendly', 'try', 'love', 'restaurant', 'nice', 'ordered']


A: delightful,beautifully sounds positive

#### Q: What are the key features(words) that make the negative prediction?

In [109]:
# Let's find it out by ranking
n_neg= 20
rev_neg=df[df['stars']<4]['text']
print(rev_neg[0:5])
neg_vec=vectorizer.transform(rev_neg).toarray()
avg_neg = np.sum(neg_vec, axis=0) 
print("top 20 by neg tf-idf")
print(get_top_values(avg_neg,n_neg,words))

3     Went for a nice anniversary dinner. Researched...
7     I had high hopes for Delmonico's Steakhouse in...
11    Good food.  Horrible service.  Had dinner in e...
12    My wife and I were very excited to visit the r...
19    Great ribeye steak.  Cooked perfectly.  Was a ...
Name: text, dtype: object
top 20 by neg tf-idf
['french', 'greet', 'kimchi', 'located', 'shame', 'owner', 'plenty', 'owners', 'till', 'difficult', 'downside', 'mouth', 'grits', 'attentive', 'cakes', 'opinion', 'appetizers', 'salty', 'red', 'bed']


A: difficult,downside,grits,salty sounds negative, frech,greet,shame,plenty,red,kimchi


#### Random Forest Classifier

In [18]:
# Build a Random Forest Classifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier(n_estimators=100,max_depth=20,min_samples_leaf=20, random_state=1)
model_2= OneVsRestClassifier(estimator,n_jobs=-1)
model_2.fit(vectors,target_train)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=1, verbose=0, warm_start=False),
          n_jobs=-1)

In [19]:
# Get score for training set
model_2.score(vectors, target_train)

0.796812981896194

In [20]:
# Get score for test set
model_2.score(X_test, y_test)

0.7901262352435706

#### Q: What do you see from the training score and the test score?

A: the training set has a better score than the test set, suggesting overfitting

#### Q: Can you tell what features (words) are important by inspecting the RFC model?
The results below suggest the words in the food categories have the highest tf-idf

In [111]:
n = 20
avg = np.sum(vectors, axis=0) / np.sum(vectors > 0, axis=0)
print("top 20 by average tf-idf")
print(get_top_values(avg, n, words))


top 20 by average tf-idf
['hookah', 'ramen', 'pho', 'karaoke', 'udon', 'crawfish', 'donuts', 'bagel', 'boba', 'donut', 'indian', 'mi', 'kbbq', 'gyro', 'filipino', 'catfish', 'smoothie', 'pastrami', 'greek', 'philly']


## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [16]:
# To be implemented
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

train_mse_arr = np.array([])
test_mse_arr = np.array([])

#for i in pca_range:

    #train_subset = vectors[:, :i]

    #model_1.fit(train_subset, target_train)

    # Get train error
train_mse = cross_val_score(model_1, vectors, y=target_train,
                                scoring='neg_mean_squared_error', cv=5) * -1
train_mse_arr = np.append(train_mse_arr, train_mse.mean())
    
    # Get test error
test_set = X_test
test_mse = mean_squared_error(model_1.predict(test_set), y_test)
test_mse_arr = np.append(test_mse_arr, test_mse)

In [27]:
print(train_mse,test_mse_arr)

[0.16763606 0.16858596 0.16770912 0.1702081  0.17110455] [0.16939801]


## Extra Credit #2: Use grid search to find best predictable classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [21]:
# To be implemented
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV

# Choose the type of classifier. 
clf = RandomForestClassifier()

# Choose some parameter combinations to try
param_grid = {'n_estimators': [50], 
              'max_features': ['auto'], 
              'criterion': ['gini'],
              'max_depth': [10,20], 
              'min_samples_split': [2],
              'min_samples_leaf': [10,20],
              'n_jobs':[-1]
             }

# Type of scoring used to compare parameter combinations
acc_scorer = make_scorer(roc_auc_score)

# Run the grid search
# read theory
grid_obj = GridSearchCV(clf, param_grid, cv=5, scoring=acc_scorer)
grid_obj = grid_obj.fit(vectors, target_train)

# Set the clf to the best combination of parameters
clf = grid_obj.best_estimator_

# Fit the best algorithm to the data. 
clf.fit(vectors, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=20, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)