In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling for every figure drawn in this notebook.
sns.set_style("darkgrid")

# Both input tables are read from the same relative data directory.
data_path = "data/"
places = pd.read_csv(f"{data_path}places_final.csv")
reviews = pd.read_csv(f"{data_path}reviews_final.csv")

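Attach each place's `price` to its reviews, drop rows with a missing `reviewTextClean` or `price`, and keep only the columns needed for modelling.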

In [2]:
# Build the modelling frame in a single pass:
#   1. left-join the place price onto every review (keyed on gPlusPlaceId),
#   2. drop reviews that have no cleaned text or no price,
#   3. keep only the label, rating, place id and text columns.
df = (
    pd.merge(
        reviews,
        places[["gPlusPlaceId", "price"]],
        how="left",
        on="gPlusPlaceId",
    )
    .dropna(subset=["reviewTextClean", "price"])
    .loc[:, ["price", "rating", "gPlusPlaceId", "reviewTextClean"]]
)

In [3]:
# Inspect the merged and filtered frame (the original row index is kept, so it has gaps).
df

Unnamed: 0,price,rating,gPlusPlaceId,reviewTextClean
0,3.0,3.0,106689630448064755324,cute hotel good amenity nice location great cr...
1,3.0,4.0,108256990636148259283,love place massage facial technician best loun...
6,2.0,5.0,105947477166033397439,service amazing line go fast
7,1.0,5.0,107098981103934500500,get chicken green salad yum
8,1.0,5.0,108585910849109169666,never falafel bar yum super crunchy sweet pota...
...,...,...,...,...
257100,2.0,4.0,110548558285915713747,delicious pizza price good downfall amount peo...
257102,3.0,1.0,107346748950819090586,went mole festival saturday night worst restau...
257103,2.0,5.0,110403843200459675752,42nd street photo camera wa looking best price...
257106,1.0,4.0,101421411984715145689,true life saver


Create BoW representation of `reviewTextClean`.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Raw review strings, one entry per row of df.
review_texts = df["reviewTextClean"].values

# Learn the vocabulary and count term occurrences per review in one call.
count_vect = CountVectorizer()
bow_counts = count_vect.fit_transform(review_texts)

# (number of reviews, number of vocabulary terms)
bow_counts.shape

(96008, 54767)

In [5]:
# Report the vocabulary size (distinct terms) and the corpus size (rows of the count matrix).
print(f"Length of vocabulary: {len(count_vect.vocabulary_)}")
print(f"Length of corpus: {bow_counts.shape[0]}")

Length of vocabulary: 54767
Length of corpus: 96008


Create tf-idf representation of `reviewTextClean`.

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts with tf-idf.
# The transformer is fitted on every row of bow_counts; the
# train/validation/test split is only made in a later cell.
tfidf = TfidfTransformer()
bow_tfidf = tfidf.fit(bow_counts).transform(bow_counts)

In [7]:
from sklearn.model_selection import train_test_split

# Fixed seed so that both splits -- and every metric computed from them --
# are identical after a kernel restart. Without it each run shuffles the
# rows differently and the reported scores are not comparable between cells.
SPLIT_SEED = 42

# Shift the price labels down by one (the displayed prices 1.0-3.0 become 0.0-2.0).
y = df.price.values - 1

# Split into development and test sets (20% test), stratified on the label.
# The BoW and tf-idf matrices are split together so their rows stay aligned.
X_bow_dev, X_bow_test, X_tfidf_dev, X_tfidf_test, y_dev, y_test = train_test_split(
    bow_counts,
    bow_tfidf,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SPLIT_SEED,
)

# Split development into training and validation sets (10% validation), again stratified.
X_bow_train, X_bow_val, X_tfidf_train, X_tfidf_val, y_train, y_val = train_test_split(
    X_bow_dev,
    X_tfidf_dev,
    y_dev,
    test_size=0.1,
    stratify=y_dev,
    random_state=SPLIT_SEED,
)

In [8]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
def classification_metrics(header_text, clf, X_train, X_test, y_train, y_test):
    """Print a short evaluation report for a classifier.

    The report consists of a banner containing ``header_text``, the accuracy
    on ``(X_train, y_train)``, and the accuracy, weighted F1 score and
    confusion matrix on ``(X_test, y_test)``. The second pair is labelled
    "Val" in the printed output.

    Args:
        header_text: Title shown in the banner.
        clf: Object exposing ``predict(X)``; it is not fitted here.
        X_train: Features used for the "Train Accuracy" line.
        X_test: Features used for the "Val Accuracy", F1 and confusion matrix.
        y_train: True labels matching ``X_train``.
        y_test: True labels matching ``X_test``.

    Returns:
        None. Everything is written to stdout.
    """
    banner_width = 35
    rule = "=" * banner_width

    print(rule)
    # Left padding that roughly centres the title under the rule.
    left_pad = " " * int((banner_width - len(header_text)) / 2)
    print(left_pad, header_text)
    print(rule)

    train_pred = clf.predict(X_train)
    print("Train Accuracy:", accuracy_score(y_train, train_pred))

    eval_pred = clf.predict(X_test)
    print("Val Accuracy:", accuracy_score(y_test, eval_pred))
    print("F1 Score:", f1_score(y_test, eval_pred, average="weighted"))
    print(confusion_matrix(y_test, eval_pred))

In [40]:
from sklearn.dummy import DummyClassifier

# Baseline: a dummy classifier that always predicts the most frequent class.
dummy = DummyClassifier(strategy="most_frequent").fit(X_bow_train, y_train)

# NOTE(review): y_pred is not read by the report call below -- confirm whether it is still needed.
y_pred = dummy.predict(X_bow_val)

classification_metrics(
    header_text="Dummy classifier",
    clf=dummy,
    X_train=X_bow_train,
    X_test=X_bow_val,
    y_train=y_train,
    y_test=y_val,
)

          Dummy classifier
Train Accuracy: 0.558249547920434
Val Accuracy: 0.5582606431454238
F1 Score: 0.40000361564167053
[[   0 1382    0]
 [   0 4288    0]
 [   0 2011    0]]


In [41]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the bag-of-words counts.
# The saga solver shuffles the data while optimising, so the seed is fixed to
# make the fitted coefficients and the reported metrics reproducible.
# NOTE(review): max_iter is left at its default; confirm the fit raises no
# ConvergenceWarning on this data.
lr = LogisticRegression(solver="saga", random_state=42)
lr.fit(X_bow_train, y_train)
classification_metrics("LogisticRegression BoW", lr, X_bow_train, X_bow_val, y_train, y_val)

       LogisticRegression BoW
Train Accuracy: 0.7251066907775768
Val Accuracy: 0.6378075771383934
F1 Score: 0.6151654448298587
[[ 471  851   60]
 [ 241 3611  436]
 [  28 1166  817]]




In [42]:
# Logistic regression on the tf-idf features (rebinds `lr` from the BoW cell).
# The saga solver shuffles the data while optimising, so the seed is fixed to
# make the fitted coefficients and the reported metrics reproducible.
# NOTE(review): max_iter is left at its default; confirm the fit raises no
# ConvergenceWarning on this data.
lr = LogisticRegression(solver="saga", random_state=42)
lr.fit(X_tfidf_train, y_train)
classification_metrics("LogisticRegression tf-idf", lr, X_tfidf_train, X_tfidf_val, y_train, y_val)

      LogisticRegression tf-idf
Train Accuracy: 0.7312839059674503
Val Accuracy: 0.649654992839474
F1 Score: 0.6289672961670463
[[ 471  847   64]
 [ 224 3621  443]
 [  20 1093  898]]


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the tf-idf features.
# The forest draws random bootstrap samples and random feature subsets, so the
# seed is fixed to make the trained model and its metrics reproducible.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_tfidf_train, y_train)
classification_metrics("RandomForest tf-idf", rfc, X_tfidf_train, X_tfidf_val, y_train, y_val)

In [None]:
import xgboost

# Gradient-boosted trees on the bag-of-words counts, evaluated with the
# multi-class log-loss metric.
xgb = xgboost.XGBClassifier(
    eval_metric="mlogloss",
    use_label_encoder=False,
)
xgb.fit(X_bow_train, y_train)
classification_metrics(
    header_text="XGBoost BoW",
    clf=xgb,
    X_train=X_bow_train,
    X_test=X_bow_val,
    y_train=y_train,
    y_test=y_val,
)

In [38]:
# Same XGBoost configuration, trained on the tf-idf features instead
# (rebinds `xgb`; relies on `xgboost` having been imported by an earlier cell).
xgb = xgboost.XGBClassifier(
    eval_metric="mlogloss",
    use_label_encoder=False,
)
xgb.fit(X_tfidf_train, y_train)
classification_metrics(
    header_text="XGBoost tf-idf",
    clf=xgb,
    X_train=X_tfidf_train,
    X_test=X_tfidf_val,
    y_train=y_train,
    y_test=y_val,
)

          XGBoost tf-idf
Train Accuracy: 0.6870596745027124
Val Accuracy: 0.626220544199974
F1 Score: 0.5770760772029238
[[ 362  990   30]
 [ 134 3928  226]
 [  12 1479  520]]
