In [151]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.sparse import hstack, vstack, coo_matrix, csr_matrix, bmat
sns.set_style("darkgrid")
data_path = "data/"

# Model building

Build models with:
- BoW or tf-idf features
- Raw review length
- Lix number [see here](https://en.wikipedia.org/wiki/Lix_(readability_test))
- Sentiment score (3 scores)

In [53]:
df = pd.read_csv("data/NLP_data.csv")
df['reviewTextClean'] = df['reviewTextClean'].str.lower()
df.dropna(subset=["reviewTextClean"], inplace=True)

In [136]:
df.head()

Unnamed: 0,rating,reviewText,reviewTextClean,posReviewPercent,negReviewPercent,midReviewPercent,price,LIX,NumberOfWords
0,3.0,This is a very cute hotel with good amenities ...,cute hotel good amenity nice location great cr...,0.141,0.109,0.751,3.0,28.5,50
1,4.0,Love this place. The Great/Good: Massage an...,love place massage facial technician best loun...,0.234,0.051,0.716,3.0,82.047619,63
2,5.0,"service is amazing, the line goes so fast.",service amazing line go fast,0.352,0.0,0.648,2.0,33.0,8
3,5.0,Get the chicken green salad. Yum.,get chicken green salad yum,0.0,0.0,1.0,1.0,19.666667,6
4,5.0,Never had a falafel bar before. Yum. +1 for su...,never falafel bar yum super crunchy sweet pota...,0.266,0.0,0.734,1.0,26.047619,21


In [55]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
bow_counts = count_vect.fit_transform(df['reviewTextClean'].values)
bow_counts.shape

(96001, 33795)

In [56]:
print("Length of vocabulary:", len(count_vect.vocabulary_))
print("Length of corpus:", bow_counts.shape[0])

Length of vocabulary: 33795
Length of corpus: 96001


Create tf-idf representation of `reviewTextClean`

In [57]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
bow_tfidf = tfidf.fit_transform(bow_counts)

Add remaining features to both bow_counts and bow_tfidf

In [137]:
#rem_feats = csr_matrix(df[["posReviewPercent", "negReviewPercent", "midReviewPercent", "LIX", "NumberOfWords"]])
rem_feats = csr_matrix(df[["posReviewPercent", "negReviewPercent", "LIX", "NumberOfWords"]])
bow_counts_full = hstack([bow_counts, rem_feats]).tocsr()
bow_tfidf_full = hstack([bow_tfidf, rem_feats]).tocsr()

First the data is split into training and test. In this case, to keep a completely clean test set, two splits are made. First we split the data into a developement set and a test set. This test set is set aside for final testing once the models has been tuned. The developement set is then split into train and validation such that one can train and test a model as usual. This is done for both the bag of words set and the TF-IDF set in order to test the hypothesis that a TF-IDF performs better than a simple bag of words count. 

In [138]:
from sklearn.model_selection import train_test_split
y = df.price.values-1

# Split into development and test split for bow and tfidf only
X_bow_dev, X_bow_test, X_tfidf_dev, X_tfidf_test, y_dev, y_test = train_test_split(bow_counts, bow_tfidf, y, test_size=0.2, stratify=y)

# Split development into training and validation for bow and tfidf only
X_bow_train, X_bow_val, X_tfidf_train, X_tfidf_val, y_train, y_val = train_test_split(X_bow_dev, X_tfidf_dev, y_dev, test_size=0.1, stratify=y_dev)

In [139]:
# Split into development and test split for bow and tfidf as well as the other features
X_bow_f_dev, X_bow_f_test, X_tfidf_f_dev, X_tfidf_f_test, y_f_dev, y_f_test = train_test_split(bow_counts_full, bow_tfidf_full, y, test_size=0.2, stratify=y)

# Split development into training and validation for bow and tfidf as well as the other features
X_bow_f_train, X_bow_f_val, X_tfidf_f_train, X_tfidf_f_val, y_f_train, y_f_val = train_test_split(X_bow_f_dev, X_tfidf_f_dev, y_dev, test_size=0.1, stratify=y_dev)

In [101]:
# Function that returns performance of classifier 
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
def classification_metrics(header_text, clf, X_train, X_test, y_train, y_test):
    print("="*35)
    print(" "*int((35-len(header_text))/2), header_text)
    print("="*35)
    y_pred = clf.predict(X_train)
    print("Train Accuracy:", accuracy_score(y_train, y_pred))
    y_pred = clf.predict(X_test)
    print("Val Accuracy:", accuracy_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred, average="weighted"))
    print(confusion_matrix(y_test, y_pred))

First a simple dummy classifier was tested using the bag of word counts. It predicts the class of the observation from the most frequent class in the set. This performs about as well as expected with a accuracy of 55.82% for both training and validation. This is also the case when adding the additional features as the most frequent class in the observed y does not change with additional features.

In [140]:
# Dummy classifier predicting most frequent class
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_bow_train, y_train)
y_pred = dummy.predict(X_bow_val)
classification_metrics("Dummy classifier", dummy, X_bow_train, X_bow_val, y_train, y_val)

          Dummy classifier
Train Accuracy: 0.5582465277777777
Val Accuracy: 0.558203125
F1 Score: 0.3999359566934069
[[   0 1382    0]
 [   0 4287    0]
 [   0 2011    0]]


In [141]:
# Dummy classifier predicting most frequent class w/ all features
dummy.fit(X_bow_f_train, y_f_train)
y_pred = dummy.predict(X_bow_f_val)
classification_metrics("Dummy classifier", dummy, X_bow_f_train, X_bow_f_val, y_f_train, y_f_val)

          Dummy classifier
Train Accuracy: 0.5582465277777777
Val Accuracy: 0.558203125
F1 Score: 0.3999359566934069
[[   0 1382    0]
 [   0 4287    0]
 [   0 2011    0]]


From the dummy classifier, the next step to test was the TF-IDF data. This was done using a logistic regression as that provides a discrete outcome. This performs significantly better than the dummy classifier with an accuracy of 65% on the validation set. When adding the extra features troubles arise

In [142]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver="saga")
lr.fit(X_tfidf_train, y_train)
classification_metrics("LogisticRegression tf-idf", lr, X_tfidf_train, X_tfidf_val, y_train, y_val)

      LogisticRegression tf-idf
Train Accuracy: 0.7212962962962963
Val Accuracy: 0.6486979166666667
F1 Score: 0.6280045725482722
[[ 481  852   49]
 [ 230 3621  436]
 [  32 1099  880]]


In [144]:
lr = LogisticRegression(solver="saga", max_iter=2000)
lr.fit(X_tfidf_f_train, y_f_train)
classification_metrics("LogisticRegression tf-idf", lr, X_tfidf_f_train, X_tfidf_f_val, y_f_train, y_f_val)

      LogisticRegression tf-idf
Train Accuracy: 0.5582465277777777
Val Accuracy: 0.558203125
F1 Score: 0.3999359566934069
[[   0 1382    0]
 [   0 4287    0]
 [   0 2011    0]]




Finally, xgboost is employed to see what a Gradient Boosting can do to classify the price points of the establishments. 

In [145]:
import xgboost
xgb = xgboost.XGBClassifier(eval_metric="mlogloss", use_label_encoder=False)
xgb.fit(X_tfidf_train, y_train)
classification_metrics("XGBoost tf-idf", xgb, X_tfidf_train, X_tfidf_val, y_train, y_val)

           XGBoost tf-idf
Train Accuracy: 0.5932002314814815
Val Accuracy: 0.5876302083333333
F1 Score: 0.4814103155596096
[[ 144 1231    7]
 [  36 4182   69]
 [   2 1822  187]]


In [146]:
xgb.fit(X_tfidf_f_train, y_f_train)
classification_metrics("XGBoost tf-idf", xgb, X_tfidf_f_train, X_tfidf_f_val, y_f_train, y_f_val)

           XGBoost tf-idf
Train Accuracy: 0.5586805555555555
Val Accuracy: 0.5579427083333334
F1 Score: 0.399816195674885
[[   0 1382    0]
 [   0 4285    2]
 [   0 2011    0]]
