In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
from pathlib import Path

#Load the data
review = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def clean_text(df_text_column, data):
    corpus = []
    for i in range(0, len(data)):
        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])
        text = text.lower()
        text = text.split()
        ps = PorterStemmer()
        text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
        text = ' '.join(text)
        corpus.append(text)
    return corpus

X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Creating the pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
rf_pipe.steps
rf_pipe.fit(X_train, y_train)

y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)

from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score

[nltk_data] Downloading package stopwords to /home/kiwi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0.8356789409420988

In [12]:
#　use Kernel SHAP to explain test set predictions
import shap
explainer = shap.TreeExplainer(rf)
# shap_values = explainer.shap_values(X_test)

In [14]:
print(X_test)

993                                      present food aw
859                                    worst food servic
298                                     never dine place
553                        guess mayb went night disgrac
672                         sushi lover avoid place mean
                             ...                        
462                                  im az time new spot
356    sadli gordon ramsey steak place shall sharpli ...
2                                     tasti textur nasti
478                    group claim would handl us beauti
695                               went lunch servic slow
Name: Review, Length: 250, dtype: object
