# Predicting Yelp Rating from Health Inspection Scores by Restaurant

The first analysis point that the team agreed upon was to see if there is a correlation between the Boulder County Health Inspection Scores and Yelp or Google Ratings for restaurants in Boulder county, Colorado.

The Boulder County Health Inspections Scores were obtained. The features selected for the first analysis were the Health Inspection Score, Facility Type and Facility Category. These features were used to train the model in trying to predict the Yelp Rating per facility. 

The first step in engineering the features for the machine learning model used the filtered dataset to:
* Eliminate all location data so as not to overburden the model
* Average the inspection scores for all routine and regular health inspections by facility
    * This was difficult to eliminate the duplicate rows without losing details (pivot table and merge)
* Bin the averaged health inspection scores to match the Health Department ratings
* Create randomized Yelp Ratings to test the model
* Use Random Forest model as it is fast, simple and flexible 
    * Easy to use during the initial model development process, to see how it performs
    * Provides a good indicator of the importance it assigns to features
    * Limitations include: fast to train, but quite slow to create predictions once they are trained
    
* May need to switch to a neural network, for the second phase which has a lot of different feature types

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import tensorflow as tf

# Import our input datasets
ave_score_df=pd.read_csv('db_average_health_scores_bucketed.csv')

ave_score_df. drop("averageInspectionScore", axis=1, inplace=True)
ave_score_df

Unnamed: 0,facilityId,typeOfFacility,categoryOfFacility,healthScore
0,FA0003323,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5
1,FA0000616,RESTAURANT MORE THAN 200 SEATS,FULL SERVICE FULL MENU,3
2,FA0004494,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,4
3,FA0003893,SPECIAL EVENT,SPECIAL EVENT,5
4,FA0003472,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5
...,...,...,...,...
1216,FA0005424,GROCERY STORE 0 TO 15000 SQ FT,GROCERY FINISHED FOODS,5
1217,FA0005494,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5
1218,FA0005326,NO FEE LICENSE K12 SCHOOLS NON PROFIT,FOOD BANK,5
1219,FA0004973,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5


In [12]:
ave_score_df["facilityId"].nunique()

1221

In [20]:
ave_score_df.dtypes

facilityId            object
typeOfFacility        object
categoryOfFacility    object
healthScore            int64
dtype: object

In [21]:
# Add Google ratings - going to need to .join
ave_score_df["googleRating"]=np.random.randint(1,6, size=len(ave_score_df))
ave_score_df

Unnamed: 0,facilityId,typeOfFacility,categoryOfFacility,healthScore,googleRating
0,FA0003323,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,1
1,FA0000616,RESTAURANT MORE THAN 200 SEATS,FULL SERVICE FULL MENU,3,1
2,FA0004494,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,4,1
3,FA0003893,SPECIAL EVENT,SPECIAL EVENT,5,1
4,FA0003472,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,5
...,...,...,...,...,...
1216,FA0005424,GROCERY STORE 0 TO 15000 SQ FT,GROCERY FINISHED FOODS,5,2
1217,FA0005494,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,2
1218,FA0005326,NO FEE LICENSE K12 SCHOOLS NON PROFIT,FOOD BANK,5,2
1219,FA0004973,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,5


In [22]:
# Bin the googleRating integers to categories "5"= 5, "4" = 4, "3" = 3, "2" = 2, "1" = 1.
bins=[0, 1, 2, 3, 4, 5]
google_ratings = ["1", "2", "3", "4", "5"]
ave_score_df["googleRating"]=pd.cut(ave_score_df["googleRating"], bins, include_lowest=True, labels=google_ratings)
ave_score_df

Unnamed: 0,facilityId,typeOfFacility,categoryOfFacility,healthScore,googleRating
0,FA0003323,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,1
1,FA0000616,RESTAURANT MORE THAN 200 SEATS,FULL SERVICE FULL MENU,3,1
2,FA0004494,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,4,1
3,FA0003893,SPECIAL EVENT,SPECIAL EVENT,5,1
4,FA0003472,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,5
...,...,...,...,...,...
1216,FA0005424,GROCERY STORE 0 TO 15000 SQ FT,GROCERY FINISHED FOODS,5,2
1217,FA0005494,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,2
1218,FA0005326,NO FEE LICENSE K12 SCHOOLS NON PROFIT,FOOD BANK,5,2
1219,FA0004973,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,5


In [23]:
feature_df= ave_score_df.drop(columns=["facilityId"], axis=1)
feature_df

Unnamed: 0,typeOfFacility,categoryOfFacility,healthScore,googleRating
0,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,1
1,RESTAURANT MORE THAN 200 SEATS,FULL SERVICE FULL MENU,3,1
2,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,4,1
3,SPECIAL EVENT,SPECIAL EVENT,5,1
4,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,5
...,...,...,...,...
1216,GROCERY STORE 0 TO 15000 SQ FT,GROCERY FINISHED FOODS,5,2
1217,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,2
1218,NO FEE LICENSE K12 SCHOOLS NON PROFIT,FOOD BANK,5,2
1219,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,5


In [11]:
#ave_score_df.to_csv('average_scores.csv')

# Machine Learning Model Comparisons

In [12]:
# Preparing our data for ML
# Generate our categorical variable list
fac_cat = feature_df.dtypes[feature_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
feature_df[fac_cat].nunique()

typeOfFacility        13
categoryOfFacility    13
dtype: int64

In [13]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(feature_df[fac_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(fac_cat)
encode_df

Unnamed: 0,typeOfFacility_GROCERY STORE MORE THAN 15000 SQ FT,typeOfFacility_GROCERY STORE 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI MORE THAN 15000 SQ FT,typeOfFacility_LIMITED FOOD SERVICE CONVENIENCE OTHER,typeOfFacility_MOBILE UNIT FULL FOOD SERVICE,typeOfFacility_MOBILE UNIT PREPACKAGED,typeOfFacility_NO FEE LICENSE K12 SCHOOLS NON PROFIT,typeOfFacility_RESTAURANT 0 TO 100 SEATS,typeOfFacility_RESTAURANT 101 TO 200 SEATS,...,categoryOfFacility_FOOD BANK,categoryOfFacility_FULL MENU LIMITED SERVICE,categoryOfFacility_FULL SERVICE FULL MENU,categoryOfFacility_GROCERY FINISHED FOODS,categoryOfFacility_MOBILE UNITS,categoryOfFacility_PRE PACKAGED,categoryOfFacility_RESIDENTIAL FACILITIES,categoryOfFacility_RETAIL COMMISSARY,categoryOfFacility_SPECIAL EVENT,categoryOfFacility_TEMPORARY EVENTS
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1216,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1218,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Merge one-hot encoded features and drop the originals
feature_df = feature_df.merge(encode_df, left_index=True, right_index=True)
feature_df = feature_df.drop(fac_cat,1)
feature_df.head()

  feature_df = feature_df.drop(fac_cat,1)


Unnamed: 0,healthScore,yelpRating,typeOfFacility_GROCERY STORE MORE THAN 15000 SQ FT,typeOfFacility_GROCERY STORE 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI MORE THAN 15000 SQ FT,typeOfFacility_LIMITED FOOD SERVICE CONVENIENCE OTHER,typeOfFacility_MOBILE UNIT FULL FOOD SERVICE,typeOfFacility_MOBILE UNIT PREPACKAGED,typeOfFacility_NO FEE LICENSE K12 SCHOOLS NON PROFIT,...,categoryOfFacility_FOOD BANK,categoryOfFacility_FULL MENU LIMITED SERVICE,categoryOfFacility_FULL SERVICE FULL MENU,categoryOfFacility_GROCERY FINISHED FOODS,categoryOfFacility_MOBILE UNITS,categoryOfFacility_PRE PACKAGED,categoryOfFacility_RESIDENTIAL FACILITIES,categoryOfFacility_RETAIL COMMISSARY,categoryOfFacility_SPECIAL EVENT,categoryOfFacility_TEMPORARY EVENTS
0,5,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,5,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Remove Google outcome target from features data
y = feature_df.googleRating
X = feature_df.drop(columns="googleRating")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [16]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [17]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.212


In [None]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [None]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

In [None]:
# Add the prediction to the dataframe
# ave_score_df["yelpPrediction"]= y_pred
# ave_score_df.head()