# Predicting Health Inspection Scores from Google Ratings by Restaurant

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import tensorflow as tf

# Import our input datasets
ave_score_df=pd.read_csv('db_average_health_scores_bucketed.csv', index_col=0)
ave_score_df.drop("ave_insp_score", axis=1, inplace=True)
ave_score_df

Unnamed: 0,facilityId,typeOfFacility,categoryOfFacility,healthScore
0,FA0000009,RESTAURANT 0 TO 100 SEATS,BARS FRATERNAL ORGANIZATIONS,4
1,FA0000010,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5
2,FA0000011,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5
3,FA0000015,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,3
4,FA0000017,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,5
...,...,...,...,...
890,FA0005506,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5
891,FA0005508,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5
892,FA0005510,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5
893,FA0005539,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5


In [2]:
ave_score_df.dtypes

facilityId            object
typeOfFacility        object
categoryOfFacility    object
healthScore            int64
dtype: object

In [3]:
# Import the Google ratings.csv
google_df=pd.read_csv('facility_ratings.csv', index_col=0)
google_df

Unnamed: 0_level_0,google_rating,total_ratings
facilityId,Unnamed: 1_level_1,Unnamed: 2_level_1
FA0001543,4.5,323
FA0002027,4.5,1160
FA0002162,4.1,174
FA0001624,4.3,628
FA0003535,4.3,578
...,...,...
FA0001776,4.3,541
FA0005279,4.4,1110
FA0005162,4.4,1106
FA0001660,4.9,59


In [4]:
# Add Google ratings - going to need to .join
ave_score_ratings=pd.merge(ave_score_df, google_df, on='facilityId', how='outer')
ave_score_ratings

Unnamed: 0,facilityId,typeOfFacility,categoryOfFacility,healthScore,google_rating,total_ratings
0,FA0000009,RESTAURANT 0 TO 100 SEATS,BARS FRATERNAL ORGANIZATIONS,4,4.9,15
1,FA0000010,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,4.7,937
2,FA0000011,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,4.3,119
3,FA0000015,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,3,4.7,232
4,FA0000017,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,5,4.3,109
...,...,...,...,...,...,...
890,FA0005506,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,4.7,422
891,FA0005508,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,3.0,6
892,FA0005510,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,4.9,63
893,FA0005539,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,4.8,669


In [5]:
# Bin the googleRating integers to categories "5"= 5, "4" = 4, "3" = 3, "2" = 2, "1" = 1.
bins=[0, 1, 2, 3, 4.4, 5]
google_rated = ["1", "2", "3", "4", "5"]
ave_score_ratings["googleRating"]=pd.cut(ave_score_ratings["google_rating"], bins, include_lowest=True, labels=google_rated)
ave_score_ratings

Unnamed: 0,facilityId,typeOfFacility,categoryOfFacility,healthScore,google_rating,total_ratings,googleRating
0,FA0000009,RESTAURANT 0 TO 100 SEATS,BARS FRATERNAL ORGANIZATIONS,4,4.9,15,5
1,FA0000010,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,4.7,937,5
2,FA0000011,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,4.3,119,4
3,FA0000015,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,3,4.7,232,5
4,FA0000017,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,5,4.3,109,4
...,...,...,...,...,...,...,...
890,FA0005506,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,4.7,422,5
891,FA0005508,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,3.0,6,3
892,FA0005510,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,4.9,63,5
893,FA0005539,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,4.8,669,5


In [6]:
feature_df= ave_score_ratings.drop(columns=["facilityId", "total_ratings", "google_rating"])
feature_df

Unnamed: 0,typeOfFacility,categoryOfFacility,healthScore,googleRating
0,RESTAURANT 0 TO 100 SEATS,BARS FRATERNAL ORGANIZATIONS,4,5
1,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,5
2,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,4
3,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,3,5
4,RESTAURANT 0 TO 100 SEATS,FULL SERVICE FULL MENU,5,4
...,...,...,...,...
890,RESTAURANT 0 TO 100 SEATS,FULL MENU LIMITED SERVICE,5,5
891,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,3
892,LIMITED FOOD SERVICE CONVENIENCE OTHER,FAST FOOD LIMITED MENU,5,5
893,RESTAURANT 0 TO 100 SEATS,FAST FOOD LIMITED MENU,5,5


In [7]:
#ave_score_df.to_csv('average_scores.csv')

# Machine Learning Model Comparisons

In [8]:
# Preparing our data for ML
# Generate our categorical variable list
fac_cat = feature_df.dtypes[feature_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
feature_df[fac_cat].nunique()

typeOfFacility        9
categoryOfFacility    6
dtype: int64

In [9]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(feature_df[fac_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(fac_cat)
encode_df

Unnamed: 0,typeOfFacility_GROCERY STORE 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI MORE THAN 15000 SQ FT,typeOfFacility_LIMITED FOOD SERVICE CONVENIENCE OTHER,typeOfFacility_MOBILE UNIT FULL FOOD SERVICE,typeOfFacility_NO FEE LICENSE K12 SCHOOLS NON PROFIT,typeOfFacility_RESTAURANT 0 TO 100 SEATS,typeOfFacility_RESTAURANT 101 TO 200 SEATS,typeOfFacility_RESTAURANT MORE THAN 200 SEATS,categoryOfFacility_BARS FRATERNAL ORGANIZATIONS,categoryOfFacility_FAST FOOD LIMITED MENU,categoryOfFacility_FULL MENU LIMITED SERVICE,categoryOfFacility_FULL SERVICE FULL MENU,categoryOfFacility_MOBILE UNITS,categoryOfFacility_RETAIL COMMISSARY
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
891,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
892,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
893,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [10]:
# Merge one-hot encoded features and drop the originals
feature_df = feature_df.merge(encode_df, left_index=True, right_index=True)
feature_df = feature_df.drop(fac_cat,1)
feature_df.head()

  feature_df = feature_df.drop(fac_cat,1)


Unnamed: 0,healthScore,googleRating,typeOfFacility_GROCERY STORE 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI 0 TO 15000 SQ FT,typeOfFacility_GROCERY STORE W DELI MORE THAN 15000 SQ FT,typeOfFacility_LIMITED FOOD SERVICE CONVENIENCE OTHER,typeOfFacility_MOBILE UNIT FULL FOOD SERVICE,typeOfFacility_NO FEE LICENSE K12 SCHOOLS NON PROFIT,typeOfFacility_RESTAURANT 0 TO 100 SEATS,typeOfFacility_RESTAURANT 101 TO 200 SEATS,typeOfFacility_RESTAURANT MORE THAN 200 SEATS,categoryOfFacility_BARS FRATERNAL ORGANIZATIONS,categoryOfFacility_FAST FOOD LIMITED MENU,categoryOfFacility_FULL MENU LIMITED SERVICE,categoryOfFacility_FULL SERVICE FULL MENU,categoryOfFacility_MOBILE UNITS,categoryOfFacility_RETAIL COMMISSARY
0,4,5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,5,5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,5,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [11]:
# Remove Google outcome target from features data
y = feature_df.healthScore
X = feature_df.drop(columns="healthScore")

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [12]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Define the logistic regression model
log_classifier = LogisticRegression(solver="lbfgs",max_iter=200)

# Train the model
log_classifier.fit(X_train,y_train)

# Evaluate the model
y_pred = log_classifier.predict(X_test)
print(f" Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Logistic regression model accuracy: 0.451


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  return f(*args, **kwargs)


In [14]:
# Create the SVM model
svm = SVC(kernel='linear')

# Train the model
svm.fit(X_train, y_train)

# Evaluate the model
y_pred = svm.predict(X_test_scaled)
print(f" SVM model accuracy: {accuracy_score(y_test,y_pred):.3f}")

 SVM model accuracy: 0.451


In [15]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.433


In [16]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
7/7 - 0s - loss: -6.5058e+03 - accuracy: 0.0000e+00 - 113ms/epoch - 16ms/step
Loss: -6505.82275390625, Accuracy: 0.0


In [None]:
# Add the prediction to the dataframe
# ave_score_df["googlePrediction"]= y_pred
# ave_score_df.head()