In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the dataset
data = pd.read_csv('cleaned_inspection_data.csv')

# dont need this col anymore
data = data.drop(columns=['camis'])

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195240 entries, 0 to 195239
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   score                195240 non-null  float64
 1   inspection_date      195240 non-null  object 
 2   inspection_type      195240 non-null  object 
 3   latitude             195240 non-null  float64
 4   longitude            195240 non-null  float64
 5   critical_flag        195240 non-null  object 
 6   violation_code       195240 non-null  object 
 7   action               195240 non-null  object 
 8   cuisine_description  195240 non-null  object 
 9   zipcode              195240 non-null  float64
 10  boro                 195240 non-null  object 
dtypes: float64(4), object(7)
memory usage: 16.4+ MB


## `train test split`

In [4]:
# split features and target
features_data = data.drop(columns='score')
target_data = data['score']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features_data, target_data, test_size=0.2, random_state=42)

for i in (X_train, X_test, y_train, y_test):
    print(i.shape)

(156192, 10)
(39048, 10)
(156192,)
(39048,)


## `Baseline Model` 

#### Always predicts the mean Score

In [5]:
# get the mean
mean_score = y_train.mean()
print(f'{mean_score = }')

# Create a baseline model function
def baseline_predict(data):
    return [mean_score] * len(data)

mean_score = 22.896800092194223


In [6]:
# "fit model"
y_pred_bl = baseline_predict(X_test)

# test model performance
mse_bl = mean_squared_error(y_test, y_pred_bl)
mae_bl = mean_absolute_error(y_test, y_pred_bl)
r2_bl = r2_score(y_test, y_pred_bl)

print('Mean Squared Error =', round(mse_bl, 4))
print('Mean Absolute Error =', round(mae_bl, 4))
print('R-Squared =', r2_bl)

Mean Squared Error = 305.3882
Mean Absolute Error = 13.1327
R-Squared = -2.8488382119995492e-06


# create preprocessor for sklearn pipeline

In [36]:
# Identify categorical and numerical columns 
categorical_features = ['boro',
                        'inspection_date', 
                        'inspection_type', 
                        'critical_flag', 
                        'violation_code', 
                        'action', 
                        'cuisine_description', 
                        'zipcode']

numerical_features = ['latitude', 'longitude']

# Preprocessing for numerical data: scaling
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: one-hot encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])



# `Simple Linear Regression`

In [37]:
from sklearn.linear_model import LinearRegression

In [51]:
# Define the model
model_LR = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

# Train the model
model_LR.fit(X_train, y_train)

# Predict on the test set
y_pred_LR = model_LR.predict(X_test)

In [52]:
# Evaluate the model
mse_LR = mean_squared_error(y_test, y_pred_LR)
mae_LR = mean_absolute_error(y_test, y_pred_LR)
r2_LR = r2_score(y_test, y_pred_LR)

print(f'{mse_LR = }')
print(f'{mae_LR = }')
print(f'{r2_LR = }')

mse_LR = 217.29595403769005
mae_LR = 10.963601201326115
r2_LR = 0.2884579455014856


# `Random Forest Regressor`

In [34]:
# Define the model
model_RF = Pipeline(
    steps=[('preprocessor', preprocessor),
           ('regressor', RandomForestRegressor(
                n_jobs = -1, # set num CPU threads to use, set to -1 to use all available
                # max_depth=10,
                # n_estimators=100,  
                # max_features='sqrt', 
                random_state=42
                )
            )
    ])

# Train the model
model_RF.fit(X_train, y_train)

In [53]:
# Predict on the test set
y_pred_RF = model_RF.predict(X_test)

# Evaluate the model
mse_RF = mean_squared_error(y_test, y_pred_RF)
mae_RF = mean_absolute_error(y_test, y_pred_RF)
r2_RF = r2_score(y_test, y_pred_RF)

print(f'{mse_RF = }')
print(f'{mae_RF = }')
print(f'{r2_RF = }')

mse_RF = 116.66030728507538
mae_RF = 7.395950787389433
r2_RF = 0.6179923593531215


NOTE:
mae of 7 is much better than baseline of 13, 

# `Gradient Boosting Regressor`

In [49]:
model_GB = Pipeline(
    steps=[('preprocessor', preprocessor),
           ('regressor', GradientBoostingRegressor(
                loss = 'absolute_error',
                learning_rate = .001,
                random_state = 1
                )
            )
    ])

# Train the model
model_GB.fit(X_train, y_train)

In [50]:
# Predict on the test set
y_pred_GB = model_GB.predict(X_test)

# Evaluate the model
mse_GB = mean_squared_error(y_test, y_pred_GB)
mae_GB = mean_absolute_error(y_test, y_pred_GB)
r2_GB = r2_score(y_test, y_pred_GB)

print(f'{mse_GB = }')
print(f'{mae_GB = }')
print(f'{r2_GB = }')

mse_GB = 308.1638327905237
mae_GB = 12.58755191127824
r2_GB = -0.00909162196307256
