In [22]:
import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

In [2]:
X_train = pd.read_csv('../X_train_transformed_reduced.csv').drop(columns= ['Unnamed: 0'])
y_train = pd.read_csv('../y_train_transformed_reduced.csv').drop(columns= ['Unnamed: 0'])
X_test = pd.read_csv('../X_test_transformed_reduced.csv').drop(columns= ['Unnamed: 0'])
y_test = pd.read_csv('../y_test_transformed_reduced.csv').drop(columns= ['Unnamed: 0'])

In [18]:
ridge = Ridge()

parameters = {'alpha':np.arange(0.5,20,0.5)}

gs = GridSearchCV(ridge, parameters, scoring='neg_root_mean_squared_error',cv=5)

#fit the grid search
gs.fit(X_train,y_train)

# best estimator
print('grid search best score: ',gs.best_score_)
print('grid search best params: ',gs.best_params_)

# best model
best_model = gs.best_estimator_
best_model.fit(X_train,y_train)
print('rmse ridge:',mean_squared_error(y_test, best_model.predict(X_test), squared=False))


grid search best score:  -0.05015721517461735
grid search best params:  {'alpha': 14.5}
rmse ridge: 0.04930850566748091


In [19]:
lasso = Lasso()
parameters = {'alpha':np.arange(0.5,20,0.5)}

gs = GridSearchCV(lasso, parameters, scoring='neg_root_mean_squared_error',cv=5)

#fit the grid search
gs.fit(X_train,y_train)

# best estimator
print('grid search best score: ',gs.best_score_)
print('grid search best params: ',gs.best_params_)

# best model
best_model = gs.best_estimator_
best_model.fit(X_train,y_train)
print('rmse lasso:',mean_squared_error(y_test, best_model.predict(X_test), squared=False))


grid search best score:  -0.08600141924757151
grid search best params:  {'alpha': 0.5}
rmse lasso: 0.08711175580305142


In [23]:
random_forest = RandomForestRegressor()
parameters = {'n_estimators':[10,25,50,100],
             'max_depth':[3,5,7],
             'min_samples_split':[2,4,6]}

gs = GridSearchCV(random_forest, parameters, scoring='neg_root_mean_squared_error',cv=5)

#fit the grid search
gs.fit(X_train,y_train)

# best estimator
print('grid search best score: ',gs.best_score_)
print('grid search best params: ',gs.best_params_)

# best model
best_model = gs.best_estimator_
best_model.fit(X_train,y_train)
print('rmse random_forest:',mean_squared_error(y_test, best_model.predict(X_test), squared=False))


grid search best score:  -0.049095151580185216
grid search best params:  {'max_depth': 7, 'min_samples_split': 2, 'n_estimators': 100}
rmse random_forest: 0.04839975424079646


In [24]:
knn = KNeighborsRegressor()
parameters = {'n_neighbors':np.arange(1,20,1)}

gs = GridSearchCV(knn, parameters, scoring='neg_root_mean_squared_error',cv=5)

#fit the grid search
gs.fit(X_train,y_train)

# best estimator
print('grid search best score: ',gs.best_score_)
print('grid search best params: ',gs.best_params_)

# best model
best_model = gs.best_estimator_
best_model.fit(X_train,y_train)
print('rmse knn:',mean_squared_error(y_test, best_model.predict(X_test), squared=False))


grid search best score:  -0.05397446341053432
grid search best params:  {'n_neighbors': 9}
rmse knn: 0.05400830756960631


In [32]:
cat = CatBoostRegressor(verbose = False)
grid = {'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6],
        'l2_leaf_reg': [1, 3, 5, 7, 9]}

grid_search_result = cat.grid_search(grid, X=X_train, y=y_train,verbose = 0)


bestTest = 0.04735425746
bestIteration = 993


bestTest = 0.04730757493
bestIteration = 843


bestTest = 0.04737143368
bestIteration = 994


bestTest = 0.04717644716
bestIteration = 988


bestTest = 0.04737190344
bestIteration = 993


bestTest = 0.04718046602
bestIteration = 897


bestTest = 0.04735155588
bestIteration = 989


bestTest = 0.04720876963
bestIteration = 851


bestTest = 0.04738072607
bestIteration = 990


bestTest = 0.0471687939
bestIteration = 890


bestTest = 0.04658606257
bestIteration = 979


bestTest = 0.04655561601
bestIteration = 375


bestTest = 0.04673404023
bestIteration = 946


bestTest = 0.04654428235
bestIteration = 545


bestTest = 0.04675749884
bestIteration = 960


bestTest = 0.04637043255
bestIteration = 530


bestTest = 0.04677596514
bestIteration = 947


bestTest = 0.04650821245
bestIteration = 796


bestTest = 0.0468531715
bestIteration = 994


bestTest = 0.04668290505
bestIteration = 704


bestTest = 0.04599966272
bestIteration = 966


bestTest = 0.0

In [43]:
grid_search_result['params']

{'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.03}

In [44]:
cat = CatBoostRegressor(verbose = False,depth = 6,l2_leaf_reg = 1,learning_rate = 0.03)
cat.fit(X_train,y_train)
print('rmse cat:',mean_squared_error(y_test, cat.predict(X_test), squared=False))


rmse cat: 0.04821004185538346
