In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the four '*_transformed' train/test CSVs in one pass. Every file carries an
# 'Unnamed: 0' column (presumably a saved index — verify), which is dropped on load.
X_train, y_train, X_test, y_test = [
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in (
        '../X_train_transformed.csv',
        '../y_train_transformed.csv',
        '../X_test_transformed.csv',
        '../y_test_transformed.csv',
    )
]

In [3]:
# Ridge regression: tune the penalty `alpha` with 5-fold cross-validation,
# then report RMSE on the held-out test split.
ridge = Ridge()

# Candidate penalties 0.5, 1.0, ..., 19.5.
# NOTE(review): the recorded run selected alpha=19.5, the largest candidate, so the
# optimum may lie beyond this range — consider widening the grid.
parameters = {'alpha': np.arange(0.5, 20, 0.5)}

gs = GridSearchCV(ridge, parameters, scoring='neg_root_mean_squared_error', cv=5)

# fit the grid search
gs.fit(X_train, y_train)

# best CV result (the score is a negated RMSE, so values closer to 0 are better)
print('grid search best score: ', gs.best_score_)
print('grid search best params: ', gs.best_params_)

# best model: GridSearchCV (refit=True by default) has already refit this estimator
# on all of X_train, so no additional .fit() call is needed.
best_model = gs.best_estimator_

# np.sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases no longer
# accept. For a single target column the value is identical.
# NOTE(review): assumes y has one target column — confirm.
print('rmse ridge:', np.sqrt(mean_squared_error(y_test, best_model.predict(X_test))))


grid search best score:  -0.04918027496566302
grid search best params:  {'alpha': 19.5}
rmse ridge: 0.047742447006479914


In [4]:
# Lasso regression: tune the penalty `alpha` with 5-fold cross-validation,
# then report RMSE on the held-out test split.
lasso = Lasso()

# Candidate penalties 0.5, 1.0, ..., 19.5.
# NOTE(review): the recorded run selected alpha=0.5, the smallest candidate, so
# penalties below 0.5 are probably worth searching — consider extending the grid downward.
parameters = {'alpha': np.arange(0.5, 20, 0.5)}

gs = GridSearchCV(lasso, parameters, scoring='neg_root_mean_squared_error', cv=5)

# fit the grid search
gs.fit(X_train, y_train)

# best CV result (the score is a negated RMSE, so values closer to 0 are better)
print('grid search best score: ', gs.best_score_)
print('grid search best params: ', gs.best_params_)

# best model: GridSearchCV (refit=True by default) has already refit this estimator
# on all of X_train, so no additional .fit() call is needed.
best_model = gs.best_estimator_

# np.sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases no longer
# accept. For a single target column the value is identical.
# NOTE(review): assumes y has one target column — confirm.
print('rmse lasso:', np.sqrt(mean_squared_error(y_test, best_model.predict(X_test))))


grid search best score:  -0.08600141924757151
grid search best params:  {'alpha': 0.5}
rmse lasso: 0.08711175580305142


In [5]:
# Random forest: grid-search tree count, depth and split size with 5-fold
# cross-validation, then report RMSE on the held-out test split.
# A fixed random_state makes the search and the final score reproducible across
# runs; without it every execution trains different forests.
random_forest = RandomForestRegressor(random_state=42)
parameters = {'n_estimators': [10, 25, 50, 100],
              'max_depth': [3, 5, 7],
              'min_samples_split': [2, 4, 6]}

gs = GridSearchCV(random_forest, parameters, scoring='neg_root_mean_squared_error', cv=5)

# fit the grid search
gs.fit(X_train, y_train)

# best CV result (the score is a negated RMSE, so values closer to 0 are better)
print('grid search best score: ', gs.best_score_)
print('grid search best params: ', gs.best_params_)

# best model: GridSearchCV (refit=True by default) has already refit this estimator
# on all of X_train, so no additional .fit() call is needed.
best_model = gs.best_estimator_

# np.sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases no longer
# accept. For a single target column the value is identical.
# NOTE(review): assumes y has one target column — confirm.
print('rmse random_forest:', np.sqrt(mean_squared_error(y_test, best_model.predict(X_test))))


grid search best score:  -0.048157958520686765
grid search best params:  {'max_depth': 7, 'min_samples_split': 4, 'n_estimators': 50}
rmse random_forest: 0.047426358690265985


In [6]:
# k-nearest-neighbours regression: tune `n_neighbors` with 5-fold cross-validation,
# then report RMSE on the held-out test split.
knn = KNeighborsRegressor()

# Candidate neighbourhood sizes 1, 2, ..., 19.
parameters = {'n_neighbors': np.arange(1, 20, 1)}

gs = GridSearchCV(knn, parameters, scoring='neg_root_mean_squared_error', cv=5)

# fit the grid search
gs.fit(X_train, y_train)

# best CV result (the score is a negated RMSE, so values closer to 0 are better)
print('grid search best score: ', gs.best_score_)
print('grid search best params: ', gs.best_params_)

# best model: GridSearchCV (refit=True by default) has already refit this estimator
# on all of X_train, so no additional .fit() call is needed.
best_model = gs.best_estimator_

# np.sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases no longer
# accept. For a single target column the value is identical.
# NOTE(review): assumes y has one target column — confirm.
print('rmse knn:', np.sqrt(mean_squared_error(y_test, best_model.predict(X_test))))


grid search best score:  -0.05589610623237947
grid search best params:  {'n_neighbors': 11}
rmse knn: 0.05575787155876405


In [7]:
# Hyper-parameter candidates for CatBoost:
# 2 learning rates x 3 tree depths x 5 L2 leaf-regularisation strengths.
grid = dict(
    learning_rate=[0.03, 0.1],
    depth=[2, 4, 6],
    l2_leaf_reg=[1, 3, 5, 7, 9],
)

cat = CatBoostRegressor(verbose=False)

# Run CatBoost's built-in grid search on the training split; the returned
# object is kept so the selected parameters can be inspected afterwards.
grid_search_result = cat.grid_search(
    grid,
    X=X_train,
    y=y_train,
    verbose=0,
)


bestTest = 0.04601301569
bestIteration = 990


bestTest = 0.04565595762
bestIteration = 833


bestTest = 0.04611859442
bestIteration = 998


bestTest = 0.04575274661
bestIteration = 679


bestTest = 0.04602055123
bestIteration = 999


bestTest = 0.04574837722
bestIteration = 583


bestTest = 0.04605584878
bestIteration = 998


bestTest = 0.04562801007
bestIteration = 988


bestTest = 0.04607403297
bestIteration = 999


bestTest = 0.04579990102
bestIteration = 879


bestTest = 0.04514004563
bestIteration = 983


bestTest = 0.04515606051
bestIteration = 449


bestTest = 0.04517759413
bestIteration = 996


bestTest = 0.044883324
bestIteration = 711


bestTest = 0.04523288025
bestIteration = 999


bestTest = 0.04501891504
bestIteration = 675


bestTest = 0.04523162759
bestIteration = 995


bestTest = 0.04502696332
bestIteration = 502


bestTest = 0.0451170157
bestIteration = 996


bestTest = 0.04502805806
bestIteration = 623


bestTest = 0.04453793346
bestIteration = 809


bestTest = 0.04

In [9]:
# Display the hyper-parameter combination stored under 'params' in the CatBoost grid-search result.
grid_search_result['params']

{'depth': 6, 'l2_leaf_reg': 5, 'learning_rate': 0.1}

In [10]:
# Final CatBoost model: train on the full training split with the hyper-parameters
# selected by the grid search, then report RMSE on the held-out test split.
# The parameters are read from `grid_search_result` rather than copied by hand, so
# this cell cannot drift out of sync if the search is re-run with a different outcome.
best_cat_params = grid_search_result['params']
cat = CatBoostRegressor(verbose=False, **best_cat_params)
cat.fit(X_train, y_train)

# np.sqrt(MSE) replaces `squared=False`, which recent scikit-learn releases no longer
# accept. For a single target column the value is identical.
# NOTE(review): assumes y has one target column — confirm.
print('rmse cat:', np.sqrt(mean_squared_error(y_test, cat.predict(X_test))))


rmse cat: 0.046672229011035626
