In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [3]:
# Feature selection (X, y) & Split into xtrain,xtest,ytrain,ytest
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2, random_state=1)

In [4]:
print("Random Forest")
model2 = RandomForestRegressor()
model2.fit(xtrain,ytrain)
print("score:", model2.score(xtest,ytest) * 100)
pred = model2.predict(X)
print("mse:",mean_squared_error(y,pred))
print("mae:",mean_absolute_error(y,pred))

Random Forest
score: 71.62391918777087
mse: 866343368.9482411
mae: 15221.526921482024


In [5]:
forest_score = cross_val_score(model2,X,y,cv=6)
print(forest_score, f"average: {forest_score.mean():.2f}",f"std :{forest_score.std():.2f}")

[0.81862302 0.6964869  0.71435156 0.72747654 0.49508033 0.61765737] average: 0.68 std :0.10


#### Grid search

In [15]:
# we are going to create a dictionary with all the parameter and their value options
RandomForestRegressor?

[1;31mInit signature:[0m
[0mRandomForestRegressor[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'squared_error'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m

In [7]:
params = {
    'n_estimators' : list(range(100,501,100)),
    'criterion': ["squared_error", "absolute_error", "poisson"],
    'max_depth': list(range(5,51,15)),
}
params

{'n_estimators': [100, 200, 300, 400, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 20, 35, 50]}

In [8]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=3)

In [9]:
grid.fit(X,y)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 20, 35, 50],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=3)

In [10]:
gf = pd.DataFrame(grid.cv_results_)

In [11]:
gf.sort_values(by='rank_test_score',inplace=True)
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
22,1.321258,0.019851,0.025701,0.001837,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824635,0.767305,0.615423,0.735787,0.08827,1
23,1.696276,0.032788,0.033379,0.004919,absolute_error,5,400,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824244,0.768724,0.614224,0.73573,0.088857,2
24,2.099109,0.010706,0.047158,0.005667,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.824561,0.766977,0.614059,0.735199,0.088826,3
21,0.95536,0.026075,0.017185,0.000885,absolute_error,5,200,"{'criterion': 'absolute_error', 'max_depth': 5...",0.821285,0.766691,0.615682,0.734552,0.086959,4
20,0.545318,0.030475,0.009106,0.001156,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.821138,0.768917,0.611419,0.733824,0.089141,5
0,0.153435,0.003768,0.013328,0.00377,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.794699,0.7692,0.625611,0.729837,0.07443,6
3,0.661513,0.032064,0.0466,0.004255,squared_error,5,400,"{'criterion': 'squared_error', 'max_depth': 5,...",0.800966,0.768995,0.61353,0.72783,0.081869,7
2,0.463291,0.013918,0.034786,0.004545,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.80242,0.767168,0.613418,0.727669,0.082059,8
4,0.846812,0.027572,0.049507,0.003932,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.801999,0.766816,0.61362,0.727478,0.081781,9
1,0.314399,0.012604,0.023476,0.002883,squared_error,5,200,"{'criterion': 'squared_error', 'max_depth': 5,...",0.797648,0.76632,0.612526,0.725498,0.0809,10


In [12]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5, n_estimators=300)

In [13]:
from joblib import dump

In [14]:
dump(grid.best_estimator_,"house_pricing_model_73.pkl")

['house_pricing_model_73.pkl']