In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/digipodium/Datasets/main/house_pricing.csv")
df.info()
df.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 814 entries, 0 to 813
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   City        814 non-null    object 
 1   Type        814 non-null    object 
 2   Beds        814 non-null    int64  
 3   Baths       814 non-null    int64  
 4   SquareFeet  814 non-null    int64  
 5   Price       814 non-null    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 38.3+ KB


Unnamed: 0,City,Type,Beds,Baths,SquareFeet,Price
0,SACRAMENTO,Residential,2,1,836,138159.85
1,SACRAMENTO,Residential,3,1,1167,167541.46


In [3]:
# Feature selection (X, y) & Split into xtrain,xtest,ytrain,ytest
X = df[['Beds','Baths','SquareFeet']]
y = df['Price']
xtrain,xtest,ytrain,ytest = train_test_split(X,y,test_size=.2, random_state=1)

In [4]:
print("Random Forest")
model2 = RandomForestRegressor()
model2.fit(xtrain,ytrain)
print("score:", model2.score(xtest,ytest) * 100)
pred = model2.predict(X)
print("mse:",mean_squared_error(y,pred))
print("mae:",mean_absolute_error(y,pred))

Random Forest
score: 72.51121232703139
mse: 851378491.5324749
mae: 15020.330110296012


In [5]:
forest_score = cross_val_score(model2,X,y,cv=6)
print(forest_score, f"average: {forest_score.mean():.2f}",f"std :{forest_score.std():.2f}")

[0.82342232 0.69673354 0.71594594 0.72996641 0.48924971 0.61877197] average: 0.68 std :0.10


#### Grid search

In [6]:
# we are going to create a dictionary with all the parameer and their value options
RandomForestRegressor?

[1;31mInit signature:[0m
[0mRandomForestRegressor[0m[1;33m([0m[1;33m
[0m    [0mn_estimators[0m[1;33m=[0m[1;36m100[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mcriterion[0m[1;33m=[0m[1;34m'squared_error'[0m[1;33m,[0m[1;33m
[0m    [0mmax_depth[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_split[0m[1;33m=[0m[1;36m2[0m[1;33m,[0m[1;33m
[0m    [0mmin_samples_leaf[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m[1;33m
[0m    [0mmin_weight_fraction_leaf[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mmax_features[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mmax_leaf_nodes[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mmin_impurity_decrease[0m[1;33m=[0m[1;36m0.0[0m[1;33m,[0m[1;33m
[0m    [0mbootstrap[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0moob_score[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m

In [7]:
params = {
    'n_estimators' : list(range(100,501,100)),
    'criterion': ["squared_error", "absolute_error", "poisson"],
    'max_depth': list(range(5,51,15)),
}
params

{'n_estimators': [100, 200, 300, 400, 500],
 'criterion': ['squared_error', 'absolute_error', 'poisson'],
 'max_depth': [5, 20, 35, 50]}

In [8]:
grid = GridSearchCV(estimator=RandomForestRegressor(),param_grid=params,cv=3,n_jobs=-1,verbose=3)

In [9]:
grid.fit(X,y)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'criterion': ['squared_error', 'absolute_error',
                                       'poisson'],
                         'max_depth': [5, 20, 35, 50],
                         'n_estimators': [100, 200, 300, 400, 500]},
             verbose=3)

In [10]:
gf = pd.DataFrame(grid.cv_results_)

In [11]:
gf.sort_values(by='rank_test_score',inplace=True)
gf

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
20,1.012813,0.01962,0.026523,0.005991,absolute_error,5,100,"{'criterion': 'absolute_error', 'max_depth': 5...",0.82607,0.767864,0.612239,0.735391,0.090266,1
22,3.088422,0.042172,0.059432,0.007507,absolute_error,5,300,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823905,0.76779,0.614332,0.735343,0.088581,2
24,5.112713,0.060127,0.109464,0.005672,absolute_error,5,500,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823937,0.766405,0.613971,0.734771,0.088589,3
23,4.095582,0.022179,0.084017,0.002709,absolute_error,5,400,"{'criterion': 'absolute_error', 'max_depth': 5...",0.823662,0.766941,0.613213,0.734605,0.088906,4
21,2.021134,0.015032,0.037368,0.003517,absolute_error,5,200,"{'criterion': 'absolute_error', 'max_depth': 5...",0.821944,0.764764,0.616701,0.73447,0.086485,5
1,0.328383,0.00325,0.034206,0.017298,squared_error,5,200,"{'criterion': 'squared_error', 'max_depth': 5,...",0.803586,0.769613,0.618893,0.730698,0.080265,6
0,0.155153,0.004086,0.016621,6.5e-05,squared_error,5,100,"{'criterion': 'squared_error', 'max_depth': 5,...",0.800641,0.765043,0.624529,0.730071,0.076031,7
2,0.541227,0.013261,0.033145,0.00394,squared_error,5,300,"{'criterion': 'squared_error', 'max_depth': 5,...",0.799904,0.767002,0.618459,0.728455,0.07893,8
3,0.685433,0.012118,0.047521,0.005722,squared_error,5,400,"{'criterion': 'squared_error', 'max_depth': 5,...",0.803218,0.765779,0.613669,0.727556,0.081968,9
4,0.785128,0.016114,0.052771,0.003663,squared_error,5,500,"{'criterion': 'squared_error', 'max_depth': 5,...",0.800755,0.767255,0.614356,0.727455,0.081134,10


In [12]:
grid.best_estimator_

RandomForestRegressor(criterion='absolute_error', max_depth=5)