# Improving  a ML model using 3 ways
## 1. By Hand
## 2. By RandomizedSearchCV

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline



# By hand method on Classification Model

In [3]:
heart_disease = pd.read_csv('Data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [4]:
heart_disease.shape

(303, 14)

In [30]:
# Creating a function for evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
np.random.seed(43)

def evaluation(y_true, y_preds):
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    
    metrics_returned = {'Accuracy Score: ': round(accuracy, 2),
                        'Precision: ': round(precision, 2),
                        'recall: ': round(recall, 2),
                        'f1: ': round(f1, 2)}
    
    print(f'The Acuucary is: {accuracy * 100 :.2f}%')
    print(f'Precision Score: {precision * 100 :.2f}')
    print(f'Recall: {recall * 100 :.2f}')
    print(f'F1: {f1 * 100:.2f}')
    
    return metric_returned

In [8]:
# Shuffled the data
heart_disease_shuffled = heart_disease.sample(frac=1)
heart_disease_shuffled.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
218,65,1,0,135,254,0,0,127,0,2.8,1,1,3,0
199,65,1,0,110,248,0,0,158,0,0.6,2,2,1,0
86,68,1,2,118,277,0,1,151,0,1.0,2,1,3,1
130,54,0,2,160,201,0,1,163,0,0.0,2,1,2,1
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3,0


In [9]:
# creating x and y
x = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

# Split this into train, validation and test sets

## setting up limitations
train_split = round(0.7 * len(heart_disease_Shuffled))  # uses the 70% data
valid_split = round(train_split + 0.15 * len(heart_disease_Shuffled)) # uses the 15% data

# split into train, validation and test sets

x_train, y_train = x[:train_split], y[:train_split]
x_valid, y_valid = x[train_split:valid_split], y[train_split:valid_split]
x_test, y_test = x[valid_split: ], y[valid_split: ]





In [10]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier().fit(x_train, y_train)

In [13]:
# make some predictions on valid sets

y_preds = clf.predict(x_valid)


pd.crosstab(y_valid, y_preds)

col_0,0,1
target,Unnamed: 1_level_1,Unnamed: 2_level_1
0,16,7
1,3,19


In [22]:
# evaluate using our evaluation function

Evaluation = evaluation(y_valid, y_preds)

The Acuucary is: 77.78%
Precision Score: 73.08
Recall: 86.36
F1: 79.17


In [23]:
# make some prediction on test sets and checking evaluation
y_preds = clf.predict(x_test)

evaluation(y_test, y_preds)

The Acuucary is: 76.09%
Precision Score: 84.62
Recall: 75.86
F1: 80.00


In [25]:
clf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [26]:
# improving by passing hyperparameter n_estimatior
clf_2 = RandomForestClassifier(n_estimators=200).fit(x_train, y_train)

y_preds = clf_2.predict(x_valid)

evaluation(y_valid, y_preds)

The Acuucary is: 77.78%
Precision Score: 73.08
Recall: 86.36
F1: 79.17


# By Hand method for Regression Model

In [28]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

boston = load_boston()
boston

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
         4.9800e+00],
        [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
         9.1400e+00],
        [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
         4.0300e+00],
        ...,
        [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         5.6400e+00],
        [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
         6.4800e+00],
        [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
         7.8800e+00]]),
 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
        18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
        15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
        13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
        21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
        35.4, 24.7, 3

In [29]:
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_df['target'] = boston['target']
boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [53]:
# Create an evaluation function for regression model

def reg_evaluation(y_true, y_preds):
    r2 = r2_score(y_true, y_preds)
    mse = mean_squared_error(y_true, y_preds)
    mae = mean_absolute_error(y_true, y_preds)
    
    metrics = {'R2 Score: ': round(r2, 2),
               'Mean Aslolute Error: ': round(mae, 2),
               'Mean Squared Error: ': round(mse, 2)}
    
    print(f'R2 Score: {r2 * 100:.2f}%........ Mean Squared Error: {mse * 100 :.2f}...... Mean Absolute Error: {mae * 100 :.2f}')
    
    return metrics
    

In [40]:
# create train, validation and test split sets
x = boston_df.drop('target', axis=1)
y = boston_df['target']

train_split = round( 0.70 * len(boston_df))
valid_split = round(train_split + 0.15 * len(boston_df))

x_train, y_train = x[: train_split], y[: train_split]
x_valid, y_valid = x[train_split:valid_split], y[train_split:valid_split]
x_test, y_test = x[valid_split:], y[valid_split:]

model = RandomForestRegressor().fit(x_train, y_train)


In [41]:
model.predict(x_test)

array([16.466, 24.226, 20.567, 16.799, 17.991, 16.769, 16.551, 15.04 ,
       15.619, 15.659, 15.817, 15.897, 17.282, 16.235, 14.725, 15.424,
       16.643, 17.506, 16.995, 16.273, 19.887, 18.189, 17.341, 32.446,
       18.507, 16.711, 16.306, 17.099, 17.211, 19.199, 18.443, 19.211,
       19.849, 21.128, 20.614, 19.419, 16.449, 15.14 , 17.56 , 19.209,
       17.936, 19.762, 20.055, 31.086, 15.483, 15.523, 16.782, 15.143,
       16.694, 19.512, 21.128, 26.86 , 32.251, 20.975, 20.523, 21.599,
       19.407, 21.395, 17.914, 16.669, 16.242, 18.385, 19.834, 19.562,
       20.909, 19.488, 18.167, 19.925, 20.694, 18.905, 19.848, 26.383,
       21.324, 28.75 , 28.361, 20.841])

In [42]:
len(x), len(y)

(506, 506)

In [43]:
len(boston_df)

506

In [44]:
len(heart_disease)

303

In [54]:
# evaluating using our function

y_preds = model.predict(x_valid)

regression = reg_evaluation(y_valid, y_preds)

R2 Score: 19.20%........ Mean Squared Error: 9023.02...... Mean Absolute Error: 684.82


In [55]:
# evaluation on test sets
y_preds = model.predict(x_test)

reg_evaluation(y_test, y_preds)

R2 Score: 17.70%........ Mean Squared Error: 1707.59...... Mean Absolute Error: 313.44


{'R2 Score: ': 0.18,
 'Mean Aslolute Error: ': 3.13,
 'Mean Squared Error: ': 17.08}

In [56]:
# let's do this again with shuffled data and then evaluate this model

np.random.seed(43)
boston_df_shuffled = boston_df.sample(frac=1)

x = boston_df_shuffled.drop('target', axis=1)
y - boston_df_shuffled['target']

train_split = round(0.70 * len(boston_df_shuffled))
valid_split = round(train_split + 0.15 * len(boston_df_shuffled))

x_train, y_train = x[:train_split], y[:train_split]
x_valid, y_valid = x[train_split:valid_split], y[train_split:valid_split]
x_test, y_test = x[valid_split:], y[valid_split:]

model = RandomForestRegressor().fit(x_train, y_train)

y_preds = model.predict(x_valid)
reg_evaluation(y_valid, y_preds)

R2 Score: -69.61%........ Mean Squared Error: 18939.77...... Mean Absolute Error: 1163.73


{'R2 Score: ': -0.7,
 'Mean Aslolute Error: ': 11.64,
 'Mean Squared Error: ': 189.4}

# Tuning Hyperparameters by Randomized Search CV

## For Classification Problem
## For Regression Problem

*1. Classification Problem


In [3]:
# imports for classification model

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
heart_disease = pd.read_csv('Data/heart-disease.csv')
heart_disease.dtypes, heart_disease.shape, heart_disease.head(8)

(age           int64
 sex           int64
 cp            int64
 trestbps      int64
 chol          int64
 fbs           int64
 restecg       int64
 thalach       int64
 exang         int64
 oldpeak     float64
 slope         int64
 ca            int64
 thal          int64
 target        int64
 dtype: object,
 (303, 14),
    age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
 0   63    1   3       145   233    1        0      150      0      2.3      0   
 1   37    1   2       130   250    0        1      187      0      3.5      0   
 2   41    0   1       130   204    0        0      172      0      1.4      2   
 3   56    1   1       120   236    0        1      178      0      0.8      2   
 4   57    0   0       120   354    0        1      163      1      0.6      2   
 5   57    1   0       140   192    0        1      148      0      0.4      1   
 6   56    0   1       140   294    0        0      153      0      1.3      1   
 7   44    1   1      

In [6]:
# create a function to evaluate the classification model
def clf_evaluation(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    metrics = {'Accuracy:': round(accuracy, 2),
                'Precision:': round(precision, 2),
                'Recall:': round(recall, 2),
                'F1:': round(f1, 2)}
               
    print(f'Accuracy: {accuracy * 100 :.2f}%')
    print(f'Precision: {precision * 100 :.2f}')
    print(f'Recall: {recall * 100 :.2f}')
    print(f'F1: {f1 * 100 :.2f}')
               
    return metrics

In [7]:
heart_disease_shuffled = heart_disease.sample(frac=1)
heart_disease_shuffled.tail()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
47,47,1,2,138,257,0,0,156,0,0.0,2,0,2,1
142,42,0,2,120,209,0,1,173,0,0.0,1,0,2,1
67,45,0,1,130,234,0,0,175,0,0.6,1,0,2,1
265,66,1,0,112,212,0,0,132,1,0.1,2,1,2,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0


In [12]:
x = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier()

rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=100,
                            verbose=2)


In [8]:
clf = RandomForestClassifier()
clf.get_params

<bound method BaseEstimator.get_params of RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)>

In [14]:
# predictions on rs_classification
rs_clf.fit(x_train, y_train)

y_preds_rs = rs_clf.predict(x_test)
y_preds_rs

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   7.1s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   4.8s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   4.3s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   4.6s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=4, max_features=auto, max_depth=30, total=   4.7s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, total=   5.2s
[CV] n_estimators=1200, min_samples_spl

[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, total=   3.7s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, total=   3.9s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, total=   4.0s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, total=   3.9s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=5 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=5, total=   0.0s
[CV] n_estimators=10, min_samples_s

[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, total=   1.8s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, total=   1.9s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, total=   1.8s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, total=   1.7s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None, total=   0.5s
[CV] n_estimators=100, min_samples_split=2, min_

[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   0.7s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None 
[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   0.7s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None 
[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   0.7s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None 
[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   0.9s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=5, total=   0.7s
[CV] n_estimators=200, min_samples_split

[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   4.5s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   3.9s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   3.6s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=2, max_features=sqrt, max_depth=None, total=   4.8s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   0.8s
[CV] n_estimators=200, min_samp

[CV]  n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.4s
[CV] n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.3s
[CV] n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.4s
[CV] n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.4s
[CV] n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30 
[CV]  n_estimators=100, min_samples_split=4, min_samples_leaf=4, max_features=auto, max_depth=30, total=   0.5s
[CV] n_estimators=1000, min_samples_split=4, min_sam

[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=20, total=   0.5s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20, total=   1.0s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20, total=   0.6s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20, total=   0.6s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=20, total=   1.0s
[CV] n_estimators=200, min_samples_split=2, min_samp

[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, total=   5.6s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, total=   5.5s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, total=   4.8s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, total=   4.6s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=20, total=   5.4s
[CV] n_estimators=1200, min_samples_split=6

[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=10, total=   3.9s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None, total=   1.9s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None, total=   1.8s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None, total=   1.7s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=None, total=   1.8s
[CV] n_estimators=500, min_samples_

[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   5.0s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=30, total=   5.0s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=2, max_features=auto, max_depth=5, total=   1.9s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=2, max_features=auto, max_depth=5, total=   2.1s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=2, max_features=auto, max_depth=5, total=   2.1s
[CV] n_estimators=500, min_samples_split=4, min_samples

[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, total=   2.0s
[CV] n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 
[CV]  n_estimators=500, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, total=   2.4s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   3.9s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   4.3s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=2, max_features=sqrt, max_depth=10, total=   5.0s
[CV] n_estimators=1000, min_samples_split=6, m

[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=None, total=   0.0s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20, total=   4.0s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20, total=   4.0s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20, total=   4.1s
[CV] n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=4, min_samples_leaf=1, max_features=auto, max_depth=20, total=   4.4s
[CV] n_estimators=1000, min_samples_split=4

[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=10, total=   0.4s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   4.9s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   4.4s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   4.5s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=5, total=   5.0s
[CV] n_estimators=1200, min_samples_split=2, min_sam

[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 18.8min finished


array([0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0], dtype=int64)

In [15]:
# let's call our evaluation function to check the evaluation
clf_evaluation(y_test, y_preds_rs)

Accuracy: 83.61%
Precision: 83.78
Recall: 88.57
F1: 86.11


{'Accuracy:': 0.84, 'Precision:': 0.84, 'Recall:': 0.89, 'F1:': 0.86}

In [17]:
# Create test, validation and test sets and then check it on that sets

x = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}

train_split = round(0.70 * len(heart_disease_shuffled))
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled))
x_train, y_train = x[:train_split], y[:train_split]
x_valid, y_valid = x[train_split:valid_split], y[train_split:valid_split]
x_test, y_test = x[valid_split:], y[valid_split:]

clf = RandomForestClassifier()

rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=20,
                            cv=6,
                            verbose=3)
rs_clf.fit(x_train, y_train)

Fitting 6 folds for each of 20 candidates, totalling 120 fits
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.750, total=   3.2s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.1s remaining:    0.0s


[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.750, total=   3.8s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.9s remaining:    0.0s


[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.771, total=   4.0s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.886, total=   4.4s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.886, total=   4.1s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.857, total=   4.5s
[CV] n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=20 
[CV]  n_estimators=1000, min_samples_split=6, min_samples_leaf=1, max_features=auto, max_depth=20, score

[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, score=0.886, total=   5.0s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, score=0.886, total=   4.9s
[CV] n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=1200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=None, score=0.914, total=   4.3s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, score=0.694, total=   1.6s
[CV] n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10 
[CV]  n_estimators=500, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10,

[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, score=0.743, total=   0.1s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, score=0.829, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, score=0.857, total=   0.0s
[CV] n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30 
[CV]  n_estimators=10, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=30, score=0.800, total=   0.0s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=10 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.778, total=

[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5, score=0.886, total=   0.7s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5, score=0.886, total=   0.8s
[CV] n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=200, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5, score=0.914, total=   0.8s


[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  4.1min finished


RandomizedSearchCV(cv=6, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

In [18]:
# Evaluation on validation sets
y_preds_rs = rs_clf.predict(x_valid)

# challing our own function
clf_evaluation(y_preds_rs, y_valid);

Accuracy: 80.00%
Precision: 90.91
Recall: 74.07
F1: 81.63


In [19]:
# chekcing this on test sets
y_preds = rs_clf.predict(x_test)

clf_evaluation(y_preds, y_test);

Accuracy: 91.30%
Precision: 96.15
Recall: 89.29
F1: 92.59


## Hyperparameters Tuning for Regression Model

In [20]:
# Basic imports 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# load the boston data for a regression problem
from sklearn.datasets import load_boston

In [21]:
# Getting our data ready
boston = load_boston()
boston_df = pd.DataFrame(boston['data'], columns=boston['feature_names'])
boston_df['target'] = boston['target']
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [22]:
# Create an evaluation function
def reg_evaluation(y_true, y_preds):
    r2 = r2_score(y_true, y_preds)
    mse = mean_squared_error(y_true, y_preds)
    mae = mean_absolute_error(y_true, y_preds)
    
    metrics = {'R2': round(r2, 2),
               'Mean Squared Error': round(mse, 2),
               'Mean Absolute Error': round(mae, 2)}
    print(f'R2 Score: {r2 * 100 :.2f}%')
    print(f'Mean Squared Error: {mse * 100 :.2f}')
    print(f'Mean Absolute Error: {mae * 100 :.2f}')
    
    return metrics

In [23]:
# split into train, validation and test splits

boston_df_shuffled = boston_df.sample(frac=1)

## split into features and labels
x = boston_df_shuffled.drop('target', axis=1)
y = boston_df_shuffled['target']

train_split = round(0.70 * len(boston_df_shuffled))
valid_split = round(train_split + 0.15 * len(boston_df_shuffled))

x_train, y_train = x[:train_split], y[:train_split]
x_valid, y_valid = x[train_split:valid_split], y[train_split:valid_split]
x_test, y_test = x[valid_split:], y[valid_split:]


In [24]:
# setting up grid and model and tuning this with RandomizedSearchCV
grid = {'n_estimators': [10, 100, 200, 500, 1000, 1200],
        'max_depth': [None, 5, 10, 20, 30],
        'max_features': ['auto', 'sqrt'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 4]}
model = RandomForestRegressor()
model_rs = RandomizedSearchCV(estimator=model,
                              param_distributions=grid,
                              n_iter=20,
                              cv=6,
                              verbose=3)

# fit this model
model_rs.fit(x_train, y_train);

Fitting 6 folds for each of 20 candidates, totalling 120 fits
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.812, total=   1.3s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.760, total=   0.7s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.0s remaining:    0.0s


[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.898, total=   0.9s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 
[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.895, total=   0.8s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 
[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.900, total=   0.8s
[CV] n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10 
[CV]  n_estimators=200, min_samples_split=4, min_samples_leaf=1, max_features=sqrt, max_depth=10, score=0.797, total=   1.2s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, score=0.768, t

[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, score=0.891, total=   4.6s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, score=0.909, total=   4.5s
[CV] n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30 
[CV]  n_estimators=1200, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=30, score=0.807, total=   5.5s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=None, score=0.803, total=   1.0s
[CV] n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=None 
[CV]  n_estimators=200, min_samples_split=2, min_samples_leaf=2, max_features=auto, max_depth=None, s

[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, score=0.904, total=   0.4s
[CV] n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10 
[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=2, max_features=sqrt, max_depth=10, score=0.813, total=   0.3s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5, score=0.798, total=   2.2s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5, score=0.755, total=   2.3s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=2, max_features=auto, max_depth=5, score=0.880, total= 

[CV]  n_estimators=100, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=None, score=0.747, total=   0.6s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.789, total=   1.7s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.750, total=   1.8s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.889, total=   1.9s
[CV] n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20 
[CV]  n_estimators=500, min_samples_split=6, min_samples_leaf=1, max_features=sqrt, max_depth=20, score=0.887,

[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:  4.8min finished


In [28]:
# checking the best parameters chosen by RandomizedSearchCV
model_rs.best_params_

{'n_estimators': 1200,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30}

In [29]:
# Evaluate our model on validation sets
y_preds = model_rs.predict(x_valid)

reg_evaluation(y_valid, y_preds);

R2 Score: 85.28%
Mean Squared Error: 960.66
Mean Absolute Error: 211.97


In [30]:
# Evaluate on test sets
y_preds = model_rs.predict(x_test)

reg_evaluation(y_test, y_preds);

R2 Score: 84.55%
Mean Squared Error: 1433.95
Mean Absolute Error: 252.04


In [33]:
# checking the accuracy of model and hyperparameters model on validation set
model.fit(x_test, y_test)

print(f'The Accuracy score on validation set of Regression Model is: {model.score(x_valid, y_valid) * 100 :.2f}%')
print(f'The Accuracy score on Validation of RandomizedSearchCV model is: {model_rs.score(x_valid, y_valid) * 100 :.2f}%')

The Accuracy score on validation set of Regression Model is: 68.25%
The Accuracy score on Validation of RandomizedSearchCV model is: 85.28%


In [34]:
# checking the Accuracy of both models on test sets

print(f'The Accuracy score on Test set of Regression Model is: {model.score(x_test, y_test) * 100 :.2f}%')
print(f'The Accuracy score on Test of RandomizedSearchCV model is: {model_rs.score(x_test, y_test) * 100 :.2f}%')

The Accuracy score on Test set of Regression Model is: 97.68%
The Accuracy score on Test of RandomizedSearchCV model is: 84.55%


In [35]:
'[4]'

'[4]'

In [36]:
[4]

[4]

In [37]:
[]

[]