## Standard imports

In [54]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore") # ignore, default

We'll use 2 datasets for demonstration purposes:
* heart_disease - a classification dataset (predicting whether someone has heart disease or not)
* boston_df - a regression dataset (predicting the median house prices of towns in the Boston area)

In [55]:
from sklearn.datasets import load_boston

# Classification dataset, read into a DataFrame directly from the hosted CSV
heart_disease = pd.read_csv(
    'https://raw.githubusercontent.com/jeong-wooseok/zero-to-mastery-ml/master/data/heart-disease.csv'
)

# Regression dataset: the loader's 'data' / 'feature_names' entries become the
# feature columns and its 'target' entry is appended as the last column.
# NOTE(review): load_boston is deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version before upgrading the environment.
boston = load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    target=pd.Series(boston.target)
)

In [56]:
# Display the regression DataFrame (pandas abbreviates the middle rows)
boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


## 1. Get the data ready

In [57]:
# Separate the label column (y) from the remaining feature columns (X)
y = heart_disease['target']
X = heart_disease.drop(columns='target')

In [58]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split

# Example use case (requires X & y).
# random_state pins the shuffle so the same rows land in the train and test
# sets on every Restart & Run All; without it each run fits and scores on a
# different split and the reported metrics cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## 2. Pick a model / estimator (to suit your problem)
To pick a model we use the Scikit-Learn machine learning map.

In [59]:
# Random forest classifier (for classification problems)
from sklearn.ensemble import RandomForestClassifier
# Instantiate a Random Forest Classifier with its default hyperparameters
clf = RandomForestClassifier()

In [60]:
# Random forest regressor (for regression problems)
from sklearn.ensemble import RandomForestRegressor
# Instantiate a Random Forest Regressor. The call parentheses matter: without
# them the name is bound to the class itself rather than to an estimator
# instance, so methods such as fit() could not be called on it as intended.
Rgs = RandomForestRegressor()

## 3. Fit the model to the data and make a prediction

In [61]:
# Train the classifier on the training split (every estimator exposes fit())
clf.fit(X_train, y_train)

# After fitting, predict() gives one predicted label per test row
y_preds = clf.predict(X_test)

# Classification models can also report a probability per class for each row
y_probs = clf.predict_proba(X_test)

# Show both results together as the cell output
(y_preds, y_probs)

(array([1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1,
        1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 1, 1], dtype=int64),
 array([[0.41, 0.59],
        [0.69, 0.31],
        [0.85, 0.15],
        [0.92, 0.08],
        [0.44, 0.56],
        [0.15, 0.85],
        [0.27, 0.73],
        [0.  , 1.  ],
        [0.35, 0.65],
        [0.  , 1.  ],
        [0.55, 0.45],
        [0.01, 0.99],
        [0.15, 0.85],
        [0.13, 0.87],
        [0.03, 0.97],
        [0.25, 0.75],
        [0.93, 0.07],
        [0.74, 0.26],
        [0.49, 0.51],
        [0.67, 0.33],
        [0.96, 0.04],
        [0.2 , 0.8 ],
        [0.69, 0.31],
        [0.83, 0.17],
        [0.95, 0.05],
        [0.41, 0.59],
        [0.58, 0.42],
        [0.99, 0.01],
        [0.13, 0.87],
        [0.19, 0.81],
        [0.05, 0.95],
        [0.66, 0.34],
        [0.18, 0.82],

## 4. Evaluate the model
Every Scikit-Learn model has a default metric which is accessible through the score() function.

However there are a range of different evaluation metrics you can use depending on the model you're using.

A full list of evaluation metrics can be found in the documentation.

In [62]:
# Default score() metric measured on the same data the model was fitted on
clf.score(X_train,y_train)

1.0

In [63]:
# All models/estimators have a score() function; here it is applied to the
# test split, which was not used for fitting
clf.score(X_test,y_test)

0.8157894736842105

In [21]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Text summary (precision, recall, f1-score, support) of y_preds against y_test
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.76      0.81      0.79        27
           1       0.89      0.86      0.88        49

    accuracy                           0.84        76
   macro avg       0.83      0.84      0.83        76
weighted avg       0.85      0.84      0.84        76



In [22]:
# Confusion matrix comparing the true test labels with the predicted labels
confusion_matrix(y_test,y_preds)

array([[22,  5],
       [ 7, 42]], dtype=int64)

In [23]:
# Accuracy score of the predicted labels against the true test labels
accuracy_score(y_test,y_preds)

0.8421052631578947

In [18]:
# cross_val_score evaluates a model with cross-validation
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation; scoring=None means the default score() metric is used
print(cross_val_score(clf, X, y, cv=5, scoring=None))

# Same 5-fold setup, evaluated with a different scoring method
print(cross_val_score(clf, X, y, cv=5, scoring='precision'))

[0.85245902 0.90163934 0.83606557 0.8        0.76666667]
[0.81081081 0.875      0.83870968 0.84848485 0.76315789]


## 5. Improve a model
Try different values of `n_estimators`.

In [27]:
# Seed NumPy's global RNG before the sweep
np.random.seed(42)

# Fit one forest per estimator count (1, 11, ..., 91) and report test accuracy.
# NOTE: clf is rebound on every pass, so after the loop it refers to the last
# model fitted here, not the classifier created earlier in the notebook.
for n_trees in range(1, 100, 10):
    print(f'Trying model with {n_trees} estimators...')
    clf = RandomForestClassifier(n_estimators=n_trees)
    clf.fit(X_train, y_train)
    print(f'Model accuracy on the test set :{round(clf.score(X_test,y_test)*100,2)}%')
    print()
    

Trying model with 1 estimators...
Model accuracy on the test set :77.63%

Trying model with 11 estimators...
Model accuracy on the test set :77.63%

Trying model with 21 estimators...
Model accuracy on the test set :86.84%

Trying model with 31 estimators...
Model accuracy on the test set :81.58%

Trying model with 41 estimators...
Model accuracy on the test set :86.84%

Trying model with 51 estimators...
Model accuracy on the test set :80.26%

Trying model with 61 estimators...
Model accuracy on the test set :80.26%

Trying model with 71 estimators...
Model accuracy on the test set :85.53%

Trying model with 81 estimators...
Model accuracy on the test set :82.89%

Trying model with 91 estimators...
Model accuracy on the test set :81.58%



## 6. Save a model and load it

In [28]:
# Save the model
import pickle

# The context manager closes the file handle (flushing the pickle to disk)
# even if dump() raises; a bare open() call leaves closing to the garbage
# collector, which can leave the file incomplete or locked.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [29]:
# Load the model
# NOTE: pickle.load can execute arbitrary code, so only unpickle files you
# trust (this file is the one written by the save step in this notebook).
# The context manager guarantees the file handle is closed after reading.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

# Score the restored model on the test split
loaded_model.score(X_test,y_test)

0.8157894736842105

In [41]:
import sklearn
# Print system, Python-dependency and build information for this environment
sklearn.show_versions()


System:
    python: 3.7.10 (default, Feb 26 2021, 13:06:18) [MSC v.1916 64 bit (AMD64)]
executable: C:\Dev\Miniconda\python.exe
   machine: Windows-10-10.0.22000-SP0

Python dependencies:
          pip: 21.1.3
   setuptools: 52.0.0.post20210125
      sklearn: 1.0.2
        numpy: 1.21.5
        scipy: 1.7.3
       Cython: None
       pandas: 1.3.5
   matplotlib: 3.5.1
       joblib: 1.1.0
threadpoolctl: 3.1.0

Built with OpenMP: True


In [50]:
# Show the hyperparameters of the estimator currently bound to clf
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}