## 4. evaluating a ML model

three ways to evaluate

1.estimator `score` method

2.the `scoring` parameter

3.problem specific metric function

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [3]:
hd = pd.read_csv('heart-disease.csv')
hd.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [8]:
from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

X = hd.drop('target', axis = 1)
y = hd['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

clf = RandomForestClassifier(n_estimators = 100)

clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### 4.1 evaluating with a score method

In [9]:
clf.score(X_test, y_test)

0.8524590163934426

### do the same for regressor

In [10]:
#import boston housing dataset
from sklearn.datasets import load_boston
boston = load_boston()
boston_df = pd.DataFrame(boston["data"], columns = boston["feature_names"])
boston_df["target"] = pd.Series(boston["target"])
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [12]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = boston_df.drop('target', axis = 1)
y = boston_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

reg = RandomForestRegressor(n_estimators = 100)

reg.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [13]:
reg.score(X_test, y_test)

0.873969014117403

### 4.2 evaluating the model using the  scoring parameter

In [15]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier
np.random.seed(42)

X = hd.drop('target', axis = 1)
y = hd['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

clf = RandomForestClassifier(n_estimators = 100)

clf.fit(X_train, y_train);

In [16]:
clf.score(X_test, y_test)

0.8524590163934426

In [20]:
cross_val_score(clf, X, y, cv=5)

array([0.83606557, 0.90163934, 0.81967213, 0.83333333, 0.78333333])

#### SO~~~ what is cross validation

CV will split the dataset into different version, for exp, test size 0.2 will use 1/5 of the total data

to run the test, and CV will use different part of the data to construst the test dataset, if cv = 5

the there will be 5 sets of diffrent test data

In [22]:
np.random.seed(42)

# single training and test split score
clf_single_score = clf.score(X_test, y_test)

# take mean of 5-fond CV score
clf_cv_score = np.mean(cross_val_score(clf,X,y,cv = 5))

#compare the two
clf_single_score, clf_cv_score

(0.8524590163934426, 0.8248087431693989)

In [24]:
# scoring parameter set to None by default, and when it's set to None, 
#it's going to use the default metric of the classifier
#which is the mean accuracy on the given test data and labels.
cross_val_score(clf, X, y, cv=5, scoring = None)

array([0.81967213, 0.86885246, 0.80327869, 0.76666667, 0.8       ])