# Module 7: Assessment & Measurements

### Introducing Evaluation Metrics

In [87]:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Credit-card transactions used throughout the classification sections.
CC_DATA_URL = (
    "https://raw.githubusercontent.com/nsethi31/"
    "Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv"
)
cc_df = pd.read_csv(CC_DATA_URL, encoding="latin-1")

In [6]:
# Hold out 30% of the transactions as the test set; the seed keeps the split reproducible.
train_df, test_df = train_test_split(
    cc_df,
    test_size=0.3,
    random_state=111,
)
train_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
64454,51150.0,-3.538816,3.481893,-1.82713,-0.57305,2.644106,-0.340988,2.102135,-2.939006,2.578654,...,0.530978,-0.860677,-0.20181,-1.719747,0.729143,-0.547993,-0.023636,-0.454966,1.0,0
37906,39163.0,-0.363913,0.853399,1.648195,1.118934,0.100882,0.423852,0.47279,-0.97244,0.033833,...,0.687055,-0.094586,0.121531,0.14683,-0.944092,-0.558564,-0.186814,-0.257103,18.49,0
79378,57994.0,1.193021,-0.136714,0.622612,0.780864,-0.823511,-0.706444,-0.206073,-0.016918,0.781531,...,-0.310405,-0.842028,0.085477,0.366005,0.254443,0.290002,-0.036764,0.015039,23.74,0
245686,152859.0,1.604032,-0.808208,-1.594982,0.200475,0.502985,0.83237,-0.034071,0.23404,0.550616,...,0.519029,1.429217,-0.139322,-1.293663,0.037785,0.061206,0.005387,-0.057296,156.52,0
60943,49575.0,-2.669614,-2.734385,0.66245,-0.059077,3.34685,-2.549682,-1.430571,-0.11845,0.469383,...,-0.228329,-0.370643,-0.211544,-0.300837,-1.17459,0.573818,0.388023,0.161782,57.5,0


In [7]:
train_df.shape

(199364, 31)

In [8]:
# Summary statistics for every column; percentiles=[] leaves the median (50%)
# as the only quantile row in the output.
train_df.describe(include="all", percentiles=[])

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,...,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0,199364.0
mean,94888.815669,0.000492,-0.000726,0.000927,0.00063,3.6e-05,1.1e-05,-0.001286,-0.002889,-0.000891,...,0.001205,0.000155,-0.000198,0.000113,0.000235,0.000312,-0.000366,0.000227,88.164679,0.0017
std,47491.435489,1.95987,1.645519,1.505335,1.413958,1.361718,1.327188,1.210001,1.214852,1.096927,...,0.74851,0.726634,0.628139,0.60506,0.520857,0.48196,0.401541,0.333139,238.925768,0.041201
min,0.0,-56.40751,-72.715728,-31.813586,-5.683171,-42.147898,-26.160506,-43.557242,-73.216718,-13.320155,...,-34.830382,-8.887017,-44.807735,-2.824849,-10.295397,-2.24162,-22.565679,-11.710896,0.0,0.0
50%,84772.5,0.018854,0.065463,0.17908,-0.019531,-0.056703,-0.27529,0.040497,0.022039,-0.052607,...,-0.029146,0.007666,-0.011678,0.041031,0.016587,-0.05279,0.001239,0.011234,22.0,0.0
max,172792.0,2.451888,22.057729,9.382558,16.491217,34.801666,23.917837,44.054461,19.587773,15.594995,...,27.202839,10.50309,22.083545,4.022866,6.07085,3.517346,12.152401,33.847808,11898.09,1.0


In [9]:
# Separate the features from the target column "Class" for each split.
X_train_big, y_train_big = train_df.drop(columns=["Class"]), train_df["Class"]
# The test target must come from test_df so that it lines up row-for-row with X_test
# (taking it from train_df would give labels for the wrong rows and the wrong length).
X_test, y_test = test_df.drop(columns=["Class"]), test_df["Class"]

In [10]:
# Carve a validation set (30%) out of the training portion; the test set stays untouched.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_big, y_train_big, test_size=0.3, random_state=123
)

In [15]:
# Baseline that always predicts the most frequent class; averaged over the CV folds.
dummy = DummyClassifier(strategy="most_frequent")
dummy_cv = cross_validate(dummy, X_train, y_train, return_train_score=True)
pd.DataFrame(dummy_cv).mean()

fit_time       0.030734
score_time     0.003932
test_score     0.998302
train_score    0.998302
dtype: float64

In [16]:
# Relative frequency of each class: this is the accuracy a majority-class
# baseline gets for free, which is why accuracy alone is misleading here.
train_df["Class"].value_counts(normalize=True)

0    0.9983
1    0.0017
Name: Class, dtype: float64

In [21]:
# Scale the features, then fit a decision tree with a fixed seed.
pipe = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=123))

In [22]:
# Cross-validate the scaled decision tree and average each column over the folds.
tree_cv = cross_validate(pipe, X_train, y_train, return_train_score=True)
pd.DataFrame(tree_cv).mean()

fit_time       10.325765
score_time      0.018858
test_score      0.999119
train_score     1.000000
dtype: float64

In [23]:
train_df["Class"].value_counts(normalize=True)

0    0.9983
1    0.0017
Name: Class, dtype: float64

There are two kinds of binary classification problems:
* Distinguishing between two classes
* Spotting a class (fraud transaction, spam, disease)

**Confusion Matrix**

In [24]:
# Fit on the training split; the trailing `;` suppresses the estimator repr in the cell output.
pipe.fit(X_train, y_train);

In [29]:
# plot_confusion_matrix(pipe, X_valid, y_valid, display_labels=["Non fraud", "Fraud"], values_format="d", cmap="Blues");

* **True negative**: negative example, predict negative
* **False negative**: positive example, predict negative
* **False positive**: negative example, predict positive
* **True positive**: positive example, predict positive

In [32]:
# Confusion matrix on the validation split: rows are true classes, columns are predicted classes.
predictions = pipe.predict(X_valid)
confusion_matrix(y_true=y_valid, y_pred=predictions)

array([[59674,    34],
       [   26,    76]])

### Precision, Recall and F1 Score

In [33]:
# Same scaler + decision-tree pipeline, under the name used in the rest of this section.
pipe_tree = make_pipeline(StandardScaler(), DecisionTreeClassifier(random_state=123))

In [34]:
# Cross-validate pipe_tree and average each column over the folds.
pipe_tree_cv = cross_validate(pipe_tree, X_train, y_train, return_train_score=True)
pd.DataFrame(pipe_tree_cv).mean()

fit_time       11.937541
score_time      0.020289
test_score      0.999119
train_score     1.000000
dtype: float64

In [36]:
# Fit on the training split, predict the validation split, and tabulate the outcomes.
predictions = pipe_tree.fit(X_train, y_train).predict(X_valid)
confusion_matrix(y_true=y_valid, y_pred=predictions)

array([[59674,    34],
       [   26,    76]])

In [37]:
# ravel() flattens the 2x2 matrix row by row, so for labels [0, 1] the four
# counts come out in the order TN, FP, FN, TP.
TN, FP, FN, TP = confusion_matrix(y_valid, predictions).ravel()

**Recall**: among all positive examples, how many did you identify?

recall = TP/(TP + FN) = TP/(# positives)

In [39]:
# Recall: share of the actual positives that were predicted positive.
recall = TP / (TP + FN)
recall.round(4)

0.7451

**Precision**: Among the positive examples you identified, how many were actually positive?

precision = TP/(TP + FP)

In [40]:
# Precision: share of the predicted positives that are actually positive.
precision = TP /(TP + FP)
precision.round(4)

0.6909

**F1**: combines precision and recall to give one score

f1 = 2 * (precision * recall)/(precision + recall)

In [42]:
# F1: harmonic mean of precision and recall.
# NOTE(review): this variable shares its name with sklearn.metrics.f1_score, which a
# later cell imports; after that import the name refers to the function, not this float.
f1_score = (2 * precision * recall) / (precision + recall)
f1_score

0.7169811320754716

In [55]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Row "ourselves": metrics computed by hand from the confusion-matrix counts.
data = {}
data["accuracy"] = [(TP + TN) / (TN + FP + FN + TP)]
# The error rate gets its own column; storing it under "accuracy" would overwrite
# the accuracy value and leave the columns with different lengths.
data["error"] = [(FP + FN) / (TN + FP + FN + TP)]
data["precision"] = [TP / (TP + FP)]
data["recall"] = [TP / (TP + FN)]
data["f1 score"] = [(2 * precision * recall) / (precision + recall)]

# Row "sklearn": the same metrics from scikit-learn, on the same validation predictions.
pred_cv = pipe_tree.predict(X_valid)
data["accuracy"].append(accuracy_score(y_valid, pred_cv))
data["error"].append(1 - accuracy_score(y_valid, pred_cv))
data["precision"].append(precision_score(y_valid, pred_cv, zero_division=1))
data["recall"].append(recall_score(y_valid, pred_cv))
data["f1 score"].append(f1_score(y_valid, pred_cv))

# Build the comparison table only once both rows exist; the two rows should agree.
measures_df = pd.DataFrame(data, index=["ourselves", "sklearn"])
measures_df

**Classification Report**

In [52]:
from sklearn.metrics import classification_report

In [53]:
pipe_tree.classes_

array([0, 1])

In [54]:
# Per-class precision / recall / F1 on the validation split, labelled in class order [0, 1].
valid_predictions = pipe_tree.predict(X_valid)
report = classification_report(
    y_valid, valid_predictions, target_names=["non-fraud", "fraud"]
)
print(report)

              precision    recall  f1-score   support

   non-fraud       1.00      1.00      1.00     59708
       fraud       0.69      0.75      0.72       102

    accuracy                           1.00     59810
   macro avg       0.85      0.87      0.86     59810
weighted avg       1.00      1.00      1.00     59810



### Multi-Class Measurements

In [57]:
# from sklearn.datasets import load_digits
from sklearn.metrics import accuracy_score

```
# Load the digits dataset and look at the last image in it.
# NOTE(review): the `load_digits` import in the cell above is commented out —
# it has to be re-enabled before this block can run.
digits = load_digits()
digits.images[-1]
```

```
# Divide the pixel values by 16, then hold out a test split with a fixed seed.
X_digits = digits["data"] / 16.0
(X_train_digits, X_test_digits,
 y_train_digits, y_test_digits) = train_test_split(X_digits, digits["target"], random_state=0)

# Default k-nearest-neighbours classifier, scored by accuracy on the held-out split.
knn = KNeighborsClassifier()
knn.fit(X_train_digits, y_train_digits)
pred = knn.predict(X_test_digits)
print("Accuracy: ", accuracy_score(y_test_digits, pred).round(4))
```

```
# `plot_confusion_matrix` has been removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement and takes the same
# estimator / X / y arguments.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(knn, X_test_digits, y_test_digits, cmap="gray_r");
# NOTE(review): `plt` (matplotlib.pyplot) is not imported in the visible notebook — confirm.
plt.show()
```

```
print(classification_report(y_test_digits, pred, digits=4))
```

**Macro average vs weighted average**

```
print(classification_report(y_test_digits, pred, digits=4))
```

**Macro average**: Give equal importance to all classes.

**Weighted average**: Each class's score is weighted by the number of samples in that class, and the weighted sum is divided by the total number of samples.

### Imbalanced Datasets

**Class imbalance in training sets**

In [63]:
X_train.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
80437,58486.0,0.984032,-1.851494,0.670618,-1.192458,-2.092441,-0.511208,-1.014356,-0.088517,-1.947026,...,0.187838,0.151899,0.293799,-0.200566,0.548729,0.207097,-0.159589,0.005833,0.061062,249.0
60984,49594.0,-0.985503,1.429365,1.528503,0.786004,-0.467397,-0.298816,0.080549,0.464257,-1.157092,...,0.255637,-0.129198,-0.451259,-0.104275,0.346654,-0.039245,0.367483,0.049164,0.053953,5.99
128056,78583.0,-1.20575,0.892452,2.380624,-0.016385,-0.442344,0.312907,0.158987,0.048223,0.990212,...,0.19252,-0.072519,0.168212,-0.157596,0.084111,-0.172349,0.281843,-0.126878,0.035593,11.5
71109,54160.0,-1.221491,0.584432,0.26096,-2.183469,0.148069,-0.353845,0.134643,0.220526,-1.162636,...,0.444476,0.182635,0.720601,-0.387976,-0.799762,0.172382,-0.28763,0.301891,0.010734,23.9
172062,120937.0,2.108906,-0.036505,-1.729322,-0.006799,0.594977,-0.414454,0.167962,-0.220672,0.330188,...,-0.166839,0.231473,0.833535,-0.151937,-1.046125,0.400704,-0.062815,-0.021197,-0.078745,0.89


In [64]:
# Class proportions in the training target. The first positional parameter of
# Series.value_counts is `normalize`, so pass it explicitly instead of a
# truthy string that only works by accident.
y_train.value_counts(normalize=True)

0    0.998302
1    0.001698
Name: Class, dtype: float64

**Addressing class imbalance**

A very important question to ask: why do I have a class imbalance?

* Is it because one class is much rarer than the other?
* Is it because of my data collection methods?

But if you answer "no" to both of these, it may be fine to just ignore the class imbalance.

**Handling imbalance**

There are two common approaches to this:
1. Changing the training procedure
2. Changing the data (not in this course)
   * Undersampling
   * Oversampling

In [65]:
# Reference tree with no class weighting.
tree_default = DecisionTreeClassifier(random_state=7).fit(X_train, y_train)

In [66]:
# Same tree, but class 1 is given a weight of 100 during training.
tree_100 = DecisionTreeClassifier(class_weight={1: 100}, random_state=7).fit(X_train, y_train)

**class_weight="balanced"**

In [68]:
# Same tree, with class weights chosen automatically via class_weight="balanced".
tree_balanced = DecisionTreeClassifier(class_weight="balanced", random_state=7).fit(X_train, y_train)

In [69]:
tree_default.score(X_valid, y_valid)

0.9990971409463301

In [70]:
tree_balanced.score(X_valid,y_valid)

0.998913225213175

Is stratifying a good idea?

Yes and no:
* No longer a random sample
* It can be especially useful in multi-class situations.

But in general, these are difficult questions to answer.

### Regression Measurements

In [71]:
# Switch to the housing data for the regression section.
# Note that train_df / test_df are rebound here and no longer hold the credit-card split.
housing_df = pd.read_csv("data/housing.csv")
train_df, test_df = train_test_split(
    housing_df,
    test_size=0.1,
    random_state=123,
)

In [72]:
# Regression target; every other column is a feature.
target_column = "median_house_value"

X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

In [73]:
# Column names grouped by the kind of preprocessing they need.
numeric_features = [
    "longitude", "latitude", "housing_median_age", "households",
    "median_income", "total_rooms", "total_bedrooms", "population",
]

categorical_features = ["ocean_proximity"]

In [74]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
6051,-117.75,34.04,22.0,2948.0,636.0,2600.0,602.0,3.125,INLAND
20113,-119.57,37.94,17.0,346.0,130.0,51.0,20.0,3.4861,INLAND
14289,-117.13,32.74,46.0,3355.0,768.0,1457.0,708.0,2.6604,NEAR OCEAN
13665,-117.31,34.02,18.0,1634.0,274.0,899.0,285.0,5.2139,INLAND
14471,-117.23,32.88,18.0,5566.0,1465.0,6303.0,1458.0,1.858,NEAR OCEAN


In [90]:
#from sklearn.impute import SimpleImputer

```
# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Categorical columns: fill gaps with the literal "missing", then one-hot encode;
# categories not seen during fit are ignored at transform time.
categorical_transformer = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="missing"),
    OneHotEncoder(handle_unknown="ignore"),
)

# Route each column group to its transformer; any other column passes through unchanged.
preprocessor = make_column_transformer(
    (numeric_transformer, numeric_features),
    (categorical_transformer, categorical_features),
    remainder="passthrough",
)

# Full regression pipeline: preprocessing followed by k-nearest-neighbours regression.
pipe = make_pipeline(preprocessor, KNeighborsRegressor())
pipe.fit(X_train, y_train);
```

```
# Predict on the training data itself, then compare predictions with the true targets.
predicted_y = pipe.predict(X_train)

# Element-wise equality check between predictions and targets.
predicted_y == y_train

# True target values as a plain array.
y_train.values

# Only this last expression is displayed when the lines run as a single cell.
predicted_y
```

**Regression Measurements**

**mean squared error (MSE)** = $\frac{1}{n} \sum_i^n({true_i - predicted_i})^2$ (n = total # of samples)

```
# Mean squared error by hand: square each error first, then average.
# Squaring after the mean would give (mean error)^2, which lets positive and
# negative errors cancel out.
np.mean((y_train - predicted_y) ** 2)
```

In [96]:
from sklearn.metrics import mean_squared_error

```
mean_squared_error(y_train, predicted_y)
```

**$R^2$**

The maximum value possible is 1 which means the model has perfect predictions. Negative values are very bad: "worse than baseline models such as DummyRegressor"

In [97]:
from sklearn.metrics import r2_score

```
r2_score(y_train, predicted_y)
```

**Root mean squared error (RMSE)**

$RMSE = \sqrt{MSE} = \sqrt{\frac{1}{n} \sum_i^n({true_i - predicted_i})^2}$

```
# RMSE is the square root of the MSE, so it is in the same units as the target.
mse = mean_squared_error(y_train, predicted_y)
np.sqrt(mse)
```

**MAPE - Mean Absolute Percent Error**

```
# Signed percentage error per sample, relative to the true value.
percent_errors = (predicted_y - y_train)/y_train * 100
percent_errors.head()
# Absolute values so over- and under-predictions do not cancel out.
np.abs(percent_errors).head()
# MAPE: the mean of the absolute percentage errors (only this line is displayed
# when the lines run as a single cell).
100.*np.mean(np.abs((predicted_y - y_train)/y_train))
```

### Passing Different Scoring Methods

In [98]:
train_df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
6051,-117.75,34.04,22.0,2948.0,636.0,2600.0,602.0,3.125,113600.0,INLAND
20113,-119.57,37.94,17.0,346.0,130.0,51.0,20.0,3.4861,137500.0,INLAND
14289,-117.13,32.74,46.0,3355.0,768.0,1457.0,708.0,2.6604,170100.0,NEAR OCEAN
13665,-117.31,34.02,18.0,1634.0,274.0,899.0,285.0,5.2139,129300.0,INLAND
14471,-117.23,32.88,18.0,5566.0,1465.0,6303.0,1458.0,1.858,205000.0,NEAR OCEAN


**Cross Validation**

```
# X_train / y_train now hold the housing data, so cross-validate the regression
# pipeline `pipe`. `pipe_tree` is the fraud classifier and cannot be scored with RMSE.
# The scorer is negated so that "greater is better" still holds.
pd.DataFrame(cross_validate(pipe, X_train, y_train, return_train_score=True,
                            scoring="neg_root_mean_squared_error"))
```

In [102]:
from sklearn.metrics import make_scorer

In [103]:
def mape(true, pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Parameters
    ----------
    true : array-like
        Observed target values. A zero entry produces a division by zero
        (inf or nan in the result).
    pred : array-like
        Predicted values, in the same order and of the same length as ``true``.

    Returns
    -------
    float
        ``100 * mean(|pred - true| / |true|)``.
    """
    # Convert to plain arrays so lists are accepted and two pandas objects are
    # compared by position rather than being aligned on their index labels.
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    return 100.0 * np.mean(np.abs((pred - true) / true))

In [104]:
# Wrap mape as a scorer. make_scorer defaults to greater_is_better=True, so the
# scores are plain (positive) MAPE values: fine for reporting, but a search that
# maximises the score would prefer the model with the largest error.
mape_scorer = make_scorer(mape)

```
# Define the regression pipeline before its first use so the notebook runs top to bottom.
pipe_regression = make_pipeline(preprocessor, KNeighborsRegressor())

# Cross-validate with the custom MAPE scorer (values are percentages; lower is better).
pd.DataFrame(cross_validate(pipe_regression, X_train, y_train, return_train_score=True,
    scoring = mape_scorer))
```

```
# Several metrics in one run: each key becomes a test_<key> / train_<key> column.
scoring = {
    "r2": "r2",
    "mape_score": mape_scorer,
    "neg_rmse": "neg_root_mean_squared_error",
    "neg_mse": "neg_mean_squared_error",
}

multi_metric_cv = cross_validate(
    pipe_regression, X_train, y_train, return_train_score=True, scoring=scoring
)
pd.DataFrame(multi_metric_cv)
```
        

**What about hyperparameter tuning?**

```
# Regression pipeline to tune (the estimator class is KNeighborsRegressor).
pipe_regression = make_pipeline(preprocessor, KNeighborsRegressor())

# make_pipeline names the step after the lower-cased class name, hence the
# "kneighborsregressor__" prefix.
param_grid = {"kneighborsregressor__n_neighbors": [2,5,50,100]}

# mape_scorer treats larger values as better, so this search picks the candidate
# with the largest MAPE, i.e. the worst one.
grid_search = GridSearchCV(pipe_regression, param_grid, cv=5, return_train_score=True,
                            n_jobs=-1, scoring = mape_scorer)

# best_params_ / best_score_ only exist after the search has been fitted.
grid_search.fit(X_train, y_train);

grid_search.best_params_

grid_search.best_score_
```

```
# greater_is_better=False makes the scorer return the negated MAPE, so that
# maximising the score minimises the error.
neg_mape_scorer = make_scorer(mape, greater_is_better=False)

param_grid = {"kneighborsregressor__n_neighbors": [2,5,50,100]}

# Pass the negated scorer: passing mape_scorer here would select the worst candidate.
grid_search = GridSearchCV(pipe_regression, param_grid, cv=5, return_train_score=True,
                            verbose=1, n_jobs=-1, scoring = neg_mape_scorer)

grid_search.fit(X_train, y_train);

grid_search.best_params_

# Negative by construction; flip the sign to read it as a MAPE percentage.
grid_search.best_score_
```

**Classification**

In [105]:
# Back to the credit-card data: rebuild the same 70/30 split (same seed as before).
train_df, test_df = train_test_split(
    cc_df,
    test_size=0.3,
    random_state=111,
)
train_df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
64454,51150.0,-3.538816,3.481893,-1.82713,-0.57305,2.644106,-0.340988,2.102135,-2.939006,2.578654,...,0.530978,-0.860677,-0.20181,-1.719747,0.729143,-0.547993,-0.023636,-0.454966,1.0,0
37906,39163.0,-0.363913,0.853399,1.648195,1.118934,0.100882,0.423852,0.47279,-0.97244,0.033833,...,0.687055,-0.094586,0.121531,0.14683,-0.944092,-0.558564,-0.186814,-0.257103,18.49,0
79378,57994.0,1.193021,-0.136714,0.622612,0.780864,-0.823511,-0.706444,-0.206073,-0.016918,0.781531,...,-0.310405,-0.842028,0.085477,0.366005,0.254443,0.290002,-0.036764,0.015039,23.74,0
245686,152859.0,1.604032,-0.808208,-1.594982,0.200475,0.502985,0.83237,-0.034071,0.23404,0.550616,...,0.519029,1.429217,-0.139322,-1.293663,0.037785,0.061206,0.005387,-0.057296,156.52,0
60943,49575.0,-2.669614,-2.734385,0.66245,-0.059077,3.34685,-2.549682,-1.430571,-0.11845,0.469383,...,-0.228329,-0.370643,-0.211544,-0.300837,-1.17459,0.573818,0.388023,0.161782,57.5,0


In [106]:
# Separate the features from the target column "Class" for each split.
X_train, y_train = train_df.drop(columns=["Class"]), train_df["Class"]
# The test target must come from test_df so that it lines up row-for-row with X_test
# (taking it from train_df would give labels for the wrong rows and the wrong length).
X_test, y_test = test_df.drop(columns=["Class"]), test_df["Class"]

In [107]:
# Decision tree with automatic class re-weighting and a fixed seed; max_depth is tuned below.
dt_model = DecisionTreeClassifier(random_state=123, class_weight='balanced')

In [111]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.stats` has been loaded.
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV

# Distribution to sample max_depth from: integers from 1 up to 99 (`high` is exclusive).
param_grid = {"max_depth": scipy.stats.randint(low=1, high=100)}

```
# Randomized search over max_depth, scored by F1 (accuracy is uninformative on this
# imbalanced data). random_state fixes which 6 candidates are sampled, so re-running
# the notebook gives the same result.
grid_search = RandomizedSearchCV(dt_model, param_grid, cv=5, return_train_score=True,
                                 verbose=1, n_jobs=-1, scoring='f1', n_iter=6,
                                 random_state=123)
grid_search.fit(X_train, y_train);
```

```
# Best sampled max_depth and its mean cross-validated F1 score.
# When both lines run in one cell, only the last expression is displayed.
grid_search.best_params_
grid_search.best_score_
```