In [None]:
# %autosave 0

# 4. Evaluation Metrics for Classification

In the previous session we trained a model for predicting churn. How do we know if it's good?


## 4.1 Evaluation metrics: session overview 

* Dataset: https://www.kaggle.com/blastchar/telco-customer-churn
* https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv


*Metric* - function that compares the predictions with the actual values and outputs a single number that tells how good the predictions are

In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Load the raw churn data into the 'df' dataframe
df = pd.read_csv('data-week-3.csv')

# Normalise the header: lowercase names, underscores instead of spaces
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Columns stored with the 'object' dtype are the ones treated as text columns
categorical_columns = df.dtypes[df.dtypes == 'object'].index.tolist()

# Apply the same lowercase/underscore normalisation to the values of each text column
for col in categorical_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# 'totalcharges' was picked up as a text column, so parse it into numbers.
# errors='coerce' turns unparseable entries into NaN, which are then replaced by 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as an integer: 'yes' -> 1, anything else -> 0
df['churn'] = (df['churn'] == 'yes').astype(int)


In [None]:
# Hold out 20% of the rows as the test set; random_state=1 keeps the split reproducible
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Carve the validation set out of the remaining 80%:
# 25% of 80% = 20% of all rows, so the overall split is 60% / 20% / 20%
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Replace the shuffled original index of each part with a fresh continuous one
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() returns the 'churn' column and removes it from the frame in one step,
# so the target cannot leak into the features.
# df_full_train is not touched here and still contains 'churn'.
y_train = df_train.pop('churn').values
y_val = df_val.pop('churn').values
y_test = df_test.pop('churn').values

In [None]:
# 'numerical' and 'categorical' are plain Python lists of column names: 'numerical' lists the numerical feature columns,
# while 'categorical' lists the categorical feature columns. Together they define the features used by the model.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

categorical = [
    'gender',
    'seniorcitizen',
    'partner',
    'dependents',
    'phoneservice',
    'multiplelines',
    'internetservice',
    'onlinesecurity',
    'onlinebackup',
    'deviceprotection',
    'techsupport',
    'streamingtv',
    'streamingmovies',
    'contract',
    'paperlessbilling',
    'paymentmethod',
]

In [None]:
# Create an instance of DictVectorizer with sparse=False to return dense arrays
dv = DictVectorizer(sparse=False)

# Convert the training dataframe into a list of dictionaries (one dictionary per record)
# This format is required by DictVectorizer to process the data
# Only the columns listed in 'categorical' and 'numerical' are used as features
train_dict = df_train[categorical + numerical].to_dict(orient='records')

# Fit the DictVectorizer to the training data and transform it into a feature matrix
# 'fit_transform' learns the structure of the data, including column names and values
# It one-hot encodes string (categorical) values and passes numeric values through unchanged
X_train = dv.fit_transform(train_dict)

# Create a Logistic Regression model
# NOTE(review): this uses the default solver settings, while the train() helper in
# section 4.7 passes max_iter=1000 - confirm whether this fit emits a ConvergenceWarning
model = LogisticRegression()

# Train the model using the transformed training data and corresponding target values
model.fit(X_train, y_train)


In [None]:
# Convert the validation DataFrame into a list of dictionaries (one dictionary per record)
# This transformation is necessary for DictVectorizer to process the validation data
val_dict = df_val[categorical + numerical].to_dict(orient='records')

# Transform the validation data using the trained DictVectorizer
# Unlike training, we only use 'transform' here since the DictVectorizer is already fitted
X_val = dv.transform(val_dict)

# Predict probabilities using the trained model
# 'predict_proba' returns two columns: probability of class 0 and class 1
# We extract the second column, which represents the probability of churn (class 1)
y_pred = model.predict_proba(X_val)[:, 1]

# Make churn decisions based on a threshold of 0.5
# If the predicted probability is >= 0.5, churn_decision is True; otherwise, it is False
churn_decision = (y_pred >= 0.5)

# Calculate the model's accuracy by comparing predicted churn decisions with actual values
# The 'mean' function computes the proportion of correct predictions
(y_val == churn_decision).mean()


## 4.2 Accuracy and dummy model

source: https://knowmledge.com/2023/10/03/ml-zoomcamp-2023-evaluation-metrics-for-classification-part-2/

#### In the last article, we calculated that our model achieved an accuracy of 80% on the validation data. 
#### Now, let’s determine whether this is a good value or not.

### What we will do

* Evaluate the model on different thresholds
* Check the accuracy of dummy baselines

In [None]:
len(y_val)

In [None]:
(y_val == churn_decision).mean()

In [None]:
1132/ 1409

### Evaluate the model on different thresholds

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score(y_val, y_pred >= 0.5)

In [None]:
# 21 evenly spaced thresholds: 0.00, 0.05, ..., 1.00
thresholds = np.linspace(0, 1, 21)

# Validation accuracy obtained when cutting the predicted probabilities at each threshold
scores = [accuracy_score(y_val, y_pred >= t) for t in thresholds]

# Print one "threshold accuracy" line per threshold
for t, score in zip(thresholds, scores):
    print('%.2f %.3f' % (t, score))

In [None]:
plt.plot(thresholds, scores)

### Check Accuracy of Dummy Baseline

In [None]:
from collections import Counter

In [None]:
Counter(y_pred >= 1.0)

In [None]:
# Distribution of y_val
Counter(y_val)

In [None]:
1023/1409

In [None]:
y_val.mean()

In [None]:
1 - y_val.mean()

The dataset has a class imbalance, with 27% churning customers and 73% non-churning customers. This makes accuracy misleading, as a dummy model predicting only the majority class could still achieve high accuracy while failing to identify the minority class (churning customers). To address this, alternative metrics should be used:

* Precision: Measures true positive predictions among all positive predictions, useful when false positives are costly.
* Recall: Measures true positive predictions among actual positives, important when false negatives are costly.
* F1-Score: Balances precision and recall, considering both false positives and negatives.
* AUC-ROC: Assesses the ability to distinguish between classes, particularly helpful for imbalanced datasets.


Choosing the right metric depends on the problem’s goals, with an emphasis on accurately identifying the minority class in cases of imbalance.

## 4.3 Confusion table / matrix

(source: https://knowmledge.com/2023/10/04/ml-zoomcamp-2023-evaluation-metrics-for-classification-part-3/)

A confusion matrix is a tool used to check how well a classification model is working. It helps us see the mistakes and correct predictions the model makes.

Sometimes, using only accuracy can be misleading, especially if the classes are imbalanced (one class appears much more than the other). The confusion matrix gives a better way to evaluate the model by breaking predictions into four types:
* True Positives (TP): The model correctly predicted the positive class (churning customers).
* True Negatives (TN): The model correctly predicted the negative class (non-churning customers).
* False Positives (FP): The model wrongly predicted the positive class (Type I error).
* False Negatives (FN): The model wrongly predicted the negative class (Type II error).


| g(xi) < t (NEGATIVE – NO CHURN) | g(xi) < t (NEGATIVE – NO CHURN) | g(xi) >= t (POSITIVE – CHURN) | g(xi) >= t (POSITIVE – CHURN) |
|----------------------------------|----------------------------------|--------------------------------|--------------------------------|
| **C didn’t churn** | **C churned** | **C didn’t churn** | **C churned** |
| correct | incorrect | incorrect | correct |
| **TRUE NEGATIVE (TN)** | **FALSE NEGATIVE (FN)** | **FALSE POSITIVE (FP)** | **TRUE POSITIVE (TP)** |
| g(xi) < t & y = 0 | g(xi) < t & y = 1 | g(xi) >= t & y = 0 | g(xi) >= t & y = 1 |



Here, we will do the following:

* Different types of errors and correct decisions
* Arranging them in a table


### Different types of errors and correct decisions

In [None]:
# people who are going to churn
actual_positive = (y_val == 1)
# people who are not going to churn
actual_negative = (y_val == 0)

In [None]:
t = 0.5
predict_positive = (y_pred >= t)
predict_negative = (y_pred < t)

In [None]:
predict_positive & actual_positive

In [None]:
tp = (predict_positive & actual_positive).sum()
tp

In [None]:
tn = (predict_negative & actual_negative).sum()
tn

In [None]:
fp = (predict_positive & actual_negative).sum()
fp

In [None]:
fn = (predict_negative & actual_positive).sum()
fn

### Arranging them in a table

That was preparation for understanding the confusion matrix. The confusion matrix is a way to consolidate all these values (tp, tn, fp, fn) into a single table. This table comprises 4 cells, forming a 2×2 matrix.

* In the columns of this table, we have the predictions (NEGATIVE: g(xi) < t and POSITIVE: g(xi) >= t).
* In the rows, we have the actual values (NEGATIVE: y=0 and POSITIVE: y=1).

Now, let’s proceed to implement this confusion matrix in NumPy.

In [None]:
confusion_matrix = np.array([
    [tn, fp],
    [fn, tp]
])
confusion_matrix

|                | **NO CHURN** <br> *(g(xi) < t) NEGATIVE* | **CHURN** <br> *(g(xi) >= t) POSITIVE* |
|:--------------:|:--------------------------------:|:-----------------------------:|
| **NO CHURN** <br> *y=0 NEGATIVE* | **True Negative TN** <br> 922 <br> **65%** | **False Positive FP** <br> 101 <br> **7%** |
| **CHURN** <br> *y=1 POSITIVE*   | **False Negative FN** <br> 176 <br> **12%** | **True Positive TP** <br> 210 <br> **15%** |


In [None]:
(confusion_matrix / confusion_matrix.sum()).round(2)

## 4.4 Precision and Recall

(source: https://knowmledge.com/2023/10/05/ml-zoomcamp-2023-evaluation-metrics-for-classification-part-4/)


Precision and Recall are essential metrics for evaluating binary classification models.

**Precision** measures the fraction of positive predictions that were correct. In other words, it quantifies how accurately the model predicts customers who are likely to churn.

Precision = True Positives / (# Positive Predictions) = True Positives / (True Positives + False Positives)

**Recall**, on the other hand, quantifies the fraction of actual positive cases that were correctly identified by the model. It assesses how effectively the model captures all customers who are actually churning.

Recall = True Positives / (# Positive Observations) = True Positives / (True Positives + False Negatives)

In summary, precision focuses on the accuracy of positive predictions, while recall emphasizes the model’s ability to capture all positive cases. These metrics are crucial for understanding the trade-offs between correctly identifying churning customers and minimizing false positives.

| **Actual Values**        | **Negative Predictions** <br> *(g(xi) < t)* | **Positive Predictions** <br> *(g(xi) >= t)* |
|:------------------------:|:--------------------------------:|:-----------------------------:|
| **Negative Example** <br> *y=0* | **TN** | **FP** |
| **Positive Example** <br> *y=1* | **FN** | **TP** |
<p style="text-align: center">Confusion Matrix Recall = TP / (TP + FN) &nbsp;&nbsp; Precision = TP / (TP + FP)</p>

In [None]:
accuracy = (tp + tn) / (tp + tn + fp + fn)
accuracy

In [None]:
precision = tp / (tp + fp)
precision

In [None]:
# --> promotional email goes to 311 people, but 210 are actually going to churn (--> 33% are mistakes)
tp + fp

In [None]:
recall = tp / (tp + fn)
recall

In [None]:
# --> For 46% of people who are churning we failed to identify them
tp + fn

## 4.5 ROC Curves

(source: https://knowmledge.com/2023/10/06/ml-zoomcamp-2023-evaluation-metrics-for-classification-part-5/)

ROC (Receiver Operating Characteristic) curves help evaluate how well a binary classification model distinguishes between two classes. They show the trade-off between false positives and true positives at different decision thresholds.

The curve is created by plotting True Positive Rate (TPR) against False Positive Rate (FPR) for different thresholds. The AUC-ROC (Area Under the Curve - ROC) measures overall performance—higher values mean better model discrimination.

ROC curves help choose the best threshold by balancing false positives and true positives based on the problem’s needs.

| Actual Values     | Negative Predictions (g(xᵢ) < t) | Positive Predictions (g(xᵢ) ≥ t) |
|:----------------:|:--------------------------------:|:--------------------------------:|
| **Negative Example (y=0)** | **TN** | **FP** |
|  |  | **FPR = FP / (TN + FP)** |
| **Positive Example (y=1)** | **FN** | **TP** |
|  |  | **TPR = TP / (FN + TP)** |

<p style="text-align: center">Confusion matrix: FPR – False Positive Rate, TPR – True Positive Rate</p>


### TPR and FPR

In [None]:
tpr = tp / (tp + fn)
tpr

In [None]:
fpr = fp / (fp + tn)
fpr

In [None]:
# The ROC curve is a useful visualization tool that allows you to assess the performance
# of a binary classification model across a range of decision thresholds.
# Here we count the confusion-matrix cells of the validation predictions at 101 evenly
# spaced thresholds in [0, 1]; each entry of 'scores' is (threshold, tp, fp, fn, tn).

scores = []

thresholds = np.linspace(0, 1, 101)

# The actual labels do not depend on the threshold, so build these masks once
actual_positive = (y_val == 1)
actual_negative = (y_val == 0)

# Loop-local names carry a trailing underscore so that the notebook-level
# t / tp / tn / fp / fn computed earlier at t = 0.5 are not overwritten;
# otherwise re-running the precision/recall cells after this one would use
# the counts of the last threshold (1.0) instead.
for t_ in thresholds:
    predict_positive_ = (y_pred >= t_)
    predict_negative_ = (y_pred < t_)

    tp_ = (predict_positive_ & actual_positive).sum()
    tn_ = (predict_negative_ & actual_negative).sum()

    fp_ = (predict_positive_ & actual_negative).sum()
    fn_ = (predict_negative_ & actual_positive).sum()

    scores.append((t_, tp_, fp_, fn_, tn_))

In [None]:
columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
df_scores = pd.DataFrame(scores, columns=columns)
df_scores

In [None]:
# We can look at every tenth record by using the slice [::10] (a step of 10). This selects the
# first record and then every 10th record after it.
df_scores[::10]

In [None]:
df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)
df_scores[::10]

In [None]:
plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR')
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR')
plt.legend()

### Random model

In [None]:
np.random.seed(1)
y_rand = np.random.uniform(0, 1, size=len(y_val))
y_rand.round(3)

In [None]:
# Accuracy for our random model is around 50%
((y_rand >= 0.5) == y_val).mean()

In [None]:
#Let’s put the previously used code into a function.
def tpr_fpr_dataframe(y_val, y_pred, thresholds=None):
    """Build a table of confusion-matrix counts, TPR and FPR per threshold.

    Parameters
    ----------
    y_val : array-like
        Actual labels; 1 marks the positive class and 0 the negative class.
    y_pred : array-like
        Predicted scores, compared against each threshold with ``>=``.
    thresholds : iterable of float, optional
        Thresholds to evaluate. Defaults to 101 evenly spaced values in [0, 1].

    Returns
    -------
    pandas.DataFrame
        One row per threshold with the columns
        'threshold', 'tp', 'fp', 'fn', 'tn', 'tpr', 'fpr'.
        'tpr' is tp / (tp + fn) and 'fpr' is fp / (fp + tn); a rate is NaN
        when its denominator is zero (no positives or no negatives in y_val).
    """
    # Accept plain lists / Series as well as NumPy arrays
    y_val = np.asarray(y_val)
    y_pred = np.asarray(y_pred)

    if thresholds is None:
        thresholds = np.linspace(0, 1, 101)

    # The label masks do not depend on the threshold, so compute them once
    actual_positive = (y_val == 1)
    actual_negative = (y_val == 0)

    scores = []

    for t in thresholds:
        predict_positive = (y_pred >= t)
        predict_negative = (y_pred < t)

        tp = (predict_positive & actual_positive).sum()
        tn = (predict_negative & actual_negative).sum()

        fp = (predict_positive & actual_negative).sum()
        fn = (predict_negative & actual_positive).sum()

        scores.append((t, tp, fp, fn, tn))

    columns = ['threshold', 'tp', 'fp', 'fn', 'tn']
    df_scores = pd.DataFrame(scores, columns=columns)

    df_scores['tpr'] = df_scores.tp / (df_scores.tp + df_scores.fn)
    df_scores['fpr'] = df_scores.fp / (df_scores.fp + df_scores.tn)

    return df_scores

In [None]:
df_rand = tpr_fpr_dataframe(y_val, y_rand)
df_rand[::10]

In [None]:
plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR')
plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR')
plt.legend()

### Ideal model

Now, let’s discuss the concept of an ideal model that makes correct predictions for every example. To implement this, we need to determine the number of negative examples, which corresponds to the number of people who are not churning in our dataset.

In [None]:
num_neg = (y_val == 0).sum()
num_pos = (y_val == 1).sum()
num_neg, num_pos

In [None]:
y_ideal = np.repeat([0, 1], [num_neg, num_pos])
y_ideal

In [None]:
y_ideal_pred = np.linspace(0, 1, len(y_val))
y_ideal_pred

In [None]:
1 - y_val.mean()

In [None]:
# y_ideal_pred is evenly spaced over [0, 1] and y_ideal lists all negatives first,
# so cutting at the share of negatives (1 - churn rate) separates the two classes.
# Deriving the threshold from y_val replaces the hard-coded 0.726, which was only
# valid for this particular validation split.
ideal_threshold = 1 - y_val.mean()
accuracy_ideal = ((y_ideal_pred >= ideal_threshold) == y_ideal).mean()
accuracy_ideal

In [None]:
df_ideal = tpr_fpr_dataframe(y_ideal, y_ideal_pred)
df_ideal[::10]

In [None]:
plt.plot(df_ideal.threshold, df_ideal['tpr'], label='TPR')
plt.plot(df_ideal.threshold, df_ideal['fpr'], label='FPR')
plt.legend()

### Putting everything together
Now let’s plot all the models together so that we can compare our model against the benchmarks.

In [None]:
plt.plot(df_scores.threshold, df_scores['tpr'], label='TPR', color='black')
plt.plot(df_scores.threshold, df_scores['fpr'], label='FPR', color='blue')

# plt.plot(df_rand.threshold, df_rand['tpr'], label='TPR random', color='grey')
# plt.plot(df_rand.threshold, df_rand['fpr'], label='FPR random', color='grey')

plt.plot(df_ideal.threshold, df_ideal['tpr'], label='TPR ideal')
plt.plot(df_ideal.threshold, df_ideal['fpr'], label='FPR ideal')

plt.legend()

In [None]:
plt.figure(figsize=(5, 5))

plt.plot(df_scores.fpr, df_scores.tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.legend()

In [None]:
### We can also use the ROC functionality of scikit learn package
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
plt.figure(figsize=(5, 5))

plt.plot(fpr, tpr, label='Model')
plt.plot([0, 1], [0, 1], label='Random', linestyle='--')

plt.xlabel('FPR')
plt.ylabel('TPR')

plt.legend()

#### What kind of information do we get from ROC curve?

ROC curves show how a model works at different thresholds. Here’s a simple breakdown:

1. **Starting Point (Lower-Left Corner):**
   - **Threshold = 1.0**: The model is very strict. It predicts *no one* will churn (all are non-churning).
   - **TPR (True Positive Rate) = 0**: No churning customers are correctly identified.
   - **FPR (False Positive Rate) = 0**: No mistakes (no non-churning customers are wrongly labeled as churning).

2. **Moving Right (Lower Thresholds):**
   - As the threshold drops, the model predicts *more* customers will churn.
   - **TPR increases**: More churning customers are correctly found.
   - **FPR increases**: More non-churning customers are wrongly labeled as churning.

3. **Ending Point (Upper-Right Corner):**
   - **Threshold = 0.0**: The model predicts *everyone* will churn.
   - **TPR = 100%**: All churning customers are found.
   - **FPR = 100%**: All non-churning customers are wrongly labeled as churning.

4. **What the Curve Shows:**
   - Each point on the ROC curve represents a different threshold.
   - A curve closer to the top-left corner (ideal) means the model works well.
   - A curve close to the diagonal line (random baseline) means the model is no better than guessing.

5. **AUC (Area Under the Curve):**
   - AUC measures the overall performance of the model.
   - Higher AUC = better at distinguishing churning vs. non-churning customers.

In short, ROC curves help pick the best threshold to balance finding true churning customers (high TPR) while avoiding false alarms (low FPR).

## 4.6 ROC AUC

(source: https://knowmledge.com/2023/10/07/ml-zoomcamp-2023-evaluation-metrics-for-classification-part-6/)

* Area under the ROC curve - useful metric
* Interpretation of AUC

#### Useful metric

In [None]:
from sklearn.metrics import auc

In [None]:
# auc needs values for x-axis and y-axis
auc(fpr, tpr)

In [None]:
auc(df_scores.fpr, df_scores.tpr)

In [None]:
auc(df_ideal.fpr, df_ideal.tpr)

In [None]:
fpr, tpr, thresholds = roc_curve(y_val, y_pred)
auc(fpr, tpr)

In [None]:
# There is a shortcut in scikit-learn package
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y_val, y_pred)  # same result as above

#### AUC interpretation

In [None]:
neg = y_pred[y_val == 0]
pos = y_pred[y_val == 1]

In [None]:
import random

In [None]:
pos_ind = random.randint(0, len(pos) -1)
neg_ind = random.randint(0, len(neg) -1)

In [None]:
pos[pos_ind] > neg[neg_ind]

In [None]:
# Monte Carlo reading of AUC: repeatedly draw one random positive and one random
# negative example and count how often the positive one gets the higher score.
n = 100000
success = 0

# Seed the stdlib RNG so the estimate is identical on every Restart & Run All
random.seed(1)

for i in range(n):
    # random.randint includes both endpoints, hence the "- 1"
    pos_ind = random.randint(0, len(pos) - 1)
    neg_ind = random.randint(0, len(neg) - 1)

    if pos[pos_ind] > neg[neg_ind]:
        success = success + 1

success / n

In [None]:
n = 50000

np.random.seed(1)
pos_ind = np.random.randint(0, len(pos), size=n)
neg_ind = np.random.randint(0, len(neg), size=n)

(pos[pos_ind] > neg[neg_ind]).mean()

## 4.7 Cross-Validation

(source: https://knowmledge.com/2023/10/08/ml-zoomcamp-2023-evaluation-metrics-for-classification-part-7/)

* Evaluating the same model on different subsets of data
* Getting the average prediction and the spread within predictions

#### Evaluating the same model on different subsets of data

Parameter tuning means picking the best settings for a model. To do this, we split our data into three parts: training, validation, and testing. The validation set helps us find the best parameters. The test set is kept aside for now.

We combine the training and validation sets into full_train and split it into three (k=3) smaller parts.

1. Train on parts 1 & 2, validate on part 3, and calculate AUC.
2. Train on parts 1 & 3, validate on part 2, and calculate AUC.
3. Train on parts 2 & 3, validate on part 1, and calculate AUC.

After this, we find the mean and standard deviation of the AUC values. The standard deviation tells us how stable the model is across different splits.

This method is called K-Fold Cross-Validation, and it helps test a model on different parts of the dataset to ensure it's reliable.

In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given training data.

    Only the columns named in the notebook-level lists ``categorical`` and
    ``numerical`` are used as features.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; must contain the ``categorical`` and ``numerical`` columns.
    y_train : array-like
        Target values passed to ``LogisticRegression.fit``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted DictVectorizer and the fitted model.
    """
    feature_dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(C=C, max_iter=1000)

    # Learn the feature encoding and fit the model on the encoded matrix
    model.fit(dv.fit_transform(feature_dicts), y_train)

    return dv, model

In [None]:
dv, model = train(df_train, y_train, C=0.001)

In [None]:
def predict(df, dv, model):
    """Return the predicted probability of the positive class for each row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to score; must contain the notebook-level ``categorical`` and
        ``numerical`` columns.
    dv : fitted vectorizer
        Used via ``dv.transform`` only (it is not refitted here).
    model : fitted classifier
        Must provide ``predict_proba``.

    Returns
    -------
    numpy.ndarray
        Second column of ``model.predict_proba`` (index 1).
    """
    records = df[categorical + numerical].to_dict(orient='records')
    probabilities = model.predict_proba(dv.transform(records))
    return probabilities[:, 1]

In [None]:
y_pred = predict(df_val, dv, model)
y_pred

In [None]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits=10, shuffle=True, random_state=1) 

In [None]:
kfold.split(df_full_train)
# Output: <generator object _BaseKFold.split at 0x2838baf20>

In [None]:
train_idx, val_idx = next(kfold.split(df_full_train))
len(train_idx), len(val_idx)

In [None]:
len(df_full_train)

In [None]:
# We can use iloc to select a part of this dataframe
df_train = df_full_train.iloc[train_idx]
df_val = df_full_train.iloc[val_idx]

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 10-fold cross-validation over df_full_train; shuffle + fixed random_state
# make the folds reproducible
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
scores = []

for train_idx, val_idx in kfold.split(df_full_train):
    # Positional selection of the rows belonging to this fold
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    # df_full_train still contains 'churn', so the targets are read from it;
    # train()/predict() only use the categorical + numerical columns
    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train)
    y_pred = predict(df_val, dv, model)

    # Named 'fold_auc' rather than 'auc' so the 'auc' function imported from
    # sklearn.metrics in section 4.6 is not overwritten by a float
    fold_auc = roc_auc_score(y_val, y_pred)
    scores.append(fold_auc)

scores

In [None]:
!pip install tqdm

In [None]:
from tqdm.auto import tqdm

In [None]:
# Same 10-fold evaluation as before, now with a progress bar
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
scores = []

# kfold.split() returns a generator without a length, so tqdm is told the
# number of folds explicitly in order to show real progress
for train_idx, val_idx in tqdm(kfold.split(df_full_train), total=kfold.get_n_splits()):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]

    y_train = df_train.churn.values
    y_val = df_val.churn.values

    dv, model = train(df_train, y_train)
    y_pred = predict(df_val, dv, model)

    # Named 'fold_auc' rather than 'auc' so the 'auc' function imported from
    # sklearn.metrics in section 4.6 is not overwritten by a float
    fold_auc = roc_auc_score(y_val, y_pred)
    scores.append(fold_auc)

#### Getting the average prediction and the spread within predictions

In [None]:
print('%.3f +- %.3f' % (np.mean(scores), np.std(scores)))

#### Parameter Tuning

In [None]:
def train(df_train, y_train, C=1.0):
    """Fit a DictVectorizer and a logistic regression on the given training data.

    NOTE(review): this is a verbatim repeat of the ``train`` definition in
    section 4.7; it rebinds the same name to identical code and could be dropped.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows; only the notebook-level ``categorical`` and ``numerical``
        columns are used as features.
    y_train : array-like
        Target values passed to ``LogisticRegression.fit``.
    C : float, default 1.0
        Forwarded to ``LogisticRegression(C=...)``; this is the parameter tuned below.

    Returns
    -------
    tuple
        ``(dv, model)``: the fitted DictVectorizer and the fitted model.
    """
    dicts = df_train[categorical + numerical].to_dict(orient='records')
 
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)
 
    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
 
    return dv, model

In [None]:
dv, model = train(df_train, y_train, C=0.001)

In [None]:
from sklearn.model_selection import KFold

# The same 10 reproducible folds are used for every value of C
kfold = KFold(n_splits=10, shuffle=True, random_state=1)

for C in [0.001, 0.01, 0.1, 0.5, 1, 5, 10]:

    # AUC of each fold for the current C
    scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.churn.values
        y_val = df_val.churn.values

        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv, model)

        # Named 'fold_auc' rather than 'auc' so the 'auc' function imported from
        # sklearn.metrics in section 4.6 is not overwritten by a float
        fold_auc = roc_auc_score(y_val, y_pred)
        scores.append(fold_auc)

    # Mean and spread of the fold AUCs for this C
    print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))
 

In [None]:
scores

In [None]:
# Repeat the C search with 5 folds and a progress bar over the C values
n_splits = 5

for C in tqdm([0.001, 0.01, 0.1, 0.5, 1, 5, 10]):
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

    # AUC of each fold for the current C
    scores = []

    for train_idx, val_idx in kfold.split(df_full_train):
        df_train = df_full_train.iloc[train_idx]
        df_val = df_full_train.iloc[val_idx]

        y_train = df_train.churn.values
        y_val = df_val.churn.values

        dv, model = train(df_train, y_train, C=C)
        y_pred = predict(df_val, dv, model)

        # Named 'fold_auc' rather than 'auc' so the 'auc' function imported from
        # sklearn.metrics in section 4.6 is not overwritten by a float
        fold_auc = roc_auc_score(y_val, y_pred)
        scores.append(fold_auc)

    # Mean and spread of the fold AUCs for this C
    print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

In [None]:
# Final model: train on the full training data (train + validation) with C=1.0
# and evaluate once on the held-out test set.
# df_full_train still contains 'churn'; train() only uses the feature columns.
dv, model = train(df_full_train, df_full_train.churn.values, C=1.0)
y_pred = predict(df_test, dv, model)

# Named 'test_auc' rather than 'auc' so the 'auc' function imported from
# sklearn.metrics in section 4.6 is not overwritten by a float
test_auc = roc_auc_score(y_test, y_pred)
test_auc

## 4.8 Summary

* Metric - a single number that describes the performance of a model
* Accuracy - fraction of correct answers; sometimes misleading 
* Precision and recall are less misleading when we have class imbalance
* ROC Curve - a way to evaluate the performance at all thresholds; okay to use with imbalance
* K-Fold CV - more reliable estimate for performance (mean + std)

## 4.9 Explore more

* Check the precision and recall of the dummy classifier that always predicts "FALSE"
* F1 score = 2 * P * R / (P + R)
* Evaluate precision and recall at different thresholds, plot P vs R - this way you'll get the precision/recall curve (similar to ROC curve)
* Area under the PR curve is also a useful metric

Other projects:

* Calculate the metrics for datasets from the previous week