In [None]:
# hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# default_exp k_fold_validation

# export
import sys
from random import random
import numpy as np
import pandas as pd
import sklearn
import logging
from sklearn.model_selection import KFold

from job_offer_classifier.pipeline_classifier import Pipeline
import tensorflow as tf

In [None]:
# hide
#logging config
# Root logger: "<time> <LEVEL>: <message>" lines at INFO level and above
# (the `%I` in `datefmt` is the 12-hour-clock hour).
logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s', level=logging.INFO, datefmt='%I:%M:%S')
# Restrict TensorFlow's own (v1-compat) logger to ERROR messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# K Fold Validation 
> Adds K-fold validation to the pipeline classifier.

To assess the performance of the model, the sklearn K-fold validation method is incorporated. After running the *k-fold* method, the averaged scores are computed.

## Sklearn K Fold Validation 

In [None]:
# export 
def k_fold_validation(X, n_splits=4, shuffle=False, random_state=None):
    '''Yield one `(train, test)` pair of row subsets of `X` per fold.

    Row positions come from sklearn's `KFold`; the rows are then selected
    positionally through `X.iloc`.

    Parameters
    ----------
    X : object exposing `.iloc` (e.g. a pandas DataFrame)
        Data to split.
    n_splits : int, default 4
        Number of folds.
    shuffle : bool, default False
        Forwarded to `KFold`. The default keeps the previous behaviour
        (folds taken in the stored row order); pass `True` when the rows
        of `X` are stored in a meaningful order.
    random_state : int or None, default None
        Forwarded to `KFold`. Leave it as `None` unless `shuffle=True`.

    Yields
    ------
    tuple
        `(X.iloc[train_index], X.iloc[test_index])` for each fold.
    '''
    # The former `kf.get_n_splits(X)` call was dropped: its result was unused.
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    for train_index, test_index in kf.split(X):
        yield X.iloc[train_index], X.iloc[test_index]

Check that, with two folds, the train and test sets are of equal size

In [None]:
# 100 rows split into 2 folds: every train part must be as long as its test part.
assert all(
    len(train_part) == len(test_part)
    for train_part, test_part in k_fold_validation(
        pd.DataFrame(range(100)), n_splits=2
    )
)

## Average over scores

In [None]:
# export 
def score_averages(**k_fold_evaluations):
    '''Average every score of every split over the given folds.

    Parameters
    ----------
    **k_fold_evaluations : dict
        One keyword per fold. Each value maps a split name (e.g. 'train')
        to a dict of `{score_name: numeric value}`. The splits and score
        names of the first fold define what is averaged; every other fold
        must provide the same entries.

    Returns
    -------
    dict
        `{split: {score_name: mean of that score over all folds}}`.

    Raises
    ------
    ValueError
        If no fold is given.
    KeyError
        If a fold lacks a split or score present in the first fold.
    '''
    if not k_fold_evaluations:
        raise ValueError('score_averages needs at least one fold evaluation')

    folds = list(k_fold_evaluations.values())
    n_folds = len(folds)
    reference_fold = folds[0]
    return {
        split: {
            # Score names are read per split, so splits may report
            # different sets of metrics.
            score_name: sum(fold[split][score_name] for fold in folds) / n_folds
            for score_name in split_scores
        }
        for split, split_scores in reference_fold.items()
    }

Consider the case of two folds in a train set with 'acc' and 'f1' scores

In [None]:
# Two folds of a single 'train' split, each with random 'acc' and 'f1' scores.
rnd1, rnd2, rnd3, rnd4 = [random() for _ in range(4)]
fold1 = {'train': {'acc': rnd1, 'f1': rnd2}}
fold2 = {'train': {'acc': rnd3, 'f1': rnd4}}
avg_acc, avg_f1 = (rnd1 + rnd3) / 2, (rnd2 + rnd4) / 2

train_score_avgs = {'train': {'acc': avg_acc, 'f1': avg_f1}}

computed_avgs = score_averages(fold1=fold1, fold2=fold2)

# Same splits and score names, and values equal up to floating-point tolerance:
# exact `==` on floats would tie this check to the summation order used
# inside `score_averages`.
assert computed_avgs.keys() == train_score_avgs.keys()
assert computed_avgs['train'].keys() == train_score_avgs['train'].keys()
assert all(
    np.isclose(computed_avgs['train'][score_name], expected)
    for score_name, expected in train_score_avgs['train'].items()
)

In [None]:
# export

class KFoldPipeline(Pipeline):
    '''K fold validation over the model built in `Pipeline` class
    '''
    def __init__(self, dataset_file, n_splits=4):
        # Number of folds used by `k_fold_validation`.
        self.n_splits = n_splits
        # Evaluation of each fold, keyed by the fold number as a string ('1', '2', ...).
        self.k_fold_evaluations = {}
        # Averaged evaluation; stays None until `k_fold_validation` has run.
        self.avg_evaluation = None
        Pipeline.__init__(self, src_file=dataset_file)

    def k_fold_validation(self):
        ''' Implements the `pipeline` method for each fold.
            The averaged score is stored in `avg_evaluation`

            Each fold's `evaluation` is kept in `k_fold_evaluations`.
            Results from an earlier call are discarded first.
        '''
        # Start from a fresh dict: otherwise a previous run with more folds
        # would leave stale entries that get averaged in below.
        self.k_fold_evaluations = {}

        splits = k_fold_validation(self.data, n_splits=self.n_splits)
        for fold_number, (train, test) in enumerate(splits, start=1):
            self.dfs = {'train': train, 'test': test}
            self.pipeline()
            self.k_fold_evaluations[str(fold_number)] = self.evaluation

            # Arguments are passed to `logging` so the message is only
            # formatted when the record is actually emitted.
            logging.info('fold %s has finished...', fold_number)
            for key in ('accuracy', 'f1_score'):
                logging.info(
                    'The %s score for the training set in fold %s is %s',
                    key, fold_number, self.evaluation['train'][key]
                )

        self.avg_evaluation = score_averages(**self.k_fold_evaluations)

In [None]:
# NOTE(review): the star import pulls every public `nbdev.showdoc` name into
# the namespace; only `show_doc` is used in this cell.
from nbdev.showdoc import *
show_doc(KFoldPipeline.k_fold_validation)

<h4 id="KFoldPipeline.k_fold_validation" class="doc_header"><code>KFoldPipeline.k_fold_validation</code><a href="__main__.py#L10" class="source_link" style="float:right">[source]</a></h4>

> <code>KFoldPipeline.k_fold_validation</code>()

Implements the `pipeline` method for each fold. 
The averaged score is stored in `avg_evaluation`

*The case $k=2$*  
This case is used to check the pipeline (through the info logging)

In [None]:
# Quick check of the pipeline wiring: 2 folds and only 100 training steps.
# `train_steps` is presumably read by `Pipeline` during training (defined
# outside this notebook) — verify against that class.
kfp = KFoldPipeline(dataset_file='../data/interim/payloads.csv',n_splits=2)
kfp.train_steps = 100
kfp.k_fold_validation()

08:58:14 INFO: Using /tmp/tfhub_modules to cache modules.
08:58:26 INFO: fold 1 has finished...
08:58:26 INFO: The accuracy score for the training set in fold 1 is 1.0
08:58:26 INFO: The f1_score score for the training set in fold 1 is 1.0
08:58:38 INFO: fold 2 has finished...
08:58:38 INFO: The accuracy score for the training set in fold 2 is 0.8296703
08:58:38 INFO: The f1_score score for the training set in fold 2 is 0.8724279570683305


*The case $k=5$*  
This case represents the actual assessment of the model performance 

In [None]:
# Full assessment run: 5 folds with 5000 training steps per fold.
kfp = KFoldPipeline(dataset_file='../data/interim/payloads.csv',n_splits=5)
kfp.train_steps = 5000 #default
kfp.k_fold_validation()

09:17:30 INFO: fold 1 has finished...
09:17:30 INFO: The accuracy score for the training set in fold 1 is 0.99655175
09:17:30 INFO: The f1_score score for the training set in fold 1 is 0.9979633420684656
09:18:06 INFO: fold 2 has finished...
09:18:06 INFO: The accuracy score for the training set in fold 2 is 0.98275864
09:18:06 INFO: The f1_score score for the training set in fold 2 is 0.9905482064335733
09:18:43 INFO: fold 3 has finished...
09:18:43 INFO: The accuracy score for the training set in fold 3 is 0.9862069
09:18:43 INFO: The f1_score score for the training set in fold 3 is 0.9908257247813479
09:19:24 INFO: fold 4 has finished...
09:19:24 INFO: The accuracy score for the training set in fold 4 is 0.9862543
09:19:24 INFO: The f1_score score for the training set in fold 4 is 0.9908675778009323
09:20:03 INFO: fold 5 has finished...
09:20:03 INFO: The accuracy score for the training set in fold 5 is 0.9862543
09:20:03 INFO: The f1_score score for the training set in fold 5 is 0.

The averaged evaluation is stored in the `avg_evaluation` attribute

In [None]:
# Scores of the 'train' split, averaged over the folds of the run above.
kfp.avg_evaluation['train']

{'accuracy': 0.9876051664352417,
 'accuracy_baseline': 0.7989667177200317,
 'auc': 0.9924227833747864,
 'auc_precision_recall': 0.9978546500205994,
 'average_loss': 0.0626331850886345,
 'label/mean': 0.7989667177200317,
 'loss': 0.05038395039737224,
 'precision': 0.9872474074363708,
 'prediction/mean': 0.799962329864502,
 'recall': 0.9972434759140014,
 'global_step': 5000.0,
 'f1_score': 0.9922144857770503}

In [None]:
# Scores of the 'test' split, averaged over the folds of the run above.
# NOTE(review): in the recorded output the averaged test `auc` (~0.37) is far
# below the other test metrics — worth investigating (e.g. folds with a
# skewed label balance).
kfp.avg_evaluation['test']

{'accuracy': 0.9259893417358398,
 'accuracy_baseline': 0.8465753316879272,
 'auc': 0.36761903762817383,
 'auc_precision_recall': 0.9573580622673035,
 'average_loss': 0.1822646100074053,
 'label/mean': 0.7999999940395355,
 'loss': 0.1822646100074053,
 'precision': 0.9399641513824463,
 'prediction/mean': 0.7845720887184143,
 'recall': 0.920714282989502,
 'global_step': 5000.0,
 'f1_score': 0.9279590007548488}

In [None]:
# hide
# Convert the notebook cells flagged for export into the library module.
from nbdev.export import notebook2script
notebook2script()