In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from os import listdir
from os.path import join, isdir

from jenkspy import jenks_breaks

from math import inf

import matplotlib
import matplotlib.pyplot as plt

# Global matplotlib settings (plt.rcParams is the same object as matplotlib.rcParams).
plt.rcParams.update({
    # PGF export: compile with pdflatex and do not take fonts from the rc settings.
    'pgf.texsystem': 'pdflatex',
    'pgf.rcfonts': False,
    # Let LaTeX render all text, using a serif family.
    'text.usetex': True,
    'font.family': 'serif',
    # Use an ASCII hyphen for negative tick labels instead of the Unicode minus sign.
    'axes.unicode_minus': False,
})

# Preprocessing

In [2]:
def get_csv(path):
    """Return the path of the CSV file stored in directory ``path``.

    Args:
        path: Directory expected to contain at least one ``.csv`` file.

    Returns:
        ``join(path, name)`` of the selected file, or ``None`` (after printing
        an error message) when ``path`` is not a directory or contains no
        ``.csv`` file. When several CSV files are present they are listed in
        sorted order and the user is prompted to choose one by index.
    """
    if not isdir(path):
        print('Error: {} is not a directory'.format(path))
        return None

    # listdir() yields entries in arbitrary order; sort so the numbered menu
    # below is stable between runs and machines.
    files = sorted(join(path, file) for file in listdir(path) if file.endswith('.csv'))

    if not files:
        print('Error: No CSV files in {}'.format(path))
        return None
    if len(files) == 1:
        return files[0]

    print('Warning: Multiple CSV files in {}'.format(path))
    print('Choose CSV file')
    for idx, file in enumerate(files):
        print('{}:\t{}'.format(idx, file))

    prompt = 'Enter Option [0-{}]: '.format(len(files) - 1)
    while True:
        answer = input(prompt)
        # Re-prompt instead of crashing on non-numeric input, and reject
        # out-of-range values (a negative number would otherwise silently
        # index from the end of the list).
        try:
            choice = int(answer)
        except ValueError:
            choice = -1
        if 0 <= choice < len(files):
            return files[choice]
        print('Invalid option: {}'.format(answer))

In [3]:
# Root folder of the cycling dataset; the 'train' and 'test' sub-folders used
# below are resolved relative to it.
data_dir = join('data', 'cycling')

In [4]:
# Locate and load the raw CSV of each split (train first, then test).
train_data, test_data = (
    pd.read_csv(get_csv(join(data_dir, split)))
    for split in ('train', 'test')
)

In [5]:
# Feature columns kept from the raw data.
columns = [
    'moving_time',
    'avg_speed',
    'max_speed',
    'elevation_gain',
    'avg_hr',
    'max_hr',
    'calories',
    'avg_cadence',
    'max_cadence',
]
# Human-readable labels, listed in the same order as `columns`.
names = [
    'Moving Time',
    'Avg Speed',
    'Max Speed',
    'Elevation Gain',
    'Avg HR',
    'Max HR',
    'Calories',
    'Avg Cadence',
    'Max Cadence',
]

# Column from which the class labels are derived, and its display label.
target, tar_name = 'avg_power', 'Avg Power'

In [6]:
# Split the target into two classes using Jenks natural breaks fitted on the
# training split only. The class count is passed positionally because newer
# jenkspy releases name this parameter `n_classes` instead of `nb_class`.
train_breaks = jenks_breaks(train_data[target], 2)
# Open both outer edges so values outside the training range still fall into
# the first or last class.
train_breaks[0] = -inf
train_breaks[-1] = inf
# digitize() returns 1-based bin indices; subtracting 1 yields labels 0 and 1.
# NOTE(review): with the default right=False, a sample exactly equal to the
# inner break receives label 1 -- confirm that is the intended side.
train_labels = np.ravel(np.digitize(train_data[target], train_breaks))-1
test_labels = np.ravel(np.digitize(test_data[target], train_breaks))-1

# Keep only the feature columns. This rebinds train_data/test_data without the
# target column, so this cell cannot be re-run without reloading the CSVs first.
train_data = train_data[columns]
test_data = test_data[columns]

In [7]:
# Persist the feature-only frames inside the same 'train' / 'test' folders
# that hold the source CSVs.
train_data.to_pickle(join(data_dir,'train','train_data.pkl'))
test_data.to_pickle(join(data_dir,'test','test_data.pkl'))

In [8]:
# Persist the class labels derived from the target column as .npy arrays,
# alongside the pickled feature frames.
np.save(join(data_dir,'train','train_labels.npy'), train_labels)
np.save(join(data_dir,'test','test_labels.npy'), test_labels)

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified split of the training set: 10% becomes the "value" subset and the
# remaining 90% the "eval" subset (both are later handed to DShap).
# random_state is fixed so the split -- and everything computed from it -- is
# reproducible across kernel restarts.
splitter = StratifiedShuffleSplit(n_splits=1, train_size=0.1, random_state=0)
value_idx, eval_idx = next(splitter.split(train_data, train_labels))

# split() yields positional indices, so select with .iloc rather than .loc.
# drop=True is essential: a plain reset_index() would insert the old row index
# as an extra 'index' column, which would then be treated as a feature when the
# frames are converted with to_numpy().
value_data = train_data.iloc[value_idx].reset_index(drop=True)
eval_data = train_data.iloc[eval_idx].reset_index(drop=True)

value_labels = train_labels[value_idx]
eval_labels = train_labels[eval_idx]

In [10]:
# Persist the value/eval feature subsets; both are stored under the 'train'
# folder because they are carved out of the training split.
value_data.to_pickle(join(data_dir, 'train', 'value_data.pkl'))
eval_data.to_pickle(join(data_dir, 'train', 'eval_data.pkl'))

In [11]:
# Persist the labels that correspond row-for-row to value_data / eval_data.
np.save(join(data_dir, 'train', 'value_labels.npy'), value_labels)
np.save(join(data_dir, 'train', 'eval_labels.npy'), eval_labels)

# LOOV (leave-one-out and TMC-Shapley data valuation)

In [12]:
# Reload the feature frames written by the preprocessing section.
train_data, test_data = (
    pd.read_pickle(join(data_dir, folder, filename))
    for folder, filename in (('train', 'train_data.pkl'), ('test', 'test_data.pkl'))
)

In [13]:
# Reload the class labels written by the preprocessing section.
train_labels, test_labels = (
    np.load(join(data_dir, folder, filename))
    for folder, filename in (('train', 'train_labels.npy'), ('test', 'test_labels.npy'))
)

In [14]:
# Reload the value/eval feature subsets, both stored under the 'train' folder.
value_data, eval_data = (
    pd.read_pickle(join(data_dir, 'train', filename))
    for filename in ('value_data.pkl', 'eval_data.pkl')
)

In [15]:
# Reload the labels belonging to the value/eval subsets.
value_labels, eval_labels = (
    np.load(join(data_dir, 'train', filename))
    for filename in ('value_labels.npy', 'eval_labels.npy')
)

In [16]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# executing code), so edits to the DShap module are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): DShap is presumably a project-local module -- confirm it is on
# the import path of this notebook.
from DShap import DShap

In [None]:
import pickle as pck
from os import makedirs

model_families = ['NB', 'KNN']
metrics = ['accuracy', 'auc']

# Run the valuation for every (model family, metric) combination and store the
# plots and raw values of each combination in its own output folder.
for model_family in model_families:
    for metric in metrics:
        out_dir = join('output', 'cycling', model_family, metric)
        plot_dir = join(out_dir, 'plots')

        # Three runs that differ only in their seed; they share one directory
        # and are merged afterwards through the last DShap instance.
        for seed in range(1, 4):
            # NOTE(review): 1162 is passed positionally right after the eval
            # labels -- presumably the number of evaluation points DShap should
            # use; confirm against DShap's signature.
            dshap = DShap(value_data.to_numpy(), value_labels, eval_data.to_numpy(), eval_labels, 1162,
                          directory=out_dir, model_family=model_family, metric=metric, seed=seed)
            dshap.run(100, 0.1)
        dshap.merge_results()

        # savefig() fails if the target folder is missing, so make sure it exists.
        makedirs(plot_dir, exist_ok=True)

        # Same plot twice, once per ordering ('d' -> descend.*, 'a' -> ascend.*),
        # each exported as PGF and PDF.
        for order, stem in (('d', 'descend'), ('a', 'ascend')):
            fig = dshap.performance_plots([dshap.values_tmc, dshap.vals_loo], num_plot_markers=20,
                                          sources=None, order=order)
            plt.savefig(join(plot_dir, stem + '.pgf'), bbox_inches='tight')
            plt.savefig(join(plot_dir, stem + '.pdf'), bbox_inches='tight')
            # Close the current figure so figures do not pile up in memory
            # across the model/metric loop.
            plt.close()

        values = {
            'loo': dshap.vals_loo,
            'shapley': dshap.values_tmc,
        }

        # Context manager guarantees the file is flushed and closed.
        with open(join(out_dir, 'values.pkl'), 'wb') as handle:
            pck.dump(values, handle)

Starting LOO score calculations!


  0%|          | 0/100 [00:00<?, ?it/s]

LOO values calculated!


100%|██████████| 100/100 [00:09<00:00, 10.91it/s]
100%|██████████| 100/100 [00:07<00:00, 13.53it/s]
100%|██████████| 100/100 [00:09<00:00, 10.44it/s]
100%|██████████| 100/100 [00:08<00:00, 11.94it/s]
100%|██████████| 100/100 [00:07<00:00, 12.94it/s]
100%|██████████| 100/100 [00:09<00:00, 10.30it/s]
100%|██████████| 100/100 [00:07<00:00, 12.64it/s]
100%|██████████| 100/100 [00:08<00:00, 11.48it/s]
100%|██████████| 100/100 [00:07<00:00, 12.67it/s]
100%|██████████| 100/100 [00:09<00:00, 10.34it/s]
100%|██████████| 100/100 [00:08<00:00, 11.90it/s]
100%|██████████| 100/100 [00:08<00:00, 11.56it/s]
100%|██████████| 100/100 [00:10<00:00,  9.82it/s]
100%|██████████| 100/100 [00:07<00:00, 13.21it/s]
100%|██████████| 100/100 [00:07<00:00, 12.62it/s]
100%|██████████| 100/100 [00:07<00:00, 13.08it/s]
100%|██████████| 100/100 [00:08<00:00, 12.01it/s]
100%|██████████| 100/100 [00:08<00:00, 11.73it/s]
100%|██████████| 100/100 [00:08<00:00, 11.92it/s]
100%|██████████| 100/100 [00:09<00:00, 11.02it/s]


LOO values calculated!


100%|██████████| 100/100 [00:08<00:00, 11.53it/s]
100%|██████████| 100/100 [00:07<00:00, 13.64it/s]
100%|██████████| 100/100 [00:08<00:00, 12.29it/s]
100%|██████████| 100/100 [00:08<00:00, 11.72it/s]
100%|██████████| 100/100 [00:08<00:00, 12.34it/s]
100%|██████████| 100/100 [00:08<00:00, 12.11it/s]
100%|██████████| 100/100 [00:08<00:00, 11.65it/s]
100%|██████████| 100/100 [00:08<00:00, 12.18it/s]
100%|██████████| 100/100 [00:08<00:00, 12.49it/s]
 65%|██████▌   | 65/100 [00:04<00:03, 10.98it/s]