In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from os import listdir
from os.path import join, isdir

from jenkspy import jenks_breaks

from math import inf

import matplotlib
import matplotlib.pyplot as plt

matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    'font.family': 'serif',
    'text.usetex': True,
    'pgf.rcfonts': False,
    'axes.unicode_minus': False,
})

# Preprocessing

In [2]:
def get_csv(path):
    if not isdir(path):
        print('Error: {} is not a directory'.format(path))
        return
    files = [join(path,file) for file in listdir(path) if file[-4:] == '.csv']
    if len(files) == 0:
        print('Error: No CSV files in {}'.format(path))
        return
    elif len(files) > 1:
        print('Warning: Multiple CSV files in {}'.format(path))
        print('Choose CSV file')
        for idx,file in enumerate(files):
            print('{}:\t{}'.format(idx, file))
        return files[int(input('Enter Option [0-{}]: '.format(len(files)-1)))]    
        
    
    return files[0]     

In [3]:
data_dir = join('data', 'cycling')

In [4]:
train_data = pd.read_csv(get_csv(join(data_dir,'train')))
test_data = pd.read_csv(get_csv(join(data_dir,'test')))

In [5]:
columns = ['moving_time','avg_speed','max_speed','elevation_gain','avg_hr','max_hr','calories','avg_cadence','max_cadence']
names = ['Moving Time','Avg Speed','Max Speed','Elevation Gain','Avg HR','Max HR','Calories','Avg Cadence','Max Cadence']

target = 'avg_power'
tar_name = 'Avg Power'

In [6]:
train_breaks = jenks_breaks(train_data[target], nb_class=2)
train_breaks[0] = -inf
train_breaks[-1] = inf
train_labels = np.ravel(np.digitize(train_data[target], train_breaks))-1
test_labels = np.ravel(np.digitize(test_data[target], train_breaks))-1

train_data = train_data[columns]
test_data = test_data[columns]

In [7]:
train_data.to_pickle(join(data_dir,'train','train_data.pkl'))
test_data.to_pickle(join(data_dir,'test','test_data.pkl'))

In [8]:
np.save(join(data_dir,'train','train_labels.npy'), train_labels)
np.save(join(data_dir,'test','test_labels.npy'), test_labels)

In [9]:
from sklearn.model_selection import StratifiedShuffleSplit

value_idx, eval_idx = list(StratifiedShuffleSplit(n_splits=1, train_size=0.1).split(train_data, train_labels))[0]

value_data = train_data.loc[value_idx,:]
eval_data = train_data.loc[eval_idx,:]

value_labels = train_labels[value_idx]
eval_labels = train_labels[eval_idx]

In [10]:
value_data.to_pickle(join(data_dir, 'train', 'value_data.pkl'))
eval_data.to_pickle(join(data_dir, 'train', 'eval_data.pkl'))

In [11]:
np.save(join(data_dir, 'train', 'value_labels.npy'), value_labels)
np.save(join(data_dir, 'train', 'eval_labels.npy'), eval_labels)

# LOOV

In [12]:
train_data = pd.read_pickle(join(data_dir,'train','train_data.pkl'))
test_data = pd.read_pickle(join(data_dir,'test','test_data.pkl'))

In [13]:
train_labels = np.load(join(data_dir,'train','train_labels.npy'))
test_labels = np.load(join(data_dir,'test','test_labels.npy'))

In [14]:
value_data = pd.read_pickle(join(data_dir, 'train', 'value_data.pkl'))
eval_data = pd.read_pickle(join(data_dir, 'train', 'eval_data.pkl'))

In [15]:
value_labels = np.load(join(data_dir, 'train', 'value_labels.npy'))
eval_labels = np.load(join(data_dir, 'train', 'eval_labels.npy'))

In [16]:
%load_ext autoreload
%autoreload 2
from DShap import DShap

In [17]:
import pickle as pck

model_families = ['NB', 'LinearSVC']
metrics = ['accuracy', 'auc']

for model_family in model_families:
    for metric in metrics:
        if model_family == 'LinearSVC' and metric == 'auc':
            continue
        for seed in range(1,4):
            dshap = DShap(value_data.to_numpy(), value_labels, eval_data.to_numpy(), eval_labels, 1162, 
                          directory=join('output','cycling',model_family,metric),
                          model_family=model_family, metric=metric, seed=seed, n_neighbors=3)
            dshap.run(100, 0.1)
        dshap.merge_results()
        
        fig = dshap.performance_plots([dshap.values_tmc, dshap.vals_loo], num_plot_markers=20,
                                      sources=None, order='d')
        plt.savefig(join('output','cycling',model_family,metric,'plots','descend.pdf'), bbox_inches='tight')
        
        fig = dshap.performance_plots([dshap.values_tmc, dshap.vals_loo], num_plot_markers=20,
                                      sources=None, order='a')
        plt.savefig(join('output','cycling',model_family,metric,'plots','ascend.pdf'), bbox_inches='tight')
        
        values = {}
        values['loo'] = dshap.vals_loo
        values['shapley'] = dshap.values_tmc
        
        pck.dump(values, open(join('output','cycling',model_family,metric,'values.pkl'), 'wb'))

  0%|          | 0/100 [00:00<?, ?it/s]

LOO values calculated!


100%|██████████| 100/100 [00:10<00:00,  9.45it/s]
100%|██████████| 100/100 [00:11<00:00,  8.87it/s]
100%|██████████| 100/100 [00:11<00:00,  8.38it/s]
100%|██████████| 100/100 [00:10<00:00,  9.68it/s]
100%|██████████| 100/100 [00:09<00:00, 10.20it/s]
100%|██████████| 100/100 [00:14<00:00,  6.93it/s]
100%|██████████| 100/100 [00:12<00:00,  8.24it/s]
100%|██████████| 100/100 [00:11<00:00,  8.34it/s]
100%|██████████| 100/100 [00:11<00:00,  8.72it/s]
100%|██████████| 100/100 [00:11<00:00,  8.76it/s]
100%|██████████| 100/100 [00:12<00:00,  8.17it/s]
100%|██████████| 100/100 [00:12<00:00,  8.27it/s]
100%|██████████| 100/100 [00:13<00:00,  7.44it/s]
100%|██████████| 100/100 [00:12<00:00,  7.73it/s]
100%|██████████| 100/100 [00:10<00:00,  9.34it/s]
100%|██████████| 100/100 [00:10<00:00,  9.24it/s]
100%|██████████| 100/100 [00:10<00:00,  9.74it/s]
100%|██████████| 100/100 [00:11<00:00,  8.42it/s]
100%|██████████| 100/100 [00:11<00:00,  8.91it/s]
100%|██████████| 100/100 [00:12<00:00,  7.99it/s]


LOO values calculated!


100%|██████████| 100/100 [00:10<00:00,  9.35it/s]
100%|██████████| 100/100 [00:11<00:00,  8.81it/s]
100%|██████████| 100/100 [00:10<00:00,  9.52it/s]
100%|██████████| 100/100 [00:11<00:00,  8.82it/s]
100%|██████████| 100/100 [00:11<00:00,  9.00it/s]
100%|██████████| 100/100 [00:11<00:00,  8.50it/s]
100%|██████████| 100/100 [00:11<00:00,  8.72it/s]
100%|██████████| 100/100 [00:09<00:00, 10.09it/s]
100%|██████████| 100/100 [00:10<00:00,  9.34it/s]
100%|██████████| 100/100 [00:11<00:00,  8.51it/s]
100%|██████████| 100/100 [00:10<00:00,  9.58it/s]
100%|██████████| 100/100 [00:11<00:00,  8.93it/s]
100%|██████████| 100/100 [00:11<00:00,  8.65it/s]
100%|██████████| 100/100 [00:11<00:00,  8.65it/s]
100%|██████████| 100/100 [00:10<00:00,  9.58it/s]
100%|██████████| 100/100 [00:10<00:00,  9.79it/s]
100%|██████████| 100/100 [00:10<00:00,  9.40it/s]
100%|██████████| 100/100 [00:10<00:00,  9.68it/s]
  1%|          | 1/100 [00:00<00:10,  9.23it/s]

LOO values calculated!


100%|██████████| 100/100 [00:09<00:00, 10.00it/s]
100%|██████████| 100/100 [00:10<00:00,  9.21it/s]
100%|██████████| 100/100 [00:09<00:00, 10.25it/s]
100%|██████████| 100/100 [00:10<00:00,  9.51it/s]
100%|██████████| 100/100 [00:09<00:00, 10.78it/s]
100%|██████████| 100/100 [00:10<00:00,  9.83it/s]
100%|██████████| 100/100 [00:09<00:00, 10.17it/s]
100%|██████████| 100/100 [00:10<00:00,  9.37it/s]
100%|██████████| 100/100 [00:10<00:00,  9.89it/s]
100%|██████████| 100/100 [00:10<00:00,  9.69it/s]
100%|██████████| 100/100 [00:10<00:00,  9.18it/s]
100%|██████████| 100/100 [00:09<00:00, 10.07it/s]
100%|██████████| 100/100 [00:09<00:00, 10.03it/s]
100%|██████████| 100/100 [00:11<00:00,  8.56it/s]
100%|██████████| 100/100 [00:10<00:00,  9.56it/s]


output/cycling/NB/accuracy/mem_tmc_0000.pkl
output/cycling/NB/accuracy/mem_tmc_0001.pkl
output/cycling/NB/accuracy/mem_tmc_0002.pkl
output/cycling/NB/accuracy/mem_tmc_0003.pkl
LOO values calculated!


100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:45<00:00,  2.22it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:48<00:00,  2.08it/s]
100%|██████████| 100/100 [00:44<00:00,  2.26it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

LOO values calculated!


100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:44<00:00,  2.24it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
100%|██████████| 100/100 [00:47<00:00,  2.10it/s]
100%|██████████| 100/100 [00:44<00:00,  2.23it/s]
100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
100%|██████████| 100/100 [00:45<00:00,  2.17it/s]
100%|██████████| 100/100 [00:44<00:00,  2.24it/s]
100%|██████████| 100/100 [00:46<00:00,  2.15it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
  0%|          | 0/100 [00:00<?, ?it/s]

LOO values calculated!


100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:44<00:00,  2.27it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:46<00:00,  2.14it/s]
100%|██████████| 100/100 [00:45<00:00,  2.19it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:44<00:00,  2.27it/s]
100%|██████████| 100/100 [00:45<00:00,  2.21it/s]
100%|██████████| 100/100 [00:46<00:00,  2.17it/s]
100%|██████████| 100/100 [00:46<00:00,  2.16it/s]
100%|██████████| 100/100 [00:43<00:00,  2.30it/s]
100%|██████████| 100/100 [00:47<00:00,  2.11it/s]
100%|██████████| 100/100 [00:44<00:00,  2.25it/s]
100%|██████████| 100/100 [00:45<00:00,  2.18it/s]
100%|██████████| 100/100 [00:43<00:00,  2.31it/s]


output/cycling/NB/auc/mem_tmc_0000.pkl
output/cycling/NB/auc/mem_tmc_0001.pkl
output/cycling/NB/auc/mem_tmc_0002.pkl
output/cycling/NB/auc/mem_tmc_0003.pkl
output/cycling/NB/auc/mem_tmc_0004.pkl
Starting LOO score calculations!


  0%|          | 0/100 [00:00<?, ?it/s]

LOO values calculated!


100%|██████████| 100/100 [02:05<00:00,  1.26s/it]
100%|██████████| 100/100 [02:11<00:00,  1.31s/it]
100%|██████████| 100/100 [02:08<00:00,  1.28s/it]
100%|██████████| 100/100 [02:10<00:00,  1.30s/it]
100%|██████████| 100/100 [02:10<00:00,  1.31s/it]
100%|██████████| 100/100 [02:12<00:00,  1.33s/it]
100%|██████████| 100/100 [02:11<00:00,  1.32s/it]
100%|██████████| 100/100 [02:13<00:00,  1.33s/it]
100%|██████████| 100/100 [02:10<00:00,  1.31s/it]
100%|██████████| 100/100 [02:10<00:00,  1.31s/it]
100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
100%|██████████| 100/100 [02:13<00:00,  1.33s/it]
100%|██████████| 100/100 [02:10<00:00,  1.30s/it]
100%|██████████| 100/100 [02:09<00:00,  1.30s/it]
100%|██████████| 100/100 [02:09<00:00,  1.29s/it]
100%|██████████| 100/100 [02:10<00:00,  1.30s/it]
100%|██████████| 100/100 [02:45<00:00,  1.66s/it]
100%|██████████| 100/100 [02:07<00:00,  1.28s/it]
100%|██████████| 100/100 [02:08<00:00,  1.28s/it]
100%|██████████| 100/100 [02:11<00:00,  1.32s/it]


LOO values calculated!


100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:16<00:00,  1.37s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]
100%|██████████| 100/100 [02:11<00:00,  1.32s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:16<00:00,  1.37s/it]
100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:15<00:00,  1.35s/it]
100%|██████████| 100/100 [02:14<00:00,  1.34s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]
100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
100%|██████████| 100/100 [02:12<00:00,  1.32s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]


LOO values calculated!


100%|██████████| 100/100 [02:17<00:00,  1.37s/it]
100%|██████████| 100/100 [02:13<00:00,  1.34s/it]
100%|██████████| 100/100 [02:11<00:00,  1.32s/it]
100%|██████████| 100/100 [02:13<00:00,  1.33s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:16<00:00,  1.37s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:15<00:00,  1.35s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]
100%|██████████| 100/100 [02:14<00:00,  1.35s/it]
100%|██████████| 100/100 [02:17<00:00,  1.37s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:14<00:00,  1.34s/it]
100%|██████████| 100/100 [02:12<00:00,  1.33s/it]
100%|██████████| 100/100 [02:16<00:00,  1.36s/it]
100%|██████████| 100/100 [02:12<00:00,  1.33s/it]
100%|██████████| 100/100 [02:15<00:00,  1.36s/it]


output/cycling/LinearSVC/accuracy/mem_tmc_0000.pkl
output/cycling/LinearSVC/accuracy/mem_tmc_0001.pkl
output/cycling/LinearSVC/accuracy/mem_tmc_0002.pkl


In [18]:
from scipy import stats

loo_table = []
shp_table = []

for model_family in model_families:
    for metric in metrics:
        if model_family == 'LinearSVC' and metric == 'auc':
            continue
        values = pck.load(open(join('output','cycling',model_family,metric,'values.pkl'), 'rb'))
        loo_stats = stats.describe(values['loo'])
        shp_stats = stats.describe(values['shapley'])
        loo_table.append(['cycling', model_family, metric, loo_stats[1][0], loo_stats[1][1], loo_stats[2], loo_stats[3]])
        shp_table.append(['cycling', model_family, metric, shp_stats[1][0], shp_stats[1][1], shp_stats[2], shp_stats[3]])

loo_table = pd.DataFrame(loo_table, columns=['Data', 'Algorithm', 'Metric', 'Min', 'Max', 'Mean', 'Variance'])
shp_table = pd.DataFrame(shp_table, columns=['Data', 'Algorithm', 'Metric', 'Min', 'Max', 'Mean', 'Variance'])

In [19]:
loo_table.to_latex(index=False, float_format="%.2e")

'\\begin{tabular}{lllrrrr}\n\\toprule\n    Data &  Algorithm &    Metric &       Min &      Max &      Mean &  Variance \\\\\n\\midrule\n cycling &         NB &  accuracy & -8.61e-03 & 4.30e-03 & -9.77e-04 &  1.65e-06 \\\\\n cycling &         NB &       auc & -9.94e-03 & 5.25e-03 &  5.57e-04 &  7.44e-06 \\\\\n cycling &  LinearSVC &  accuracy & -2.75e-02 & 3.36e-01 &  7.60e-02 &  6.85e-03 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [20]:
shp_table.to_latex(index=False, float_format="%.2e")

'\\begin{tabular}{lllrrrr}\n\\toprule\n    Data &  Algorithm &    Metric &       Min &      Max &      Mean &  Variance \\\\\n\\midrule\n cycling &         NB &  accuracy & -1.22e-02 & 5.79e-03 &  8.85e-04 &  8.74e-06 \\\\\n cycling &         NB &       auc & -2.60e-02 & 1.19e-02 & -3.49e-04 &  5.36e-05 \\\\\n cycling &  LinearSVC &  accuracy & -1.72e-02 & 5.26e-03 &  6.16e-04 &  1.35e-05 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [21]:
for model_family in model_families:
    for metric in metrics:
        if model_family == 'LinearSVC' and metric == 'auc':
            continue
        values = pck.load(open(join('output','cycling',model_family,metric,'values.pkl'), 'rb'))['shapley']
        plt.figure()
        plt.style.use('ggplot')
        plt.hist(values, bins=10)
        plt.savefig(join('output','cycling',model_family,metric,'plots','hist.pdf'))

In [180]:
values = pck.load(open(join('output','cycling','NB','accuracy','values.pkl'), 'rb'))['shapley'].reshape(-1,1)

In [181]:
athletes = pd.read_csv(join(data_dir, 'train', 'train_activities.csv'))['athlete_id'].to_numpy()[value_data.index].reshape(-1,1)

In [182]:
df = pd.DataFrame(np.concatenate((athletes, values), axis=1), columns=['Athlete', 'Values']).astype({'Athlete' : 'int64'})

In [183]:
grouped = df.groupby('Athlete')

In [184]:
means = grouped.min()

In [185]:
grouped.describe()['Values'][['count', 'min', 'max', 'mean', 'std']].to_latex(float_format="%.2e")

'\\begin{tabular}{lrrrrr}\n\\toprule\n{} &    count &       min &      max &     mean &      std \\\\\nAthlete &          &           &          &          &          \\\\\n\\midrule\n1       & 4.50e+01 & -9.11e-03 & 5.56e-03 & 1.07e-03 & 2.68e-03 \\\\\n2       & 3.50e+01 & -1.22e-02 & 4.23e-03 & 5.20e-04 & 3.31e-03 \\\\\n3       & 1.20e+01 & -5.79e-03 & 4.91e-03 & 8.38e-04 & 3.42e-03 \\\\\n4       & 4.70e+01 & -1.03e-02 & 4.81e-03 & 1.09e-03 & 3.05e-03 \\\\\n5       & 3.20e+01 & -5.04e-03 & 4.35e-03 & 1.13e-03 & 2.20e-03 \\\\\n6       & 2.20e+01 & -7.63e-03 & 5.79e-03 & 3.12e-04 & 3.56e-03 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [186]:
for group in grouped:
    plt.figure()
    plt.hist(group[1]['Values'])
    plt.savefig(join('output', 'cycling', 'athlete_' + str(group[0]) + '.pdf'), bbox_inches='tight')
    plt.close()

In [187]:
X_all = value_data.to_numpy()
y_all = value_labels

X_test = eval_data.to_numpy()[:-1162]
y_test = eval_labels[:-1162]

In [188]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()

scores = []
all_score = 100*model.fit(X_all, y_all).score(X_test, y_test)

for group in grouped:
    athlete = group[0]
    X_out = value_data.to_numpy()[athletes.reshape(-1) != athlete, :]
    y_out = value_labels[athletes.reshape(-1) != athlete]
    
    out_score = 100*model.fit(X_out, y_out).score(X_test, y_test)
    scores.append([out_score, out_score - all_score])

scores = pd.DataFrame(scores, columns=['Accuracy (%)', 'Change (%)'], index=[group[0] for group in grouped])
scores.index.name = 'Athlete'

In [189]:
scores.to_latex(float_format="%.2f")

'\\begin{tabular}{lrr}\n\\toprule\n{} &  Accuracy (\\%) &  Change (\\%) \\\\\nAthlete &               &             \\\\\n\\midrule\n1       &         76.12 &        0.17 \\\\\n2       &         72.16 &       -3.78 \\\\\n3       &         76.12 &        0.17 \\\\\n4       &         75.09 &       -0.86 \\\\\n5       &         75.60 &       -0.34 \\\\\n6       &         75.77 &       -0.17 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [178]:
scores

Unnamed: 0_level_0,Accuracy (%),Change (%)
Athlete,Unnamed: 1_level_1,Unnamed: 2_level_1
1,76.116838,0.171821
2,72.164948,-3.780069
3,76.116838,0.171821
4,75.085911,-0.859107
5,75.601375,-0.343643
6,75.773196,-0.171821
