In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%run ../../config/initialize_nospark.ipynb

import copy
import json
import shutil

In [3]:
# Make the shared config directory importable so the project's matplotlib style module can be loaded.
sys.path.append('../../config')
# NOTE(review): star-import; `rcParams` used below is presumably exported by mpl_style — confirm.
from mpl_style import *
# Colour list taken from the 'axes.prop_cycle' entry of rcParams.
colors = rcParams['axes.prop_cycle'].by_key()['color']

# Notebook-wide figure defaults.
rcParams['figure.dpi'] = 96
rcParams['figure.figsize'] = (12,8)

In [4]:
import matplotlib as mpl
# Snapshot of the current rcParams; it is re-applied with mpl.rcParams.update(params_backup)
# after the ridge plot in the manual-run section at the end of the notebook.
params_backup = rcParams.copy()

In [5]:
# Work from the project root so the relative paths used below
# ('models/...', 'model_configs/...', 'data/...') resolve.
# The location can be overridden with the MODELING_FOOTBALL_ROOT environment variable;
# the default is the original hard-coded checkout path.
PROJECT_ROOT = os.environ.get(
    'MODELING_FOOTBALL_ROOT',
    '/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/'
)
os.chdir(PROJECT_ROOT)

In [6]:
sys.path.append('model_pipeline')
from Ensemble import Ensemble
from EvaluateModel import EvaluationData, EvaluateAndPlot
from ExecuteModelPipeline import ExecuteModelPipeline

### TODO:
* check JSON
  * assert all labels tables are the same
* add a way to load existing CV data — e.g. a dict of CV directories whose files are copied in, with the CV step skipped (set to false); this would keep the seed constant instead of changing it
  * assert that either CV is enabled (do CV is True) OR there are N existing CVs to load
* clean up the ensemble execution part

### Ideas

#### multi-thread batch?
#### try DL on it? optimization, bagging

## Configs

### Base Model

In [72]:
# Load the stored 0625 model config and derive the shared base config from it.
# Files are opened with `with` so the handles are closed (and the output flushed) deterministically.
with open('models/0625/model.json') as f:
    base_model = json.load(f)
base_model['global_dataset_proportions']['training'] = 0.5
base_model['global_dataset_proportions']['throw_away'] = 0.5

# Only CV scores are persisted.
base_model['save'] = {
    'cv_data': False,
    'serialized_models': False,
    'cv_scores': True,
    'holdout_scores': False
}
# Train/score on CV folds and evaluate; holdout scoring is switched off.
base_model['actions'] = {
    'do_train_and_score_cv': True,
    'do_score_holdout': False,
    'do_evaluate': True
}

with open('model_configs/base_model.json', 'w') as f:
    json.dump(base_model, f)

In [73]:
# Turn the base config into a classification variant: predict `did_win` with an XGBoost classifier.
base_model['label_col'] = 'did_win'
base_model['model'] = 'xgboost.XGBClassifier'
base_model['model_params'] = {'n_jobs': 1,
  'learning_rate': 0.1,
  'n_estimators': 100,
  'max_features': 'auto',
  'booster': 'gbtree',
  'silent': True,
  'nthread': None,
  'subsample': 0.5,
  'random_state': 9,
  'objective': 'binary:logistic',
  'max_depth': 6,
  'gamma': 0}
# Sub-models skip their own evaluation but keep their CV data.
base_model['actions']['do_evaluate'] = False
base_model['save']['cv_data'] = True

# `with` closes the handle so the JSON is fully written before later cells read it.
with open('model_configs/base_model_classification.json', 'w') as f:
    json.dump(base_model, f)

### Base Eval

In [74]:
# Load the stored 0625 evaluation config and derive the shared base evaluation config.
with open('models/0625/evaluate.json') as f:
    base_eval = json.load(f)
# Plot switches: everything on except SHAP dependence plots.
base_eval['to_plot'] = {
    'ridge': True,
    'thresholds': True,
    'bins': True,
    'roc': True,
    'accuracy_by_top_n': True,
    'regression__distributions': True,
    'regression__scatter': True,
    'regression__residuals_by_season_week': True,
    'regression__confusion_matrix': True,
    'shap__feature_importance': True,
    'shap__dependence_plots': False,
    'feature_importance': True
}
base_eval['save']['plots'] = False
base_eval['save']['data'] = False
# `with` closes the handle so the JSON is fully written before it is referenced by the ensemble configs.
with open('model_configs/base_eval.json', 'w') as f:
    json.dump(base_eval, f)

### Ensemble - load CV data from disk

In [157]:
# Ensemble config that reuses CV data already produced by another ensemble run.
ensemble_dict = {}
ensemble_dict['models_dir'] = '/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models'
ensemble_dict['ensemble_model_id'] = 'ensemble_0803'
ensemble_dict['load_cv_data_from'] = 'ensemble_0630'
ensemble_dict['number_of_models'] = 5
ensemble_dict['save'] = {'scores': True, 'plots': True}
ensemble_dict['evaluation_config'] = 'model_configs/base_eval.json'
ensemble_dict['submodel_plots'] = True
ensemble_dict['aggregation_method'] = ['mean', 'median'] # mean, median, max, min, mean excluding top/bottom n (robust mean?)

# Sanity checks on the config.
assert os.path.exists(ensemble_dict['models_dir'])
assert not set(ensemble_dict['aggregation_method']) - set(['mean','median','min','max'])
if 'load_cv_data_from' in ensemble_dict:
    # Validate against the config dict itself: the `ensemble` object is only
    # created in a later cell, so it must not be referenced here.
    source_path = os.path.join(ensemble_dict['models_dir'],
                               ensemble_dict['load_cv_data_from'])
    assert os.path.exists(source_path)

    # Sub-model directories are the entries whose names parse as integers.
    n_models_expected = 0
    for d in os.listdir(source_path):
        try:
            int(d)
        except ValueError:
            continue
        n_models_expected += 1
    assert ensemble_dict['number_of_models'] == n_models_expected

#### Execute Ensemble for method loading CV data

In [142]:
ensemble = Ensemble(ensemble_dict)

In [143]:
ensemble.trial_path

'/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0803'

In [144]:
def get_path(model_name):
    """Return the directory for the config entry ``model_name``.

    Joins the ensemble config's ``models_dir`` with the value stored under
    the ``model_name`` key. Reads the notebook-level ``ensemble`` object.
    """
    cfg = ensemble.config
    base_dir = cfg['models_dir']
    return os.path.join(base_dir, cfg[model_name])

def get_json_file_paths(source_path, d):
    """List the names of the ``.json`` files directly inside ``source_path/d``.

    Returns bare file names (not full paths), in ``os.listdir`` order.
    """
    json_names = []
    for entry in os.listdir(os.path.join(source_path, d)):
        if entry.endswith('.json'):
            json_names.append(entry)
    return json_names

In [145]:
# Prepare the ensemble's trial directory; the recorded output shows an existing
# ensemble_0803 directory being removed by this call.
ensemble.setup_trial_dir(ensemble.get_trial_path())

removed /Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0803...


In [146]:
# Directory of the ensemble whose CV data is reused, and directory of the ensemble being built.
source_path = get_path('load_cv_data_from')
dest_path = get_path('ensemble_model_id')

In [156]:
# Copy each numbered sub-model's cv_data directory and JSON configs from the
# source ensemble into the new ensemble directory.
for d in os.listdir(source_path):
    try:
        int(d)
    except ValueError:
        # Not a numbered sub-model directory (e.g. evaluation_mean) — skip it.
        continue
    # Copy failures (missing cv_data, destination already present, ...) are
    # allowed to raise: a silently incomplete copy breaks later steps.
    shutil.copytree(f'{source_path}/{d}/cv_data',
                    f'{dest_path}/{d}/cv_data')
    for json_file in get_json_file_paths(source_path, d):
        shutil.copyfile(f'{source_path}/{d}/{json_file}',
                        f'{dest_path}/{d}/{json_file}')

#### Leaving off:
* need to have sub-models read cv_data
  * flaw in current process: don't need to copy data, just point the individual JSON configs to read CV data from source model
  * need to have that file tell it to save sub-models

In [148]:
# Run the sub-models. NOTE(review): the recorded output lists model paths under
# ensemble_0630, not ensemble_0803 — presumably stale kernel state; confirm.
ensemble.train_and_score()

Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00000
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00001
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00002
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00003
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00004
SAVE CV DATA
per model.json, skipping model evaluation...


### Ensemble - generate new CV data
* TODO: decide how to make the handling of sub-model plotting configurable

In [84]:
# Ensemble config that generates fresh CV data for every sub-model.
ensemble_dict = {}
ensemble_dict['models_dir'] = '/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models'
ensemble_dict['ensemble_model_id'] = 'ensemble_0630'
ensemble_dict['number_of_models'] = 5
ensemble_dict['aggregation_method'] = ['mean', 'median'] # mean, median, max, min, mean excluding top/bottom n (robust mean?)
ensemble_dict['source'] = 'model_configs/base_model_classification.json'
ensemble_dict['save'] = {'scores': True, 'plots': True}

ensemble_dict['evaluation_config'] = 'model_configs/base_eval.json'
ensemble_dict['submodel_evaluation_config'] = 'model_configs/base_eval.json'
ensemble_dict['submodel_plots'] = True

# Sanity checks on the config.
assert os.path.exists(ensemble_dict['models_dir'])
assert not set(ensemble_dict['aggregation_method']) - set(['mean','median','min','max'])
# The key that signals "reuse existing CV data" is 'load_cv_data_from' (as used by the
# load-from-disk config); when it is absent, `source` must be either a single config
# path or one entry per sub-model.
if 'load_cv_data_from' not in ensemble_dict:
    source = ensemble_dict['source']
    assert isinstance(source, str) or len(source) == ensemble_dict['number_of_models']


#### Example: each model in the ensemble gets a random 5 features
* in the JSON, provide a list of n sets of features for the features_list key. 

In [85]:
# Candidate features: every column of the features table after the first three.
features_list = pd.read_csv('data/{}/{}.csv'.format(
    *base_model['features_tbl'].split('.'))
).columns.tolist()[3:]

# One random subset of exactly 5 distinct features per sub-model.
# replace=False guarantees 5 distinct names; sampling with replacement and then
# de-duplicating could silently yield fewer than 5.
# Requires at least 5 candidate features (np.random.choice raises ValueError otherwise).
features_lists = [
    np.random.choice(features_list, size=5, replace=False).tolist()
    for _ in range(ensemble_dict['number_of_models'])
]

ensemble_dict['input_changes_by_iteration'] = {
    'features_list': features_lists
}

# test: every overridden parameter has one value per sub-model, of the same
# type as the value it replaces in the base model config
if 'input_changes_by_iteration' in ensemble_dict:
    assert isinstance(ensemble_dict['input_changes_by_iteration'], dict)
    for param, values in ensemble_dict['input_changes_by_iteration'].items():
        assert len(values) == ensemble_dict['number_of_models']
        for value in values:
            assert type(value) == type(base_model[param])

#### Execute Ensemble for method generating new CV

In [86]:
# Build the ensemble from the config above and run it end to end.
ensemble = Ensemble(ensemble_dict)
# The recorded output shows the existing ensemble_0630 directory being removed here.
ensemble.setup_trial_dir(ensemble.get_trial_path())
ensemble.create_ensemble_dir_structure()
ensemble.train_and_score()

removed /Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00000
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00001
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00002
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00003
SAVE CV DATA
per model.json, skipping model evaluation...
Model Path:
/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630/00004
SAVE CV DATA
per model.json, skipping model evaluation...


### Evaluate Ensemble
* **TODO:** move this into the `Ensemble` class

In [149]:
# Deep copy: `base_eval` contains nested dicts ('save', 'to_plot'). A shallow
# .copy() would share them, so the assignments below would also flip
# base_eval['save'] for every other user of the base evaluation config.
ensemble_eval = copy.deepcopy(base_eval)

ensemble_eval['save']['plots'] = True
ensemble_eval['save']['data'] = True
ensemble_eval['model_id'] = ensemble_dict['ensemble_model_id']
# Classification-oriented plots only; regression plots and plain feature importance are off.
ensemble_eval['to_plot'] = {
    'ridge': True,
    'thresholds': True,
    'bins': True,
    'roc': True,
    'accuracy_by_top_n': True,
    'regression__distributions': False,
    'regression__scatter': False,
    'regression__residuals_by_season_week': False,
    'regression__confusion_matrix': False,
    'shap__feature_importance': True,
    'shap__dependence_plots': False,
    'feature_importance': False
}

In [155]:
is_classification = True

# Evaluate the ensemble once per configured aggregation method.
for agg_method in ensemble_dict['aggregation_method']:
    # Combine the sub-model scores with this method and keep a CSV copy in the working directory.
    scores = ensemble.combine_scores(agg_method)
    scores.to_csv(f'ensemble_scores_{agg_method}.csv')
    
    # Each method gets its own evaluation sub-directory: <ensemble_model_id>/evaluation_<method>.
    ensemble_eval['model_id'] = '{}/evaluation_{}'.format(
        ensemble_dict['ensemble_model_id'],
        agg_method
    )
    
    # NOTE(review): 'models_dir' is never set on ensemble_eval in this notebook —
    # presumably it comes from models/0625/evaluate.json; confirm.
    eval_path = os.path.join(ensemble_eval['models_dir'], 
                             ensemble_eval['model_id'])
    if not os.path.exists(eval_path):
        os.mkdir(eval_path)
    
    # somehow change directories
    plot = EvaluateAndPlot(
        ensemble_eval, scores, is_classification
    )

    # Draw the plots selected in ensemble_eval['to_plot'].
    plot.plot_all(ensemble_eval.get('to_plot', {}))

/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0803/00000/scores/cv_scores.csv


FileNotFoundError: [Errno 2] File b'/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0803/00000/scores/cv_scores.csv' does not exist: b'/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0803/00000/scores/cv_scores.csv'

In [19]:
!ls /Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/ensemble_0630

[1m[34m00000[m[m             [1m[34m00002[m[m             [1m[34m00004[m[m             [1m[34mevaluation_median[m[m
[1m[34m00001[m[m             [1m[34m00003[m[m             [1m[34mevaluation_mean[m[m


In [17]:
# NOTE(review): `foo` is undefined and raises NameError — presumably a deliberate
# tripwire to stop "Run All" before the manual-run section below; confirm or remove.
foo

NameError: name 'foo' is not defined

### Manual run

In [None]:
# Reload the stored evaluation config for the manual run; `with` closes the file handle.
with open('models/0625/evaluate.json') as f:
    base_eval = json.load(f)

#### Ensemble Dictionary

* load model.json
* create parent directory
* move json to parent directory
* create child directories with individual model.json (optional evaluate.json)
* run pipeline in each child directory
* aggregate
* evaluate

##### TODO: support a list of model.json

In [None]:
def setup_trial_dir(overwrite, trial_path):
    """Create a fresh, empty directory at ``trial_path``.

    Parameters
    ----------
    overwrite : str
        Yes/no style answer. Treated as "yes" when it starts with 'y' or 'Y';
        anything else, including an empty string, means "no".
    trial_path : str
        Directory to create. Its parent directory must already exist.

    Raises
    ------
    SystemExit
        With exit code 1 when ``trial_path`` already exists and overwriting
        is not allowed.
    """
    # startswith() rather than [0] so an empty answer means "no" instead of raising IndexError.
    allow_overwrite = overwrite.upper().startswith('Y')

    print('Model Path:\n{}'.format(trial_path))
    path_exists = os.path.exists(trial_path)
    if path_exists and not allow_overwrite:
        print('model path already exists and user input disallows overwriting. exiting...')
        sys.exit(1)

    # Overwriting is allowed: drop the old directory before recreating it.
    if path_exists:
        shutil.rmtree(trial_path)
    os.mkdir(trial_path)

In [None]:
# Alias, not a copy: bag_dict and ensemble_dict refer to the same dict object.
bag_dict = ensemble_dict

In [None]:
# 'Y' => an existing trial directory is removed and recreated by setup_trial_dir.
overwrite = 'Y'
trial_path = os.path.join(bag_dict['models_dir'], bag_dict['ensemble_model_id'])

setup_trial_dir(overwrite, trial_path)

In [None]:
# Store the ensemble config next to its sub-models; `with` closes (and flushes) the file.
with open(os.path.join(trial_path, 'bag.json'), 'w') as f:
    json.dump(bag_dict, f, indent=3)

In [None]:
def create_ensemble_dir_structure(bag_dict, trial_path):
    """Create one sub-directory with its own ``model.json`` per ensemble member.

    Parameters
    ----------
    bag_dict : dict
        Ensemble config. Reads ``source`` (path to the base model JSON),
        ``number_of_models``, ``ensemble_model_id`` and ``models_dir``.
    trial_path : str
        Existing ensemble directory under which the zero-padded sub-model
        directories (``00000``, ``00001``, ...) are created.

    Each member gets the base config with ``dataset_seed`` and ``fold_seed``
    set to a random base seed plus the member number, and ``model_id`` /
    ``models_dir`` pointing at its own sub-directory. Existing sub-model
    directories are replaced.
    """
    with open(bag_dict['source']) as f:
        model_dict = json.load(f)

    # One random base seed per call; members use consecutive seeds from it.
    seed = np.random.randint(1, 1000000)
    for model_nbr in range(bag_dict['number_of_models']):
        model_dict['dataset_seed'] = int(seed + model_nbr)
        model_dict['fold_seed'] = int(seed + model_nbr)

        model_id = '{:05d}'.format(model_nbr)
        model_path = os.path.join(trial_path, model_id)
        setup_trial_dir('Y', model_path)

        model_dict['model_id'] = '{}/{}'.format(bag_dict['ensemble_model_id'], model_id)
        model_dict['models_dir'] = bag_dict['models_dir']
        # `with` ensures the config is fully written and the handle closed
        # before the pipeline reads it.
        with open(os.path.join(model_path, 'model.json'), 'w') as f:
            json.dump(model_dict, f, indent=3)

In [None]:
# Only the single-config case (source is one path string) is handled here;
# a list of model.json files is still a TODO.
if type(bag_dict['source']) is str:
    create_ensemble_dir_structure(bag_dict, trial_path)

In [None]:
# Run the model pipeline once per sub-model, using the model.json inside each
# zero-padded sub-directory of the trial path.
for model_nbr in np.arange(bag_dict['number_of_models']):
    model_id = '{:05d}'.format(model_nbr)
    model_path = os.path.join(trial_path, model_id)
    model_json_path = os.path.join(model_path, 'model.json')
    # NOTE(review): the meaning of the positional `None` and 'Y' arguments is not
    # visible here — presumably an evaluation config and an overwrite flag; confirm.
    ExecuteModelPipeline(model_json_path, None, 'Y')

In [None]:
# Start from the labels table and left-join every sub-model's CV scores onto it.
all_scores = pd.read_csv(
    'data/{}/{}.csv'.format(
        *base_model['labels_tbl'].split('.')
    )
)

for model_nbr in np.arange(bag_dict['number_of_models']):
    model_id = '{:05d}'.format(model_nbr)
    model_path = os.path.join(trial_path, model_id)
    # `with` closes the config file handle on every iteration.
    with open(os.path.join(model_path, 'model.json')) as f:
        model_dict = json.load(f)
    scores_path = os.path.join(model_path, 'scores/cv_scores.csv')
    
    # Keep only the index columns plus the label and score columns.
    scores = pd.read_csv(scores_path)
    cols = [c for c in scores.columns if c in model_dict['index'] or c.endswith('_label') or c.endswith('_score')]
    scores = scores[cols]
    
    # Prefix score and label columns with the sub-model number so that columns
    # from different sub-models do not collide in the merged frame.
    score_rename = {c: '{}_{}'.format(model_nbr, c) for c in cols if c.endswith('_score')}
    labels_rename = {c: '{}_{}'.format(model_nbr, c) for c in cols if c.endswith('_label')}
    scores = scores.rename(columns={**score_rename, **labels_rename})
    
    all_scores = all_scores.merge(scores, on=model_dict['index'], how='left')

In [None]:
# Collect the per-sub-model label and score columns from the merged frame.
labels_cols = [name for name in all_scores.columns if name.endswith('_label')]
scores_cols = [name for name in all_scores.columns if name.endswith('_score')]

# Base column names: everything after the first underscore, i.e. the first
# column's name with its leading '<prefix>_' removed.
label_col_base = labels_cols[0].split('_', 1)[1]
score_col_base = scores_cols[0].split('_', 1)[1]

In [None]:
# Row-wise mean of the sub-model label columns, ignoring NaNs.
all_scores[label_col_base] = all_scores[labels_cols]\
                                    .apply(np.nanmean, axis=1)\
                                    .astype(float)
all_scores['label'] = (all_scores[label_col_base] > 0).astype(int)
# Keep rows with a non-null aggregated label. .copy() makes this an independent
# frame so the column assignment below does not write into a slice of all_scores.
all_scores_nonnull = all_scores[~all_scores[label_col_base].isnull()].copy()

# 'aggregation_method' may be a single name or a list of names (the ensemble
# configs in this notebook use a list); this manual run aggregates with the
# first one. The numpy function is looked up by name (np.nanmean, np.nanmedian,
# np.nanmin, np.nanmax) instead of being built with eval().
agg_name = bag_dict['aggregation_method']
if not isinstance(agg_name, str):
    agg_name = agg_name[0]
agg_method = getattr(np, 'nan{}'.format(agg_name))
all_scores_nonnull[score_col_base] = all_scores_nonnull[scores_cols].apply(agg_method, axis=1)

# The per-sub-model label columns are no longer needed once aggregated.
all_scores_prepped = all_scores_nonnull.drop(labels_cols, axis=1)

In [None]:
# Evaluate the aggregated scores. The third argument is passed as False here, whereas the
# ensemble evaluation cell passes `is_classification` (True) in that position.
plot = EvaluateAndPlot(
    base_eval, all_scores_prepped, False
)

plot.plot_ridge()
# Re-apply the rcParams snapshot taken at the top of the notebook —
# presumably plot_ridge changes the global style; confirm.
mpl.rcParams.update(params_backup)

plot.plot_thresholds()
plot.plot_bins()
plot.plot_roc()
plot.plot_accuracy_by_topn()