In [1]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Set root dir path

In [2]:
import os
# Switch the working directory to the project root so the relative paths used
# below ('config/...', 'data/...', 'model_configs/...', 'models') resolve.
# NOTE(review): ROOT is relative, so re-running this cell in the same kernel
# moves up two more levels each time -- run it once per kernel session.
ROOT='../..'
os.chdir(ROOT)

In [3]:
# Execute the shared setup notebook.
# NOTE(review): `sys` and `pd` are used further down without being imported in
# this notebook -- presumably this setup notebook provides them; confirm.
%run config/initialize_nospark.ipynb

from copy import deepcopy
from datetime import date
import json

In [4]:
# Put the project's 'model_pipeline' directory on the import path so its
# modules can be imported by bare name.
sys.path.append('model_pipeline')
from Ensemble import Ensemble
from ExecuteModelPipeline import ExecuteModelPipeline

In [5]:
# Master switch for the execution cells further down:
#   True  -> also run the model pipeline and the ensembles
#   False -> only write the JSON configuration files
DO_EXECUTE = False

#### Location of base model

In [6]:
# Directory used for model outputs, plus a model id stamped with today's date.
MODELS_DIR = 'models'
MODEL_ID = f'{date.today()}_regression'

## Create dictionary version of model.json

In [7]:
# Start the model configuration with its id and the models directory;
# later cells add the remaining keys.
model_dict = dict(model_id=MODEL_ID, models_dir=MODELS_DIR)

### Source data for model
* features
* labels
* Note: can modify/create tables here

In [8]:
# Names of the tables that supply the features and the labels.
model_dict.update(
    features_tbl='features.combined_0601',
    labels_tbl='labels.combined_0601',
)

In [9]:
# Only the column names are needed, so read just the header (nrows=0) rather
# than parsing every data row of the CSV.
# The first three columns are skipped and everything after them is used as a
# feature. NOTE(review): presumably those three are id/label columns -- confirm.
all_features = list(pd.read_csv('data/features/combined_0601.csv', nrows=0).columns[3:])
len(all_features)

123

#### Columns from tables
* index: unique identifier in features/labels table (must be in both)
* label column, and indicator of what is a positive label
  * currently not supported: multi-class
  * code will binarize
* list of features

In [10]:
# Column roles: the unique-identifier column(s), the label column, and the
# label values that count as positive / negative.
model_dict.update({
    'index': ['game_id'],
    'label_col': 'did_win',
    'pos_labels': [1],
    'neg_labels': [-1],
})

# Features are stored in sorted order. The config holds the very same list
# object as `all_features`, so that list ends up sorted as well.
all_features.sort()
model_dict['features_list'] = all_features

### Cross-Validation Sets
* random seeds for reproducibility
* number of folds for cross-validation (a value <= 1 disables k-fold)
* global_dataset_proportions
 * proportion of the data for each of training, scoring only, holdout, and throwaway
 * generated using stratified sampling
* dimensional_dataset_proportions
 * post-processing after global_dataset_proportions
 * idea is to move specific field values, e.g. move certain seasons to the holdout set

#### Optional: use cross-validation data from another model

In [11]:
# None: do not reuse the cross-validation data of another model.
model_dict.update(model_cv_to_use=None)

#### CV parameters, when not using another model CV

#### Sample usage for Dimensional Dataset Proportions
```python
model_dict['dimensional_dataset_proportions'] = {
        'throw_away': [
            {
                'vals': [
                    0
                ], 
                'dim': 'is_home',
                'prop_to_move': 1.0, 
                'from_groups': [
                    'training',
                    'holdout',
                    'scoring_only'
                ]
            }
        ]
    }
```

In [12]:
# Seeds, number of folds and stratification columns for building the CV sets.
model_dict['fold_seed'] = 99
model_dict['dataset_seed'] = 9
model_dict['kfolds'] = 5
model_dict['strata_cols'] = ['season', 'week_id']

# Everything starts in 'training'; rows are then moved by the dimensional
# rules below.
model_dict['global_dataset_proportions'] = {
    'training': 1.,
    'holdout': 0,
    'throw_away': 0,
    'scoring_only': 0
}

# DEFAULT: model_dict['dimensional_dataset_proportions'] = {}
# Both holdout rules are kept in a single 'holdout' list. A dict literal cannot
# carry the 'holdout' key twice: Python keeps only the last occurrence, which
# would silently discard the week_id rule.
model_dict['dimensional_dataset_proportions'] = {
    'holdout': [
        {
            # all rows of weeks 17-22 leave the training set
            'vals': [17, 18, 19, 20, 21, 22],
            'dim': 'week_id',
            'prop_to_move': 1.0,
            'from_groups': ['training']
        },
        {
            # all rows of seasons 2016 and 2017 leave training / scoring_only
            'vals': [2016, 2017],
            'dim': 'season',
            'prop_to_move': 1.0,
            'from_groups': ['training', 'scoring_only']
        }
    ]
}

### Model Choice
* package/class name as a string
* parameters as a dictionary

#### sklearn basic

In [13]:
# model_dict['model'] = 'sklearn.ensemble.GradientBoostingClassifier'
# model_dict['model_params'] = {
#     'learning_rate': 0.1, 
#     'n_estimators': 200, 
#     'max_features': 'auto', 
#     'subsample': 0.9, 
#     'random_state': 9, 
#     'max_depth': 12, 
# }

#### xgboost basic

In [14]:
# The estimator is named by its dotted import path; its hyperparameters are
# passed as a plain dict.
model_dict['model'] = 'xgboost.XGBClassifier'

# NOTE(review): 'reg:linear' is a regression objective but the class above is a
# classifier -- confirm this pairing is intended.
# NOTE(review): 'max_features' looks like an sklearn-style argument -- verify
# that the xgboost estimator accepts it.
model_dict['model_params'] = dict(
    n_jobs=1,
    learning_rate=0.1,
    n_estimators=100,
    max_features='auto',
    booster='gbtree',
    silent=True,
    nthread=None,
    subsample=0.5,
    random_state=9,
    objective='reg:linear',
    max_depth=6,
    gamma=0,
)

#### Actions to perform and data to save

In [15]:
# 'save' selects which artifacts are persisted; 'actions' selects which
# pipeline stages run.
model_dict.update(
    save=dict(
        cv_data=True,
        serialized_models=False,
        cv_scores=True,
        holdout_scores=False,
    ),
    actions=dict(
        do_train_and_score_cv=True,
        do_score_holdout=False,
        do_evaluate=False,
    ),
)

### Write out model.json file

In [16]:
# Serialize the model configuration to disk as indented JSON.
model_json_path = 'model_configs/model__classification_example.json'
model_json_path

with open(model_json_path, mode='w') as model_json_file:
    json.dump(model_dict, model_json_file, indent=4)

'model_configs/model__classification_example.json'

## Generate dictionary version of evaluate.json

In [17]:
# Start the evaluation configuration; it refers to the same model id and
# models directory as the model configuration.
evaluate_dict = dict(
    model_id=MODEL_ID,
    models_dir=MODELS_DIR,
    regression_evaluation={},
)

### Plot Labels
* labels --> names (note: keys should be strings)
* name for success rate

In [18]:
# Display names for the label values (keys are strings) and for the success rate.
evaluate_dict.update(
    label_map={'1': 'Won', '0': 'Lost'},
    success_name='Win Rate',
)

### Bins to plot
* plot_bins: 
   * Number of bins to plot (i.e. number of bars on the bar chart)
* bin_types:
   * "Bin" puts scores into uniform bins, e.g. [0, 0.10], (0.10, 0.20], ..., (0.9, 1.0]
   * "Percentile" bins scores into ntiles determined by plot_bins

In [19]:
# Binning schemes for the bar charts and the number of bins to plot.
evaluate_dict.update(
    bin_types=['Bin', 'Percentile'],
    plot_bins=[10, 100],
)

### Threshold Metrics to Plot
* metrics evaluated at each of 100 score threshold points
* currently only supports Accuracy and F1

In [20]:
# Metrics to evaluate across score thresholds.
evaluate_dict.update(threshold_metrics=['Accuracy', 'F1'])

### Accuracy at Top 'N' plots

In [21]:
# Settings for the "accuracy at top N" plots, keyed by grouping.
# NOTE(review): the meaning of the numeric lists is not visible from this
# notebook -- check the evaluation code before changing them.
evaluate_dict.update(
    accuracy_at_topn={
        'week_id__season': [1, 16],
        'season': [1, 200, 2],
    }
)

In [22]:
# 'to_plot' toggles each plot type; 'save' toggles persisting plots and data.
evaluate_dict.update(
    to_plot=dict(
        ridge=True,
        thresholds=True,
        bins=True,
        roc=True,
        accuracy_by_top_n=True,
        shap__feature_importance=True,
        shap__dependence_plots=False,
        feature_importance=True,
    ),
    save=dict(plots=False, data=False),
)

### Write out evaluate.json

In [23]:
# Serialize the evaluation configuration to disk as indented JSON.
eval_json_path = 'model_configs/evaluate__classification_example.json'
eval_json_path

with open(eval_json_path, mode='w') as eval_json_file:
    json.dump(evaluate_dict, eval_json_file, indent=4)

'model_configs/evaluate__classification_example.json'

## Execute pipeline

In [24]:
# Run the single-model pipeline only when execution is switched on; otherwise
# the notebook stops at writing the JSON configuration files.
if DO_EXECUTE:
    # NOTE(review): the meaning of the third argument 'Y' is not visible from
    # this notebook -- check ExecuteModelPipeline's constructor.
    model_pipeline = ExecuteModelPipeline(model_json_path, eval_json_path, 'Y')
    model_pipeline.execute_model_pipeline()

## Make an ensemble, using the above model as a template

### Create the sub-model template: modify model.json (model_dict)

##### Bootstrap: select 50% of the data for each model, with replacement

In [25]:
# Sub-model template: half of the data goes to training, half is thrown away.
model_dict['global_dataset_proportions'].update(training=0.5, throw_away=0.5)

##### save options for ensemble's sub-models

In [26]:
# Save options and pipeline actions used by every sub-model of the ensemble.
model_dict.update(
    save=dict(
        cv_data=True,
        serialized_models=False,
        cv_scores=True,
        holdout_scores=False,
    ),
    actions=dict(
        do_train_and_score_cv=True,
        do_score_holdout=False,
        do_evaluate=False,
    ),
)

#### Store ensemble submodel JSON for use in the ensemble

In [27]:
# Write the sub-model template that the ensemble config points at ('source').
ensemble_submodel_path = 'model_configs/ensemble_submodel_model__classification_example.json'
ensemble_submodel_path
# Use a context manager so the file is flushed and closed before any later
# step reads it; a bare open() passed to json.dump is never closed explicitly.
with open(ensemble_submodel_path, 'w') as w:
    json.dump(model_dict, w)

'model_configs/ensemble_submodel_model__classification_example.json'

### Modify evaluate.json (evaluate_dict) if needed
* toggle which plots to include
* toggle whether to save evaluation data
* toggle whether to save plots as png files or display (in notebook only)

In [28]:
# evaluate_dict['to_plot'] = {
#     'ridge': True,
#     'thresholds': True,
#     'bins': True,
#     'roc': True,
#     'accuracy_by_top_n': True,
#     'shap__feature_importance': True,
#     'shap__dependence_plots': False,
#     'feature_importance': True
# }

# evaluate_dict['save'] = {
#     'plots': False,
#     'data': False
# }

# evaluate_dict['models_dir'] = f'/Users/{user}/Dropbox/data_science/modeling-football-outcomes/models'

#### Store ensemble submodel evaluation JSON for use in the ensemble

In [29]:
# Write the evaluation config used for the ensemble's sub-models.
ensemble_submodel_eval_path = 'model_configs/ensemble_submodel_evaluate__classification_example.json'
ensemble_submodel_eval_path
# Use a context manager so the file is flushed and closed before any later
# step reads it; a bare open() passed to json.dump is never closed explicitly.
with open(ensemble_submodel_eval_path, 'w') as w:
    json.dump(evaluate_dict, w)

'model_configs/ensemble_submodel_evaluate__classification_example.json'

### Create ensemble_evaluate.json (ensemble_eval)

In [30]:
# The ensemble's evaluation config starts as an independent copy of the
# single-model one, then overrides what to save and what to plot.
ensemble_eval = deepcopy(evaluate_dict)

ensemble_eval.update(
    save=dict(plots=False, data=True),
    to_plot=dict(
        ridge=True,
        thresholds=True,
        bins=True,
        roc=True,
        accuracy_by_top_n=True,
        shap__feature_importance=True,
        shap__dependence_plots=True,
        feature_importance=True,
    ),
)

#### Store ensemble evaluation JSON

In [31]:
# Serialize the ensemble evaluation configuration to disk.
ensemble_evaluate_json_path = 'model_configs/ensemble_evaluate__classification_example.json'
ensemble_evaluate_json_path
with open(ensemble_evaluate_json_path, mode='w') as ensemble_eval_file:
    json.dump(ensemble_eval, ensemble_eval_file)

'model_configs/ensemble_evaluate__classification_example.json'

## Create ensemble by generating new CV data
### Create ensemble.json (ensemble_dict) 

In [32]:
# Configuration of an ensemble that generates fresh CV data for each sub-model.
ensemble_dict = {
    'models_dir': 'models',
    'ensemble_model_id': 'classification_ensemble_with_new_cv_data',
    'number_of_models': 5,
    # mean, median, max, min, mean excluding top/bottom n (robust mean?)
    'aggregation_method': ['mean', 'median'],
    # sub-model template: one path for all models, or one entry per model
    'source': ensemble_submodel_path,
    'save': {'scores': True},
    'evaluation_config': ensemble_submodel_eval_path,
    'submodel_plots': True,
}

# Sanity checks on the configuration before it is written out.
assert os.path.exists(ensemble_dict['models_dir'])
assert not set(ensemble_dict['aggregation_method']) - {'mean', 'median', 'min', 'max'}
# The key that marks "reuse CV data from another ensemble" is
# 'load_cv_data_from' (as used by the load-CV ensemble config later in this
# notebook); only when it is absent is a 'source' required.
if 'load_cv_data_from' not in ensemble_dict:
    assert (
        isinstance(ensemble_dict['source'], str)
        or len(ensemble_dict['source']) == ensemble_dict['number_of_models']
    )

#### Example of modifying ensemble.json: each model in the ensemble gets 5 random features (ensemble_dict['input_changes_by_iteration']['features_list'] contains a list of N lists of 5 random features each).
* this can be used for the output of feature selection, e.g. take the top 10 feature sets and ensemble those

In [33]:
# features_list = pd.read_csv('data/{}/{}.csv'.format(
#     *model_dict['features_tbl'].split('.'))
# ).columns.tolist()[3:]

# features_lists = [
#     list(set(np.random.choice(features_list, size=5).tolist()))
#     for _ in range(ensemble_dict['number_of_models'])
# ]

# ensemble_dict['input_changes_by_iteration'] = {
#     'features_list': features_lists
# }

# # test
# if 'input_changes_by_iteration' in ensemble_dict:
#     assert type(ensemble_dict['input_changes_by_iteration']) is dict
#     for param, values in ensemble_dict['input_changes_by_iteration'].items():
#         assert len(values) == ensemble_dict['number_of_models']
#         for value in values:
#             assert type(value) == type(model_dict[param])

#### Store ensemble JSON for generating new cross-validation data

In [34]:
# Serialize the new-CV-data ensemble configuration to disk.
ensemble_model_json_path = 'model_configs/ensemble_model_new_cv__classification_example.json'
ensemble_model_json_path
with open(ensemble_model_json_path, mode='w') as ensemble_json_file:
    json.dump(ensemble_dict, ensemble_json_file)

'model_configs/ensemble_model_new_cv__classification_example.json'

### Execute Ensemble

In [35]:
# Build, run and evaluate the ensemble only when execution is switched on.
# At this point ensemble_model_json_path refers to the "new CV data" ensemble
# config written in the previous cell.
if DO_EXECUTE:
    ensemble = Ensemble(ensemble_model_json_path, ensemble_evaluate_json_path)
    ensemble.execute_ensemble()
    ensemble.evaluate_ensemble()

## Copy ensemble CV data for a new ensemble
#### Use cases
* hyperparameter optimization (change only the hyperparameters in each sub-model's model.json file)
* __feature selection if base models (and cv_data) include all features__

### Create ensemble.json (ensemble_dict) for a new ensemble that loads CV data from another ensemble

In [36]:
# Configuration of a second ensemble that reuses the CV data produced by the
# ensemble defined in `ensemble_dict`.
ensemble_dict_load_cv = {
    'models_dir': 'models',
    'ensemble_model_id': 'ensemble_load_cv',
    'load_cv_data_from': ensemble_dict['ensemble_model_id'],
    'number_of_models': 5,
    'save': {'scores': True},
    'evaluation_config': ensemble_evaluate_json_path,
    'submodel_plots': False,
    # mean, median, max, min, mean excluding top/bottom n (robust mean?)
    'aggregation_method': ['mean', 'median'],
}

# Sanity checks on the configuration before it is written out.
assert os.path.exists(ensemble_dict_load_cv['models_dir'])
assert not set(ensemble_dict_load_cv['aggregation_method']) - {'mean', 'median', 'min', 'max'}
if 'load_cv_data_from' in ensemble_dict_load_cv:
    # The source ensemble must already exist on disk.
    source_path = os.path.join(ensemble_dict_load_cv['models_dir'],
                               ensemble_dict_load_cv['load_cv_data_from'])
    assert os.path.exists(source_path)

    # Count the entries of the source directory whose names parse as integers;
    # that count must match the requested number of models.
    n_models_expected = 0
    for d in os.listdir(source_path):
        try:
            int(d)
        except ValueError:
            # not an integer-named entry -- ignore it
            continue
        n_models_expected += 1
    assert ensemble_dict_load_cv['number_of_models'] == n_models_expected

In [37]:
# One hyperparameter dict per sub-model: the base model's parameters with a
# deeper tree. Each entry is an independent copy, so the entries do not alias
# one another and the base `model_dict` is left unchanged.
params_lists = [
    dict(deepcopy(model_dict['model_params']), max_depth=12)
    for _ in range(ensemble_dict_load_cv['number_of_models'])
]

ensemble_dict_load_cv['input_changes_by_iteration'] = {
    'model_params': params_lists
}

# Sanity check on the dict that was just modified: one value per sub-model,
# each of the same type as the base model's entry for that parameter.
if 'input_changes_by_iteration' in ensemble_dict_load_cv:
    assert type(ensemble_dict_load_cv['input_changes_by_iteration']) is dict
    for param, values in ensemble_dict_load_cv['input_changes_by_iteration'].items():
        assert len(values) == ensemble_dict_load_cv['number_of_models']
        for value in values:
            assert type(value) == type(model_dict[param])

#### Store ensemble model JSON for existing cross-validation data

In [38]:
# Serialize the load-CV ensemble configuration to disk. This rebinds
# ensemble_model_json_path to the new file.
ensemble_model_json_path = 'model_configs/ensemble_model_load_cv__classification_example.json'
ensemble_model_json_path
with open(ensemble_model_json_path, mode='w') as ensemble_json_file:
    json.dump(ensemble_dict_load_cv, ensemble_json_file)

'model_configs/ensemble_model_load_cv__classification_example.json'

In [39]:
# Build, run and evaluate the load-CV ensemble only when execution is switched
# on. ensemble_model_json_path now refers to the load-CV config written in the
# previous cell.
if DO_EXECUTE:
    ensemble = Ensemble(ensemble_model_json_path, ensemble_evaluate_json_path)
    ensemble.execute_ensemble()
    ensemble.evaluate_ensemble()