## Boilerplate

In [1]:
%run initialize.ipynb

import json
import shutil

## TODO
* how to generalize to regression problems?
  * easier to create a separate process?

## Set root dir path

In [2]:
## absolute path of the directory that holds one sub-directory per model ID
ROOT = (
    '/Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/'
    'models'
)
## relative paths used in the cells below resolve against ROOT
os.chdir(ROOT)

## Initialize
* set model ID
* remove this model ID's directory if it exists
* create directory

In [3]:
## ID of the model being configured; it is also the name of the
## model's directory (relative to the current working directory)
MODEL_ID = '0115_test_ou'

## report whether a directory for this model ID is already present.
## print(...) with a single argument prints the same text on Python 2
## and Python 3, whereas the bare print statement is Python 2 only.
if os.path.exists(MODEL_ID):
    print('Directory models/{} **EXISTS**'.format(MODEL_ID))
else:
    print('Directory models/{} **DOES NOT EXIST**'.format(MODEL_ID))

Directory models/0115_test_ou **EXISTS**


In [4]:
## wipe out existing directory
## NOTE: this permanently deletes everything stored under this model ID
## by a previous run; an empty directory is then created in its place

if os.path.exists(MODEL_ID):
    shutil.rmtree(MODEL_ID)
os.mkdir(MODEL_ID)

## Generate dictionary version of model.json

In [5]:
## in-memory form of model.json; every model cell below adds keys to it
model_dict = dict(model_id=MODEL_ID)

### Source data for model

#### Hive Tables
* features
* labels

In [6]:
## source tables, written as schema.table
model_dict.update([
    ('features_tbl', 'features.0111_ou_test'),
    ('labels_tbl', 'labels.0111_ou_test'),
])

#### Columns from Hive tables
* index: unique identifier in features/labels table (must be in both)
* label column, and indicator of what is a positive label
  * currently not supported: multi-class
  * code will binarize
* list of features

In [7]:
## columns to read from the source tables:
## row identifier, label column with its positive / negative values,
## and the feature columns
model_dict.update([
    ('index', ['game_id']),
    ('label_col', 'is_sbr_ou_over'),
    ('pos_labels', [1]),
    ('neg_labels', [-1]),
    ('features_list', ['season', 'sbr_ou']),
])

#### Checks

In [8]:
## assert these fields are of the correct type
## (labels_tbl and neg_labels are included: both are read further
## down in this cell, so a wrong type should fail here first)
assert type(model_dict['index']) is list
assert type(model_dict['label_col']) is str
assert type(model_dict['features_tbl']) is str
assert type(model_dict['labels_tbl']) is str
assert type(model_dict['features_list']) is list
assert type(model_dict['pos_labels']) is list
assert type(model_dict['neg_labels']) is list

## assert format is schema.table and that
## table exists in hive -- for the features AND the labels table
for tbl_str in ['features_tbl','labels_tbl']:
    schema_and_tbl = model_dict[tbl_str].split('.')
    assert len(schema_and_tbl) == 2
    schema, tbl = schema_and_tbl
    ## exactly one table of that name must be listed in the schema
    assert spark.sql(
            'show tables in {}'.format(schema)
        ).filter(
            col('tableName') == tbl
        ).count() == 1

feat_cols_set = set(spark.table(model_dict['features_tbl']).columns)
label_cols_set = set(spark.table(model_dict['labels_tbl']).columns)
idx_set = set(model_dict['index'])
feat_set = set(model_dict['features_list'])
label_set = set([model_dict['label_col']])

## assert the chosen columns exist in the
## chosen tables (an empty set difference means nothing is missing);
## the index columns must be present in both tables
assert not idx_set - feat_cols_set
assert not idx_set - label_cols_set
assert not feat_set - feat_cols_set
assert not label_set - label_cols_set

## check that positive and negative label values
## are valid: each list must match at least one row of the labels table
for label_val in ['pos_labels','neg_labels']:
    assert spark.table(
            model_dict['labels_tbl']
        ).filter(
            col(model_dict['label_col']).isin(model_dict[label_val])
        ).count() > 0

### Cross-Validation Sets
* random seeds for reproducibility
* number of folds for cross-validation (a value of <= 1 doesn't do k-fold)
* __TODO__: ability to call another's CV set
* global_dataset_proportions
 * proportion of the data for each of training, scoring only, holdout, and throwaway
 * generated using stratified sampling
* dimensional_dataset_proportions
 * post-processing after global_dataset_proportions
 * idea is to move specific field values, e.g. move certain seasons to the holdout set

#### Optional: use cross-validation data from another model

In [9]:
## None means this model's CV sets come from the parameters set below
model_dict.update(model_cv_to_use=None)

#### CV parameters, when not using another model CV

In [10]:
## seeds, number of folds, stratification columns, holdout handling and
## the global split of the data across the four dataset types
model_dict.update([
    ('kfold_seed', 99),
    ('dataset_seed', 9),
    ('kfolds', 5),
    ('strata_cols', []),
    ('holdout_set', {
        'store_to_disk': False,
        'score_using_full_model': False,
    }),
    ('global_dataset_proportions', {
        'in_training': 0.5,
        'holdout': 0.5,
        'throw_away': 0,
        'scoring_only': 0,
    }),
])

## SAMPLE USAGE:
_ = '''
model_dict['dimensional_dataset_proportions'] = {
        'throw_away': [
            {
                'vals': [
                    0
                ], 
                'dim': 'is_home',
                'prop_to_move': 1.0, 
                'from_groups': [
                    'in_training',
                    'holdout',
                    'scoring_only'
                ]
            }
        ]
    }
'''
## no dimensional post-processing for this model
model_dict.update([('dimensional_dataset_proportions', {})])

#### Checks

In [11]:
## model_cv_to_use is either None or a string
assert (model_dict['model_cv_to_use'] is None) \
        or (type(model_dict['model_cv_to_use']) is str)

## TODO: when loading the CV sets, ensure that set(load cvs) - set(current df) is empty

if model_dict['model_cv_to_use']:
    ## when set, the string is treated as a path that must exist
    assert os.path.exists(model_dict['model_cv_to_use'])
else:
    ## assert the data structures/types are correct
    assert type(model_dict['kfold_seed']) is int
    assert type(model_dict['dataset_seed']) is int
    assert type(model_dict['kfolds']) is int
    assert type(model_dict['strata_cols']) is list
    assert type(model_dict['global_dataset_proportions']) is dict
    assert type(model_dict['dimensional_dataset_proportions']) is dict
    assert type(model_dict['holdout_set']) is dict

    ## assert strata cols are present in the labels table
    assert not set(model_dict['strata_cols']) - label_cols_set

    dataset_types = set(['in_training','holdout','throw_away','scoring_only'])
    global_datasets = model_dict['global_dataset_proportions']
    dim_datasets = model_dict['dimensional_dataset_proportions']

    ## assert global_dataset_proportions has all possible dataset types
    assert set(global_datasets.keys()) == dataset_types
    ## values are proportions that must sum to 1; compared with a small
    ## tolerance because exact float equality rejects valid splits
    ## (e.g. 0.7 + 0.2 + 0.1 is not exactly 1.0 in floating point)
    assert abs(sum(global_datasets.values()) - 1) < 1e-9
    ## assert that the keys are valid dataset types
    assert not set(dim_datasets.keys()) - dataset_types
    ## assert the following (in order of assertion block):
    ## (1) each value is a list
    ## (2) each element of the list is a dict
    ## (3) each dict has the 4 required keys
    ## (4) the "dim" field is in the strata columns
    ## (5) "prop_to_move" field is [0, 1]
    ## (6) "from_groups" are in the possible dataset types
    ## (.values() instead of .iteritems(): the key is not needed and
    ##  .values() exists on both Python 2 and Python 3)
    for dim_list in dim_datasets.values():
        assert (type(dim_list)) is list
        for entry in dim_list:
            assert type(entry) is dict
            assert set(entry.keys()) \
                    == set(['vals','dim','prop_to_move','from_groups'])
            assert entry['dim'] in model_dict['strata_cols']
            assert 0 <= entry['prop_to_move'] <= 1
            assert not set(entry['from_groups']) - dataset_types

    ## assert holdout set has 2 keys (store_to_disk, score_using_full_model)
    ## and the corresponding values are boolean
    assert set(model_dict['holdout_set'].keys()) \
            == set(['store_to_disk','score_using_full_model'])
    assert all(
        type(flag) is bool
        for flag in model_dict['holdout_set'].values()
    )
    ## if holdout data isn't stored, it can't be scored
    assert not (
        (model_dict['holdout_set']['store_to_disk'] is False)
        and (model_dict['holdout_set']['score_using_full_model'] is True)
    )

### Model Choice
* package/class name as a string
* parameters as a dictionary

In [12]:
## estimator given as a dotted "package.ClassName" string, together with
## the keyword arguments passed to its constructor
model_dict.update([
    ('model', 'xgboost.XGBClassifier'),
    ('model_params', {
        'n_jobs': 1,
        'learning_rate': 0.1,
        'n_estimators': 200,
        'max_features': 'auto',
        'booster': 'gbtree',
        'silent': True,
        'nthread': None,
        'subsample': 0.9,
        'random_state': 9,
        'objective': 'binary:logistic',
        'max_depth': 3,
        'gamma': 0,
    }),
])

#### Checks

In [13]:
## test that model object can be created
## from model inputs
import importlib

## split "package.module.ClassName" into the module path and the class name
model_class_str = model_dict['model']
model_obj_path = '.'.join(model_class_str.split('.')[:-1])
model_name = model_class_str.split('.')[-1]

try:
    model_package = importlib.import_module(model_obj_path)
    model_class = getattr(model_package, model_name)
    _ = model_class(**model_dict['model_params'])
except Exception as e:
    ## a bare `e` inside an except body is never displayed by the
    ## notebook, so the failure used to vanish silently; print it so
    ## the problem is visible without aborting the notebook
    print('**FAILED** to create {} from model_params: {!r}'.format(
        model_class_str, e
    ))

### Write out model.json file

In [14]:
## model.json is written inside the model's own directory and must not
## already be there (an existing file is never overwritten)
model_json_path = '{}/model.json'.format(model_dict['model_id'])
model_json_exists = os.path.exists(model_json_path)
assert not model_json_exists

with open(model_json_path, 'w') as model_json_file:
    json.dump(model_dict, model_json_file, indent=4)

## Generate dictionary version of plots.json

In [15]:
## in-memory form of plots.json; the plot cells below add keys to it
plots_dict = dict(model_id=MODEL_ID)

### Plot Labels
* labels --> names (note: keys should be strings)
* name for success rate

In [16]:
## display names for the two label values (keys are strings) and the
## name shown for the success rate
plots_dict.update([
    ('label_map', {
        '1': 'Won',
        '0': 'Lost',
    }),
    ('success_name', 'Win Rate'),
])

#### Checks

In [17]:
## label_map must be a dict keyed by exactly the strings '0' and '1';
## success_name must be a string
assert type(plots_dict['label_map']) is dict
assert type(plots_dict['success_name']) is str
assert set(plots_dict['label_map']) == set(('0', '1'))

### Bins to plot
* plot_bins: 
   * Number of bins to plot (i.e. number of bars on the bar chart)
* bin_types:
   * "Bin" puts scores into uniform bins, e.g. [0, 0.10], (0.10, 0.20], ..., (0.9, 1.0]
   * "Percentile" bins scores into ntiles determined by plot_bins

In [18]:
## binning schemes to plot and the bin counts to use
plots_dict.update([
    ('bin_types', ['Bin', 'Percentile']),
    ('plot_bins', [10, 100]),
])

#### Checks

In [19]:
## currently only supports "Bin" and "Percentile"
assert not set(plots_dict['bin_types']) - set(['Bin','Percentile'])
## all plot bins values should be ints; checked element by element
## because comparing the list with map(int, ...) lets floats such as
## 10.0 through and is always False on Python 3, where map() is lazy
assert all(type(n_bins) is int for n_bins in plots_dict['plot_bins'])
## ensure all bins values are in [2, 1000]
assert all(2 <= n_bins <= 1000 for n_bins in plots_dict['plot_bins'])

### Threshold Metrics to Plot
* metrics evaluated at each of 100 score threshold points
* currently only supports Accuracy and F1

In [20]:
## metrics to plot against the score threshold
plots_dict.update(threshold_metrics=['Accuracy', 'F1'])

#### Checks

In [21]:
## the metrics must be given as a list ...
assert type(plots_dict['threshold_metrics']) is list
## ... containing nothing other than Accuracy and F1,
## the only metrics currently supported
assert set(plots_dict['threshold_metrics']).issubset(['Accuracy', 'F1'])

### Write out plots.json

In [22]:
## plots.json goes next to model.json in the model's directory and must
## not already be there (an existing file is never overwritten)
plots_json_path = '{}/plots.json'.format(model_dict['model_id'])
plots_json_exists = os.path.exists(plots_json_path)
assert not plots_json_exists

with open(plots_json_path, 'w') as plots_json_file:
    json.dump(plots_dict, plots_json_file, indent=4)

## Execute pipeline

In [23]:
# foo

In [24]:
# Run the pipeline shell script for this model ID from the repository root.
# The shell first sources ~/.bashrc and unsets the three PYSPARK_* variables
# (presumably so the script does not inherit the notebook's PySpark
# driver settings -- TODO confirm).
# {MODEL_ID} is interpolated by IPython from the Python variable.
MODEL_ID
!source ~/.bashrc && \
    unset PYSPARK_PYTHON && \
    unset PYSPARK_DRIVER_PYTHON && \
    unset PYSPARK_DRIVER_PYTHON_OPTS && \
    cd /Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/ && \
    src/model_pipeline.sh {MODEL_ID}

'0115_test_ou'

Check JSON files

real	0m13.812s
user	0m27.367s
sys	0m2.324s

Cross-validation data

real	0m16.554s
user	0m42.666s
sys	0m2.568s

Train and score

real	0m3.419s
user	0m2.137s
sys	0m1.037s

Evaluate and plot

real	0m7.778s
user	0m6.705s
sys	0m1.280s


In [25]:
# Print the log file found under this model's logs/ directory
# ({MODEL_ID} is interpolated by IPython from the Python variable).
!cat /Users/joshplotkin/Dropbox/data_science/modeling-football-outcomes/models/{MODEL_ID}/logs/out

JSON configuration files passed checks.
cv sets wrote successfully.
True
successfully completed evaluation and plotting.
