In [1]:
%run ../config/initialize.ipynb

import cPickle as pickle
import json
import os
import sys

from pyspark.sql import Window

In [3]:
## Notebook-wide figure defaults: dpi of 96 and a (12, 8) figure size.
rcParams.update({
    'figure.dpi': 96,
    'figure.figsize': (12, 8),
})

In [4]:
## Make the model_pipeline directory importable.
## The path is resolved to an absolute one because relative sys.path entries
## are looked up against the current working directory at import time, and
## this notebook later calls os.chdir(). The membership check keeps re-runs
## of this cell from stacking duplicate entries.
model_pipeline_dir = os.path.abspath(
    '../modeling-football-outcomes/model_pipeline'
)
if model_pipeline_dir not in sys.path:
    sys.path.append(model_pipeline_dir)

## Generate quick-and-dirty feature/label sets for testing purposes

In [22]:
## Disabled: runs each feature-engineering notebook (00 through 08, then
## combine_features) in order. Nothing in this cell executes.
## NOTE(review): every os.chdir() below uses the same relative path, so if this
## cell is re-enabled the second chdir would resolve against the directory
## entered by the first one -- confirm the intended working directory before
## uncommenting.
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 00_plan_and_ideas.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 01_time_date.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 02_teams.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 03_matchup.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 04_travel.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 05_homeaway.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 06_weather.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 07_line.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run 08_rankings.ipynb
# os.chdir('../modeling-football-outcomes/05_feature_engineering/')
# %run combine_features.ipynb

### Modeling ideas
* different labels (regression?)
* hyperparameter tuning
* model selection
* feature/team combinations, e.g. H-A DVOA
* narrow down to certain weeks

In [23]:
## ---- SPREAD ----
## DVOA columns keyed by team / season / week (cached).
dvoa_cols = [
    'team_id', 'season', 'week_id', 'dave_or_wtddvoa',
    'offensedvoa', 'defensedvoa', 's_t_dvoa',
]
ranks = spark.table('dvoa').select(*dvoa_cols).cache()

## Game/team keys plus the home flag (cached).
game_cols = ['game_id', 'season', 'week_id', 'team_id', 'is_home']
game_feats = spark.table('labels.team_game_line_labels') \
    .select(*game_cols) \
    .cache()

## Join on team/season/week (DataFrame.join defaults to an inner join), then
## replace remaining nulls with the -99999 sentinel.
join_keys = ['team_id', 'season', 'week_id']
features = game_feats.join(ranks, on=join_keys).fillna(-99999)

spread_label_cols = [
    ## index
    'game_id', 'team_id',
    ## strata
    'is_home', 'is_fav_sbr',
    ## labels
    'did_win', 'final_margin', 'did_cover_pfr',
    'did_cover_sbr', 'did_cover_sbr_open',
]
labels = spark.table('labels.team_game_line_labels') \
    .select(*spread_label_cols)

## the join must not have dropped or duplicated any rows
assert features.count() == labels.count()

features.write.mode('overwrite').saveAsTable('features.190320_test')
labels.write.mode('overwrite').saveAsTable('labels.190320_test')

## ---- OVER/UNDER ----
## Features and labels both come from the same over/under table.
ou_feature_cols = ['game_id', 'season', 'week_id', 'sbr_ou']
features = spark.table('labels.over_under_labels') \
    .select(*ou_feature_cols) \
    .fillna(-99999) \
    .cache()

ou_label_cols = [
    ## index
    'game_id',
    ## strata: none
    ## label
    'is_sbr_ou_over',
]
labels = spark.table('labels.over_under_labels').select(*ou_label_cols)

assert features.count() == labels.count()

features.write.mode('overwrite').saveAsTable('features.190320_ou_test')
labels.write.mode('overwrite').saveAsTable('labels.190320_ou_test')

## Parameter dictionaries

### Won/Lost

In [None]:
model_dict = {
    'model_id': '0111_did_win_initial',
    'features_tbl': 'features.190320_test',
    'labels_tbl': 'labels.190320_test',
    'features_list': [
        'season',
        'week_id',
        'is_home',
        'dave_or_wtddvoa',
        'offensedvoa',
        'defensedvoa',
        's_t_dvoa'
    ],
    'label_col': 'did_win',
    'pos_labels': [1],
    'neg_labels': [-1],
    'index': ['game_id','team_id'],
    'kfolds': 5,
    'kfold_seed': 99,
    'dataset_seed': 9,
    'strata_cols': ['did_win','is_home'],
    'global_dataset_proportions': {
        'holdout': 0.5,
        'throw_away': 0,
        'in_training': 0.5,
        'scoring_only': 0
    },
    'dimensional_dataset_proportions': {
    'throw_away': [{
        'dim': 'is_home',
        'from_groups': ['in_training','holdout','scoring_only'],
        'vals': [0],
        'prop_to_move': 1.
    }]
    },
    'model': 'xgboost.XGBClassifier',
    'model_params': {
        'booster':'gbtree',
        'gamma': 0,
        'learning_rate': 0.1,
        'max_depth': 3,
        'max_features': 'auto',
        'n_estimators': 100,
        'n_jobs': 1,
        'nthread': None,
        'objective': 'binary:logistic',
        'random_state': 9,
        'silent': True,
        'subsample': 0.9
    }    
}

### Over/Under

In [17]:
## Disabled alternative: over/under version of model_dict. Uncommenting it
## would replace the won/lost model_dict defined above.
## NOTE(review): 'features_tbl' / 'labels_tbl' below name '0111_ou_test'
## tables, while the over/under tables written earlier in this notebook are
## 'features.190320_ou_test' / 'labels.190320_ou_test' -- confirm the table
## names before re-enabling.
# model_dict = {
#     'model_id': '0111_ou_initial',
#     'features_tbl': 'features.0111_ou_test',
#     'labels_tbl': 'labels.0111_ou_test',
#     'features_list': ['season','week_id', 'sbr_ou'],
#     'label_col': 'is_sbr_ou_over',
#     'pos_labels': [1],
#     'neg_labels': [-1],
#     'index': ['game_id'],
#     'kfolds': 5,
#     'kfold_seed': 99,
#     'dataset_seed': 9,
#     'strata_cols': [],
#     'global_dataset_proportions': {
#         'holdout': 0.5,
#         'throw_away': 0,
#         'in_training': 0.5,
#         'scoring_only': 0
#     },
#     'dimensional_dataset_proportions': {},
#     'model': 'xgboost.XGBClassifier',
#     'model_params': {
#         'booster':'gbtree',
#         'gamma': 0,
#         'learning_rate': 0.1,
#         'max_depth': 3,
#         'max_features': 'auto',
#         'n_estimators': 200,
#         'n_jobs': 1,
#         'nthread': None,
#         'objective': 'binary:logistic',
#         'random_state': 9,
#         'silent': True,
#         'subsample': 0.9
#     }    
# }

In [None]:
## Sanity-check model_dict before any data is pulled.
## Every assertion carries a message so a failure names the offending entry.

## -- container types (checked first so the lookups further down are safe) --
for k in ['features_list', 'pos_labels',
          'neg_labels', 'index', 'strata_cols']:
    assert isinstance(model_dict[k], list), \
        "model_dict['%s'] must be a list" % k

for k in ['global_dataset_proportions',
          'dimensional_dataset_proportions',
          'model_params']:
    assert isinstance(model_dict[k], dict), \
        "model_dict['%s'] must be a dict" % k

## -- each dimensional entry is a list of dicts --
for group_name, rules in model_dict['dimensional_dataset_proportions'].items():
    assert isinstance(rules, list), \
        "dimensional_dataset_proportions['%s'] must be a list" % group_name
    assert all(isinstance(rule, dict) for rule in rules), \
        "dimensional_dataset_proportions['%s'] must contain only dicts" \
        % group_name

## -- global proportions: exactly the four known groups, summing to 1 --
global_props = model_dict['global_dataset_proportions']
expected_groups = set(['holdout', 'throw_away', 'in_training', 'scoring_only'])
assert set(global_props.keys()) == expected_groups, \
    "global_dataset_proportions keys must be exactly %s" \
    % sorted(expected_groups)

## Compared with a tolerance: an exact == 1 rejects valid splits such as
## 0.7 + 0.1 + 0.1 + 0.1, which does not add up to exactly 1.0 in floats.
props_total = sum(global_props.values())
assert abs(props_total - 1) < 1e-9, \
    "global_dataset_proportions must sum to 1 (got %r)" % props_total

## -- referenced columns exist in the labels table --
label_cols = set(spark.table(model_dict['labels_tbl']).columns)
needed_label_cols = (
    set(model_dict['index'])
    | set(model_dict['strata_cols'])
    | set([model_dict['label_col']])
)
missing_label_cols = needed_label_cols - label_cols
assert not missing_label_cols, \
    "columns missing from %s: %s" \
    % (model_dict['labels_tbl'], sorted(missing_label_cols))

## -- referenced columns exist in the features table --
feats = set(spark.table(model_dict['features_tbl']).columns)
needed_feat_cols = set(model_dict['features_list']) | set(model_dict['index'])
missing_feat_cols = needed_feat_cols - feats
assert not missing_feat_cols, \
    "columns missing from %s: %s" \
    % (model_dict['features_tbl'], sorted(missing_feat_cols))

## other assertions still to consider
# * tables exist (not checked explicitly; the spark.table() lookups above
#   are the only place a missing table would surface)

## CV

In [None]:
## Assemble the cross-validation datasets described by model_dict.
## The helpers called here are defined outside this notebook; the notes below
## describe only how their results are used in this cell.
cv_data = get_cv_data(model_dict)
## presumably converts the global proportions into cumulative cut points --
## TODO confirm against prop_dict_rolling
global_rolling = prop_dict_rolling(
    model_dict['global_dataset_proportions']
)
## first pass: assign each row a value in the 'dataset' column, passing the
## strata columns to assign_group
datasets = assign_group(
    model_dict, cv_data, global_rolling, 
    model_dict['strata_cols'], 'dataset'
)
## second pass: apply the per-dimension overrides to the 'dataset' column
datasets = modify_group_for_dim(
    model_dict, datasets, model_dict['dimensional_dataset_proportions'], 'dataset' 
)

## assert (1) training set is not empty
## (2) when k-folds are not used (kfolds <= 1), the scoring set is not empty
assert datasets.filter(col('dataset') == 'in_training').count() > 0
if model_dict['kfolds'] <= 1:
    assert datasets.filter(col('dataset') == 'scoring_only').count() > 0

## NOTE(review): scoring_rows is not referenced again in this cell, and
## get_training_scoring_sets below receives only training_rows -- confirm
## that the 'scoring_only' rows are not meant to be passed along.
scoring_rows = datasets.filter(col('dataset') == 'scoring_only')
training_rows = datasets.filter(col('dataset') == 'in_training')

training_rows = assign_k_folds(model_dict, training_rows)
training, scoring_only = get_training_scoring_sets(model_dict, training_rows)

## Train/Score

In [None]:
## Build the model object. get_model_obj also returns a flag saying whether
## the model is trained in memory (as opposed to on Spark).
model_obj, train_in_memory = get_model_obj(model_dict)

if train_in_memory is True:
    ## train on each training set, then score the matching scoring set
    training_scoring_dict = cv_train(model_dict, training,
                                     scoring_only, model_obj)
    scores_df = cv_score(model_dict, training_scoring_dict)
else:
    ## Only the in-memory path exists in this notebook. Stop here with a clear
    ## message instead of letting the plotting cells fail with a NameError on
    ## scores_df / training_scoring_dict.
    raise NotImplementedError(
        "get_model_obj returned train_in_memory=%r for model %r; only "
        "in-memory training is implemented in this notebook"
        % (train_in_memory, model_dict['model'])
    )

## Plot

#### plots.json

In [None]:
## Plot configuration: bin counts and bin types to draw, display names for the
## '0' / '1' labels, and the names used when reporting the success rate.
plots_dict = dict(
    plot_bins=[10, 100],
    label_map=dict([('0', 'Lost'), ('1', 'Won')]),
    bin_types=['Bin', 'Percentile'],
    eval_dict=dict(
        success_name='Win Rate',
        success_col='Won',
        failure_col='Lost',
    ),
)

#### Plot metrics by score threshold

In [None]:
## One plot per metric name, drawn from scores_df by plot_by_threshold
## (defined outside this notebook).
plot_by_threshold(scores_df, 'Accuracy')
plot_by_threshold(scores_df, 'F1')

#### Ridge plots

In [None]:
## Ridge plot of scores_df, configured by plots_dict (ridge_plot is defined
## outside this notebook).
ridge_plot(plots_dict, scores_df)

#### Plot distributions

In [None]:
## For every (bin type, bin count) pair: bin the scores, draw the bin plot,
## add a trend plot for percentile bins, and keep the binned data so it can be
## looked up later as binned_data[bin_type][nbins].
binned_data = {}
for bin_type in plots_dict['bin_types']:
    wants_trend = (bin_type == 'Percentile')
    bins_by_count = {}
    binned_data[bin_type] = bins_by_count
    for nbins in plots_dict['plot_bins']:
        binned = compute_bins(plots_dict, scores_df, nbins, bin_type)
        compute_plot_bins(plots_dict, binned, nbins, bin_type, colors)
        if wants_trend:
            plot_trend(plots_dict, binned, bin_type, nbins)
        bins_by_count[nbins] = binned

#### Plot ROC curves

In [None]:
#### ROC
## Keep the 'label' and 'score' columns of every set whose scored data has at
## least one row; sets with no scored rows are skipped.
## (The former `set_nbr in training_scoring_dict.keys()` condition was always
## true, since set_nbr is drawn from that same dict, so it has been dropped.)
roc_sets = {
    set_nbr: set_data['score'][['label', 'score']]
    for set_nbr, set_data in training_scoring_dict.items()
    if set_data['score'].shape[0] > 0
}

roc_plot_kfold_errband(roc_sets)

#### Feature importance

In [None]:
## Switch the working directory to a model output folder; the relative
## 'stats/reported/...' path used in the next cell resolves against it.
## NOTE(review): the folder is hard-coded to '0115_test_ou', while the active
## model_dict in this notebook has model_id '0111_did_win_initial' -- confirm
## this is the intended directory.
## NOTE(review): the path is relative, so running this cell a second time
## resolves it against the already-changed working directory.
os.chdir('../modeling-football-outcomes/models/0115_test_ou/')

In [None]:
## Fetch the feature-importance table, save it as CSV, and plot it.
## get_feat_importance_df takes no arguments here; presumably it reads from
## the current working directory -- TODO confirm.
importance = get_feat_importance_df()
## relative path: written under the current working directory
importance.to_csv('stats/reported/importance_agg.csv')
plot_feature_importance(importance)