In [None]:
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# **GRADIENT BOOSTING**

In [None]:
# Slow learning rate paired with many boosting stages; every leaf must hold
# at least 10 training samples.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same fitted model.
model = GradientBoostingClassifier(learning_rate=0.01,
                                   n_estimators=1000,
                                   min_samples_leaf=10,
                                   random_state=1)
model.fit(X_train, y_train)

In [None]:
# Class-probability estimates for the *training* rows, shown as the cell output.
model.predict_proba(X_train)

In [None]:
# Score the fitted classifier on the held-out split (for scikit-learn
# classifiers `score` reports mean accuracy).
model.score(X_test, y_test)

In [None]:
# Labels for the importance chart. They must be listed in the same order as
# the columns the model was trained on.
names = ['team_id', 'HoA', 'goals', 'shots', 'hits', 'pim', 'powerPlayOpportunities', 'powerPlayGoals', 'faceOffWinPercentage', 'giveaways', 'takeaways', 'streak', 'time_travel']

# Fail loudly on a label/feature mismatch: the zip() below would otherwise
# truncate silently and pair importances with the wrong names.
assert len(names) == len(model.feature_importances_), (
    "names has %d labels but the model was fit on %d features"
    % (len(names), len(model.feature_importances_))
)

# Rescale so the importances sum to 100, then sort ascending so the most
# important feature ends up at the top of the horizontal bar chart.
feature_importances = 100*model.feature_importances_ / np.sum(model.feature_importances_)
feature_importances, feature_names, feature_idxs = \
    zip(*sorted(zip(feature_importances, names, range(len(names)))))

# Bar thickness; 0.8 is also matplotlib's default for barh.
width = 0.8

idx = np.arange(len(names))
plt.barh(idx, feature_importances, height=width, align='center')
plt.yticks(idx, feature_names)

plt.title("Feature Importances in Gradient Booster")
plt.xlabel('Relative Importance of Feature', fontsize=14)
plt.ylabel('Feature Name', fontsize=14);

In [None]:
# Partial-dependence plots for the fitted model; `names` is passed as the
# feature selection and the work is spread over 3 jobs.
plot_partial_dependence(
    model,
    X_train,
    names,
    n_jobs=3,
    grid_resolution=100,
)

# The call above draws on the current figure; fetch it to tune panel spacing.
fig = plt.gcf()
# TODO: give the figure a suptitle that describes this dataset.
fig.subplots_adjust(hspace=.1, wspace=.3)

In [None]:
# Second feature matrix: same source frame with 'goals' and the other
# listed columns removed.
# NOTE(review): 'powerPlayGoals' is not in this drop list - confirm that
# keeping it is intended now that 'goals' is excluded.
X_nogoals = df.drop(columns=[
    'game_id', 'goals', 'won', 'settled_in',
    'head_coach', 'venue_time_zone_offset', 'prev_time',
])

# Same test fraction and seed as the first split.
X_train2, X_test2, y_train2, y_test2 = cv.train_test_split(
    X_nogoals,
    y,
    test_size=0.25,
    random_state=1,
)

In [None]:
# Refit on the feature set without 'goals'. This rebinds `model`, so later
# cells see this second estimator.
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All reproduces the same fitted model and score.
model = GradientBoostingClassifier(learning_rate=0.02,
                                   n_estimators=1000,
                                   min_samples_leaf=10,
                                   random_state=1)
model.fit(X_train2, y_train2)
# Score on the held-out split (displayed as the cell output).
model.score(X_test2, y_test2)

In [None]:
# Take the labels from the frame the model was fit on, so every importance is
# paired with the right column. A hand-maintained list can drift from the
# columns that actually remain in X_nogoals, and zip() would then silently
# truncate and mislabel the bars.
names = list(X_train2.columns)
assert len(names) == len(model.feature_importances_), (
    "names has %d labels but the model was fit on %d features"
    % (len(names), len(model.feature_importances_))
)

# Rescale so the importances sum to 100, then sort ascending so the most
# important feature ends up at the top of the horizontal bar chart.
feature_importances = 100*model.feature_importances_ / np.sum(model.feature_importances_)
feature_importances, feature_names, feature_idxs = zip(*sorted(zip(feature_importances, names, range(len(names)))))

# Bar thickness; 0.8 is also matplotlib's default for barh.
width = 0.8

idx = np.arange(len(names))
plt.barh(idx, feature_importances, height=width, align='center')
plt.yticks(idx, feature_names)

plt.title("Feature Importances in Gradient Booster")
plt.xlabel('Relative Importance of Feature', fontsize=14)
plt.ylabel('Feature Name', fontsize=14);

In [None]:
# Season key used by the helpers below, and the rows of season_df that
# belong to it.
season = 20122013
season_20122013 = season_df[season_df['season'] == season]

def seasons(df, season):
    """Add per-team "before this game" running averages for one season.

    Rows are taken from the module-level ``season_team`` frame where
    ``season_team.season == season``. For each row, every new column holds
    the team's total over its *earlier* rows (cumulative sum minus the row's
    own value) divided by the number of earlier rows, so the current game
    never contributes to its own features. A team's first row is therefore
    0/0 and comes out as NaN.

    Assumes the rows of ``season_team`` are already in playing order within
    each team - TODO confirm against the cell that builds it.

    Args:
        df: Not read by this function; kept so existing calls keep working.
            NOTE(review): the rows come from the global ``season_team``
            rather than from this argument - confirm whether that is
            intended before switching the source.
        season: Value compared against the ``season`` column.

    Returns:
        A new DataFrame with the filtered rows plus the columns ``gpg``,
        ``win_per``, ``shots_pg``, ``hits_pg``, ``pim_pg``, ``fo_per``,
        ``giveaways_pg``, ``takeaways_pg`` and ``ppg_per``.
        ``season_team`` itself is not modified.
    """
    filtered = season_team[season_team.season == season]

    # Group once and reuse it; the grouping depends only on team_id.
    by_team = filtered.groupby('team_id')
    # Number of rows the team already has before the current one (0 for its first).
    games_before = by_team.cumcount()

    def prior_total(column):
        # Running team total of `column`, excluding the current row.
        return by_team[column].cumsum() - filtered[column]

    # derived column -> source column; dict order fixes the output column order.
    per_game_sources = {
        'gpg': 'goals',
        'win_per': 'won',
        'shots_pg': 'shots',
        'hits_pg': 'hits',
        'pim_pg': 'pim',
        'fo_per': 'faceOffWinPercentage',
        'giveaways_pg': 'giveaways',
        'takeaways_pg': 'takeaways',
    }
    derived = {
        name: prior_total(source) / games_before
        for name, source in per_game_sources.items()
    }
    # Power-play conversion: prior power-play goals over prior opportunities.
    derived['ppg_per'] = (
        prior_total('powerPlayGoals') / prior_total('powerPlayOpportunities')
    )

    # assign() builds a fresh frame, so nothing is written onto a slice of
    # season_team (the cause of SettingWithCopyWarning).
    return filtered.assign(**derived)

def filter_home_away(df_filtered, df):
    """Left-join home-side and away-side team stats onto ``df`` by game.

    For each side ('home' first, then 'away') the rows of ``df_filtered``
    whose ``HoA`` equals that side are reduced to ``game_id``, ``season`` and
    the stat columns. The stat columns are prefixed with ``home_`` or
    ``away_`` and the result is merged onto the running frame on
    ``game_id`` with ``how='left'``. ``season`` is not renamed, so pandas
    applies its default ``_x`` / ``_y`` suffixes wherever both sides of a
    merge carry it.

    Args:
        df_filtered: Per-team rows holding ``HoA``, ``game_id`` and the stat
            columns.
        df: Frame with a ``game_id`` column to attach the stats to.

    Returns:
        The merged DataFrame; ``df`` itself is not modified.
    """
    key_columns = ['game_id', 'season']
    stat_columns = ['streak', 'time_travel', 'gpg', 'win_per', 'ppg_per',
                    'shots_pg', 'hits_pg', 'pim_pg', 'fo_per',
                    'giveaways_pg', 'takeaways_pg']

    merged = df
    # Order matters: home columns are attached before away columns.
    for side in ('home', 'away'):
        side_rows = df_filtered[df_filtered.HoA == side]
        side_stats = side_rows.filter(items=key_columns + stat_columns)
        prefixed = {column: side + '_' + column for column in stat_columns}
        side_stats = side_stats.rename(columns=prefixed)
        merged = merged.merge(side_stats, on='game_id', how='left')

    return merged

def replace_bools_fillna(df):
    """Drop bookkeeping columns, encode flags/outcome as 0/1, and zero-fill.

    Steps, in order:
      1. Drop ``season_x``, ``season_y``, ``game_id``, ``type``,
         ``date_time`` and ``date_time_GMT``.
      2. In ``home_streak`` and ``away_streak`` replace False with 0 and
         True with 1; in ``outcome`` replace 'loss' with 0 and 'win' with 1.
      3. Replace every remaining missing value with 0.

    Args:
        df: Frame containing the columns named above.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    bookkeeping = ['season_x', 'season_y', 'game_id', 'type', 'date_time', 'date_time_GMT']
    # (column, value to replace, replacement), applied one at a time in this order.
    encodings = (
        ('home_streak', False, 0),
        ('home_streak', True, 1),
        ('away_streak', False, 0),
        ('away_streak', True, 1),
        ('outcome', 'loss', 0),
        ('outcome', 'win', 1),
    )

    result = df.drop(columns=bookkeeping)
    for column, old_value, new_value in encodings:
        result = result.replace({column: old_value}, new_value)

    return result.fillna(0)

In [None]:
# Remove the 'season' column from the per-season frame.
# errors='ignore' keeps this cell safe to re-run: the name is rebound to its
# own result, so on a second execution the column is already gone and a
# plain drop would raise KeyError.
season_20122013 = season_20122013.drop(columns=['season'], errors='ignore')

In [None]:
### replace_bools_fillna(filter_home_away(seasons(season_20122013_test, season), season_20122013_test))