In [1]:
import os
import sys
import time
import math

import numpy as np
import pandas as pd
# Let pandas show up to 500 columns when displaying wide DataFrames.
pd.set_option('display.max_columns', 500)

### Load data

In [2]:
# Read one CSV per season (start years 2002 through 2016), tag every row with
# its season and a game ID, then stack all seasons into a single DataFrame.
start = time.time()
seasons_df_list = []
for season_start_year in range(2002, 2017):
    # Files are named "<start year>-<end year>.csv" and live under ./data
    season_end_year = season_start_year + 1
    file_name = "{}-{}.csv".format(season_start_year, season_end_year)
    file_path = os.path.join(os.path.abspath("./data"), file_name)
    season_df = pd.read_csv(file_path)

    # Season label: the year in which the season starts
    season_df['SEASON_START_YEAR'] = season_start_year

    # Game ID string of the form "<season start year>_<GAME_NUM>"
    game_id_format = "{}_{{}}".format(season_start_year)
    season_df['GAME_ID'] = season_df.GAME_NUM.map(game_id_format.format)

    seasons_df_list.append(season_df)

seasons_df = pd.concat(seasons_df_list)
end = time.time()
print("Loaded data in {} minutes".format((end - start) / 60.))

Loaded data in 10.854057530562082 minutes


### Add final score difference data

Add a final score difference outcome variable to the event data, in addition to the pre-existing binary win/loss outcome variable. The value is positive when the final score favors the home team and negative when it favors the away team.

In [3]:
# Attach each game's final home-team score difference to every event row of
# that game, in a new column named OUTCOME_HT_SCORE_DIFF.

# Make the cell safe to re-run: if a previous run already added the outcome
# column, drop it first. Otherwise the merge below would produce
# OUTCOME_HT_SCORE_DIFF_x / OUTCOME_HT_SCORE_DIFF_y instead of one column.
seasons_df = seasons_df.drop('OUTCOME_HT_SCORE_DIFF', axis=1, errors='ignore')

# Group by game
grouped_by_game = seasons_df.groupby('GAME_ID')

# Take the last row of each game.
# NOTE(review): this assumes each game's rows are already in chronological
# order, so that the last row carries the final score -- confirm.
final_events = grouped_by_game.tail(1)

# Keep only the key and the score difference, and rename the latter.
# rename() returns a new frame, so the frame sliced out of final_events is
# not relabelled in place.
final_scores = final_events[['GAME_ID', 'HT_SCORE_DIFF']].rename(
    columns={'HT_SCORE_DIFF': 'OUTCOME_HT_SCORE_DIFF'}
)

# Left join seasons_df and final_scores on GAME_ID (one final-score row per game)
seasons_df = seasons_df.merge(final_scores, on="GAME_ID", how="left")

### Split Dataframe

Split seasons into two groups:
1.) Training and (internal) testing seasons; 2002-2003 through 2013-2014 seasons
2.) (External) testing/evaluation seasons; 2014-2015 through 2016-2017 seasons

In [4]:
# Create flags for seasons in event data
# Season start years in each group. np.arange excludes its stop value, so
# these cover 2002..2013 (internal) and 2014..2016 (external).
internal_seasons = np.arange(2002, 2014)
external_seasons = np.arange(2014, 2017)

# One boolean per event row: does the row's season belong to the group?
seasons_array = seasons_df['SEASON_START_YEAR'].values
internal_flag = np.isin(seasons_array, internal_seasons)
external_flag = np.isin(seasons_array, external_seasons)

# Row-filter the event data with each mask
internal_seasons_df = seasons_df[internal_flag]
external_seasons_df = seasons_df[external_flag]

# TO DO

### Create Train/Test/Evaluation sets in internal seasons

80/15/5 train/test/evaluation split by game

NOTE: Make sure splits are in random game order
NOTE: Make sure final arrays have right type

In [None]:
# Split the internal seasons into train / test / eval sets by game (roughly
# 80/15/5), so that all events of one game land in the same set, then convert
# each set to a float32 NumPy matrix.

def _to_float32_matrix(events_df):
    """Return the numeric and boolean columns of ``events_df`` as a float32 array.

    Non-numeric columns are left out because they cannot be cast to float:
    GAME_ID, for example, is a string such as "2002_1", and casting the whole
    frame would raise ValueError.
    """
    numeric_df = events_df.select_dtypes(include=[np.number, 'bool'])
    # .values replaces the deprecated DataFrame.as_matrix()
    return numeric_df.values.astype(np.float32)


# Count unique games in internal set
internal_games = internal_seasons_df.GAME_ID.unique()
num_internal_games = len(internal_games)

# Seeded shuffle of the games: drawing every game without replacement yields
# a random permutation that is reproducible across runs.
random_state = np.random.RandomState(seed = 42)
internal_games_sample = random_state.choice(internal_games, size = num_internal_games, replace = False)

# Calculate train/test/eval split sizes. int() keeps the sizes usable as slice
# bounds even where math.ceil returns a float. The test size is capped and the
# eval size is the remainder, so the three sizes always add up to the number
# of games and match the slices taken below.
num_train_games = int(math.ceil(num_internal_games * .80))
num_test_games = min(
    int(math.ceil(num_internal_games * .15)),
    num_internal_games - num_train_games
)
num_eval_games = num_internal_games - num_train_games - num_test_games

# Define splits on random sample
train_end = num_train_games
test_end = num_train_games + num_test_games
internal_games_train = internal_games_sample[:train_end]
internal_games_test = internal_games_sample[train_end:test_end]
internal_games_eval = internal_games_sample[test_end:]

# Fail loudly if the splits overlap or do not cover every game
assert np.intersect1d(internal_games_train, internal_games_test).size == 0
assert np.intersect1d(internal_games_train, internal_games_eval).size == 0
assert np.intersect1d(internal_games_test, internal_games_eval).size == 0
assert len(internal_games_eval) == num_eval_games
assert (len(internal_games_train) + len(internal_games_test)
        + len(internal_games_eval)) == num_internal_games

# Create flags for splits by game in event data
internal_games_array = internal_seasons_df.GAME_ID.values
internal_train_flag = np.isin(internal_games_array, internal_games_train)
internal_test_flag = np.isin(internal_games_array, internal_games_test)
internal_eval_flag = np.isin(internal_games_array, internal_games_eval)

# The matrices carry no column labels, so keep the column order for later use
internal_feature_columns = list(
    internal_seasons_df.select_dtypes(include=[np.number, 'bool']).columns
)

# Subset internal seasons df by train/test/eval splits.
# Rows keep their original order within each split; they are not shuffled.
train = _to_float32_matrix(internal_seasons_df.loc[internal_train_flag])
test_internal = _to_float32_matrix(internal_seasons_df.loc[internal_test_flag])
eval_internal = _to_float32_matrix(internal_seasons_df.loc[internal_eval_flag])

### Create Test set from external seasons

TODO: Make sure test set is in random game order


In [None]:
# Shuffle the external-season games with the same seeded RandomState used for
# the internal split, so the order is reproducible when the cells run in order.
external_games = external_seasons_df.GAME_ID.unique()
num_external_games = len(external_games)
external_games_sample = random_state.choice(external_games, size = num_external_games, replace = False)

# Preview the first rows of the training matrix. train is a NumPy array, which
# has no .head() method, so slice it instead.
train[:5]

### Split train/test/eval sets into X and Y

NOTE: Original and Scaled X's
NOTE: Binary Y's and Continuous Y's

### Save arrays to numpy archive