In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
## from notebook creat_trainXY_dataset; modified version

def tmstats_home_away_before(tmdf, home_tm, away_tm, gameyear, nyear = 2):
    """Collect the team-stat rows of both teams for the seasons before a game.

    Parameters
    ----------
    tmdf : pandas.DataFrame
        Team statistics; must have a 'year' column holding labels of the form
        '<start>_<end>' (e.g. '2019_2020') and a 'TeamAbbrevs' column.
    home_tm, away_tm : str
        Values matched against 'TeamAbbrevs'.
    gameyear : int
        The look-back starts at the season labelled
        '<gameyear-1>_<gameyear>' and walks backwards one season at a time.
    nyear : int, default 2
        Number of seasons to look back.

    Returns
    -------
    (subdf_home, subdf_away) : tuple of pandas.DataFrame
        Matching rows, most recent season first, with a fresh 0..n-1 index.
        A frame is empty when the team has no rows in the requested seasons.
    """
    home_parts = []
    away_parts = []

    for i in range(nyear):
        want_year_str = str(gameyear-i-1)+'_'+str(gameyear-i)

        # Build the mask from the frame that was passed in (not from a
        # notebook-level global) so the function works on any stats table.
        in_season = tmdf['year'] == want_year_str
        home_parts.append(tmdf.loc[in_season & (tmdf['TeamAbbrevs'] == home_tm)])
        away_parts.append(tmdf.loc[in_season & (tmdf['TeamAbbrevs'] == away_tm)])

    if not home_parts:
        # nyear <= 0: nothing was requested; pd.concat would raise on an empty list
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once at the end instead of growing a frame inside the loop.
    subdf_home = pd.concat(home_parts, ignore_index=True)
    subdf_away = pd.concat(away_parts, ignore_index=True)

    return subdf_home, subdf_away

def _gamma_weighted_average(subdf, column, gamma_list):
    """Return one discounted column average per gamma, shape (len(gamma_list), len(column))."""
    if subdf.empty:
        # No history for this team: NaN placeholders keep the output shape fixed.
        return np.full((len(gamma_list), len(column)), np.nan)

    values = subdf[column].to_numpy()
    # Row k (0-based, in the order the rows were collected) gets weight gamma**(k+1).
    # Seasons without data are simply absent, so the weights follow row
    # position rather than the true distance in years.
    exponents = np.arange(1, values.shape[0] + 1)
    return np.array([np.average(values, axis=0, weights=np.power(gamma, exponents))
                     for gamma in gamma_list])


def one_X_Y(game, tmstats_all, column, nyear = 2, gamma_list = [0.9]):
    """Build the feature rows and the label for a single game.

    Parameters
    ----------
    game : mapping
        Must provide 'GamePK' (integer id whose leading digits, above the last
        six, give the year), 'Abbrevs' (index 0 = home team, index 1 = away
        team) and 'Winner'.
    tmstats_all : pandas.DataFrame
        Team statistics table handed to ``tmstats_home_away_before``.
    column : sequence of str
        Statistic columns to average.
    nyear : int, default 2
        Number of past seasons to look back.
    gamma_list : sequence of float, shape (n,)
        Discount factors; one feature row is produced per gamma.

    Returns
    -------
    sample_X : numpy.ndarray, shape (len(gamma_list), 2 * len(column))
        Home-team averages followed by away-team averages. A team with no
        history contributes NaN.
    sample_Y
        ``game['Winner']`` unchanged.
    """
    # Integer division avoids the float round-trip of multiplying by 1e-6.
    gameyear = int(game['GamePK']) // 1_000_000

    home_name = game['Abbrevs'][0]
    away_name = game['Abbrevs'][1]

    subdf_home, subdf_away = tmstats_home_away_before(tmstats_all,
                                                      home_name,
                                                      away_name,
                                                      gameyear,
                                                      nyear)

    home_wavg = _gamma_weighted_average(subdf_home, column, gamma_list)
    away_wavg = _gamma_weighted_average(subdf_away, column, gamma_list)

    sample_X = np.concatenate((home_wavg, away_wavg), axis = 1)
    sample_Y = game['Winner']
    return sample_X, sample_Y

# Generate training X and training Y data

In [3]:
# Load the all-seasons team statistics and derive the per-game win and point rates.
tmstats = pd.read_csv('../tmstats_allyears.csv').assign(
    **{
        'W/GP': lambda stats: stats['W']/stats['GP'],
        'P/GP': lambda stats: stats['PTS']/stats['GP'],
    }
)

#### Initial settings / hyperparameters

In [4]:
## Feature selection: collect every available team statistic for now and pick
## the informative ones later.  (An earlier hand-picked subset was: AvAge, W/GP,
## P/GP, W, PTS, GF/G, GA/G, SRS, SOS, PP%, PPA.)
see_column = np.array([
    'AvAge', 'GP', 'W', 'L', 'OL', 'PTS', 'PTS%', 'GF', 'GA',
    'SOW', 'SOL', 'SRS', 'SOS', 'GF/G', 'GA/G', 'PP', 'PPO', 'PP%', 'PPA',
    'PPOA', 'PK%', 'SH', 'SHA', 'PIM/G', 'oPIM/G', 'S', 'S%', 'SA', 'SV%',
    'SO', '5v5 TOI/GP', 'SAT%', 'Hits', 'Hits/60', 'BkS', 'BkS/60', 'GvA',
    'GvA/60', 'TkA', 'TkA/60', 'ENG', 'MsS', 'W/GP', 'P/GP',
])

# Output header: every feature for the home side, then for the away side,
# then the label column.
column_home = np.array([f'{name}_H' for name in see_column])
column_away = np.array([f'{name}_A' for name in see_column])
column_name = np.concatenate((column_home, column_away, ['class']))

# Discount factors to sweep, and the number of past seasons to look back.
g_list = np.array([0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1])
n_year = 5

In [5]:
# Show the assembled header: '_H' (home) features, '_A' (away) features, then 'class'.
column_name

array(['AvAge_H', 'GP_H', 'W_H', 'L_H', 'OL_H', 'PTS_H', 'PTS%_H', 'GF_H',
       'GA_H', 'SOW_H', 'SOL_H', 'SRS_H', 'SOS_H', 'GF/G_H', 'GA/G_H',
       'PP_H', 'PPO_H', 'PP%_H', 'PPA_H', 'PPOA_H', 'PK%_H', 'SH_H',
       'SHA_H', 'PIM/G_H', 'oPIM/G_H', 'S_H', 'S%_H', 'SA_H', 'SV%_H',
       'SO_H', '5v5 TOI/GP_H', 'SAT%_H', 'Hits_H', 'Hits/60_H', 'BkS_H',
       'BkS/60_H', 'GvA_H', 'GvA/60_H', 'TkA_H', 'TkA/60_H', 'ENG_H',
       'MsS_H', 'W/GP_H', 'P/GP_H', 'AvAge_A', 'GP_A', 'W_A', 'L_A',
       'OL_A', 'PTS_A', 'PTS%_A', 'GF_A', 'GA_A', 'SOW_A', 'SOL_A',
       'SRS_A', 'SOS_A', 'GF/G_A', 'GA/G_A', 'PP_A', 'PPO_A', 'PP%_A',
       'PPA_A', 'PPOA_A', 'PK%_A', 'SH_A', 'SHA_A', 'PIM/G_A', 'oPIM/G_A',
       'S_A', 'S%_A', 'SA_A', 'SV%_A', 'SO_A', '5v5 TOI/GP_A', 'SAT%_A',
       'Hits_A', 'Hits/60_A', 'BkS_A', 'BkS/60_A', 'GvA_A', 'GvA/60_A',
       'TkA_A', 'TkA/60_A', 'ENG_A', 'MsS_A', 'W/GP_A', 'P/GP_A', 'class'],
      dtype='<U12')

In [6]:
# Team stats are only available for 2009 - 2021, so with n_year = 5 the
# training data can reach back no further than the 2014 season.
# Print the seven season ids obtained by stepping back from 20212022.
for i, a in enumerate(range(20212022 - 10001, 20212022 - 8*10001, -10001)):
    print(a)

20202021
20192020
20182019
20172018
20162017
20152016
20142015


In [7]:
# Build one feature/label table per season and per discount factor and save
# each as '<season>_<gamma>.csv'.
# The files go into the same folder the per-gamma merge step reads from, so
# the notebook runs top to bottom without moving files by hand.
XY_folder = "trainXY/"
os.makedirs(XY_folder, exist_ok=True)

a = 20212022
for i in range(7):
    a = a - 10001
    filename = '../winner_home_or_away/'+str(a)+'_winner.json'
    # The context manager closes the file even if JSON parsing fails.
    with open(filename) as f:
        print(f'creating dataset {a}')
        season_data = json.load(f)

    TrainX = []
    TrainY = []
    for game in season_data:
        sample_X, sample_Y = one_X_Y(game, tmstats, column = see_column,
                                     nyear = n_year, gamma_list = g_list)
        TrainX.append(sample_X)
        TrainY.append(sample_Y)
    TrainX = np.array(TrainX)    # (n_games, len(g_list), 2 * len(see_column))
    TrainY = np.array([TrainY])  # (1, n_games); transposed below into a column

    # TrainX holds the features for every gamma; write one file per gamma.
    # The index is named gi so it does not shadow the season loop's i.
    for gi, g in enumerate(g_list):
        save_file = XY_folder+str(a)+'_'+str(g)+'.csv'
        Xnow = TrainX[:, gi, :]
        dataset = np.concatenate((Xnow, TrainY.T), axis = 1)

        Traindf = pd.DataFrame(dataset, columns=column_name)
        Traindf.to_csv(save_file, index = False)

creating dataset 20202021
creating dataset 20192020
creating dataset 20182019
creating dataset 20172018
creating dataset 20162017
creating dataset 20152016
creating dataset 20142015


In [None]:
# For every discount factor, stack the seven per-season tables into a single
# training file named '20142021_<gamma>.csv'.
XY_folder = "trainXY/"

for g in g_list:
    a = 20212022
    season_frames = []
    for i in range(7):
        a = a - 10001
        filename = str(a)+'_'+str(g)+'.csv'
        season_frames.append(pd.read_csv(XY_folder+filename))

    # Concatenate and write once per gamma, after all seasons are loaded,
    # instead of rewriting the combined file on every season.
    TrainXY = pd.concat(season_frames, ignore_index=True)
    savename = str(20142021)+'_'+str(g)+'.csv'
    TrainXY.to_csv(savename, index = False)