In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import json

from nhlpy.constants import BASE_URL
from nhlpy import team,game,schedule #There are other modules but this should do it for now

## Create a function that returns an ndarray containing selected team statistics (see_column) from the last n_year years; this ndarray is sample_X, and sample_Y is whether the home team won.

In [2]:
# Open the 2020-2021 per-game winner file. JSON text is decoded as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
f = open('../winner_home_or_away/20202021_winner.json', encoding='utf-8')

In [3]:
# Parse the season file into a list of game records, and always release the
# file handle `f` afterwards (it was previously left open for the whole session).
try:
    season_data = json.load(f)
finally:
    f.close()
season_data[0]

{'Teams': ['Philadelphia Flyers', 'Pittsburgh Penguins'],
 'Abbrevs': ['PHI', 'PIT'],
 'GamePK': 2020020001,
 'Winner': 0}

In [4]:
# Load the all-years team statistics table and append two per-game rates:
# wins per game played ('W/GP') and points per game played ('P/GP').
tmstats = pd.read_csv('../tmstats_allyears.csv').assign(
    **{
        'W/GP': lambda frame: frame['W'] / frame['GP'],
        'P/GP': lambda frame: frame['PTS'] / frame['GP'],
    }
)
# column_names = tmstats.columns

In [5]:
## Example feature selection (see_column): the tmstats columns used per team.
# tmstats[see_column].dtypes -> every selected column is float or int.

see_column = np.array([
    'AvAge',
    'W/GP',
    'P/GP',
    'W',
    'PTS',
    'GF/G',
    'GA/G',
    'SRS',
    'SOS',
    'PP%',
    'PPA',
])

In [6]:
# Peek at one randomly chosen row to eyeball the table layout.
tmstats.sample(n=1)

Unnamed: 0,Rk,Team,AvAge,GP,W,L,OL,PTS,PTS%,GF,...,GvA,GvA/60,TkA,TkA/60,ENG,MsS,year,TeamAbbrevs,W/GP,P/GP
442,23,Buffalo Sabres,26.2,82,35,36,11,81,0.494,199,...,488,5.86,431.0,5.18,13.0,866,2015_2016,BUF,0.426829,0.987805


In [7]:
def tmstats_home_away_before(tmdf, home_tm, away_tm, gameyear, nyear = 2):
    """Collect the rows of `tmdf` for two teams over the seasons before `gameyear`.

    Season labels are built as '<gameyear-i-1>_<gameyear-i>' for
    i = 0 .. nyear-1, so rows come back ordered from the most recent season
    to the oldest. Seasons with no matching row are simply absent.

    Parameters
    ----------
    tmdf : pandas.DataFrame
        Team statistics; must have the columns 'year' and 'TeamAbbrevs'.
    home_tm, away_tm : str
        Values matched against tmdf['TeamAbbrevs'].
    gameyear : int
        Year used as the end of the most recent season label.
    nyear : int, default 2
        Number of season labels to look up.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Matching rows for the home team and for the away team, each with a
        fresh 0..n-1 index. A frame is empty when nothing matched.
    """
    wanted_years = ['{}_{}'.format(gameyear - i - 1, gameyear - i)
                    for i in range(nyear)]

    def _team_history(team_abbrev):
        # Filter on `tmdf` itself (not on a notebook-level global) so the
        # boolean masks always line up with the frame being indexed.
        is_team = tmdf['TeamAbbrevs'] == team_abbrev
        pieces = [tmdf.loc[(tmdf['year'] == season) & is_team]
                  for season in wanted_years]
        if not pieces:
            # nyear <= 0: nothing requested, so return an empty frame.
            return pd.DataFrame()
        # Concatenate once instead of growing a frame inside the loop.
        return pd.concat(pieces, ignore_index=True)

    return _team_history(home_tm), _team_history(away_tm)

In [8]:
def _weighted_feature_average(team_history, column, gamma):
    """Return the gamma-weighted mean of `column` over the rows of `team_history`.

    Row k (0-based) gets weight gamma**(k+1). Only the rows that exist are
    weighted; a season with no data does not consume a weight slot.
    An empty history yields an all-NaN vector of length len(column).
    """
    if team_history.empty:
        return np.full(len(column), np.nan)

    values = team_history[column].to_numpy()
    weights = np.power(gamma, np.arange(1, values.shape[0] + 1))
    return np.average(values, axis=0, weights=weights)


def one_X_Y(game, tmstats_all, column, nyear = 2, gamma = 0.9):
    """Build one (features, label) sample for a single game record.

    Parameters
    ----------
    game : dict
        Game record with keys 'GamePK', 'Abbrevs' and 'Winner'.
        'Abbrevs'[0] is treated as the home team and 'Abbrevs'[1] as the
        away team.
    tmstats_all : pandas.DataFrame
        Team statistics handed to `tmstats_home_away_before`.
    column : sequence of str
        Statistic columns to average for each team.
    nyear : int, default 2
        Number of previous seasons to look up.
    gamma : float, default 0.9
        Decay base for the per-row weights (see `_weighted_feature_average`).

    Returns
    -------
    sample_X : numpy.ndarray
        Home-team weighted averages followed by away-team weighted averages
        (length 2 * len(column)); NaN-filled for a team with no history.
    sample_Y
        game['Winner'], passed through unchanged.
        NOTE(review): presumably this encodes home win vs. away win; confirm
        against the code that produced the winner file.
    """
    # The leading digits of GamePK are the year; integer division avoids the
    # rounding risk of multiplying by the float 1e-6.
    gameyear = int(game['GamePK']) // 10**6

    home_name, away_name = game['Abbrevs'][0], game['Abbrevs'][1]

    subdf_home, subdf_away = tmstats_home_away_before(tmstats_all,
                                                      home_name,
                                                      away_name,
                                                      gameyear,
                                                      nyear)

    home_wavg = _weighted_feature_average(subdf_home, column, gamma)
    away_wavg = _weighted_feature_average(subdf_away, column, gamma)

    sample_X = np.concatenate((home_wavg, away_wavg))
    sample_Y = game['Winner']
    return sample_X, sample_Y

In [9]:
# Build one feature vector and one label per game of the season.
n_year = 5  # number of previous seasons averaged for each team

TrainX = []
TrainY = []
# The loop variable is not called `game` so that it does not overwrite the
# `game` module imported from nhlpy at the top of the notebook.
for game_record in season_data:
    sample_X, sample_Y = one_X_Y(game_record, tmstats, column = see_column,
                                 nyear = n_year, gamma = 0.9)
    TrainX.append(sample_X)
    TrainY.append(sample_Y)

In [10]:
# Turn the labels into a (1, n_games) row array. reshape keeps this cell
# idempotent: re-running it no longer nests an extra dimension each time.
TrainY = np.asarray(TrainY).reshape(1, -1)

In [11]:
# Output column labels: every feature in see_column with an '_H' (home)
# suffix, then the same features with an '_A' (away) suffix, then the label
# column 'Class'. Deriving them from see_column keeps the labels in step
# with the feature list instead of maintaining a second hand-written copy.
column_name = np.array([name + '_H' for name in see_column]
                       + [name + '_A' for name in see_column]
                       + ['Class'])

In [12]:
# Append the label column (TrainY transposed to one value per row) to the
# right of the feature matrix, then wrap the result in a labelled DataFrame.
dataset = np.concatenate([np.asarray(TrainX), TrainY.T], axis = 1)

df = pd.DataFrame(data = dataset, columns = column_name)
df.sample(n = 10)

Unnamed: 0,AvAge_H,W/GP_H,P/GP_H,W_H,PTS_H,GF/G_H,GA/G_H,SRS_H,SOS_H,PP%_H,...,P/GP_A,W_A,PTS_A,GF/G_A,GA/G_A,SRS_A,SOS_A,PP%_A,PPA_A,Class
362,29.29328,0.522743,1.138091,41.6509,90.686137,2.934888,2.860182,0.06017,-0.0197,20.120311,...,1.220925,43.915129,95.199262,3.136052,2.827085,0.296015,-0.023653,20.084391,46.214022,1.0
920,28.41757,0.515695,1.155773,40.527338,90.751386,2.867473,2.82752,0.042619,0.009609,17.671216,...,1.293739,45.965227,101.900393,3.089116,2.560526,0.48331,-0.024132,23.679249,41.4385,0.0
397,27.239857,0.51057,1.144919,40.359674,90.492515,3.163445,2.972827,0.155716,-0.011701,22.004255,...,1.156746,42.37833,91.826524,3.063319,2.876867,0.199171,-0.003416,20.680457,51.651999,0.0
849,28.591815,0.546797,1.217926,43.227052,96.281312,2.965264,2.712383,0.25696,-0.004224,17.768794,...,1.11333,38.525921,87.220727,2.81546,2.82913,0.000981,0.004648,18.86154,38.522429,0.0
804,26.847183,0.493127,1.11333,38.525921,87.220727,2.81546,2.82913,0.000981,0.004648,18.86154,...,1.097511,38.684452,86.981832,2.969807,3.005417,-0.010346,0.017328,18.143575,48.309834,0.0
472,29.29328,0.522743,1.138091,41.6509,90.686137,2.934888,2.860182,0.06017,-0.0197,20.120311,...,1.239367,44.888501,98.071769,2.893054,2.632173,0.271211,0.005455,20.853433,42.692511,1.0
530,28.240141,0.575194,1.257768,45.325584,99.180337,3.231946,2.814201,0.409382,-0.014816,22.486996,...,1.155773,40.527338,90.751386,2.867473,2.82752,0.042619,0.009609,17.671216,43.32241,0.0
264,28.196576,0.566799,1.239367,44.888501,98.071769,2.893054,2.632173,0.271211,0.005455,20.853433,...,1.035943,37.428585,82.268125,2.591869,2.722259,-0.15181,-0.003132,18.298652,45.338404,1.0
935,28.841328,0.562741,1.220925,43.915129,95.199262,3.136052,2.827085,0.296015,-0.023653,20.084391,...,1.056579,37.431272,83.95331,2.778839,2.897051,-0.102295,0.000662,17.541592,50.285365,0.0
939,28.178555,0.470782,1.056579,37.431272,83.95331,2.778839,2.897051,-0.102295,0.000662,17.541592,...,1.220925,43.915129,95.199262,3.136052,2.827085,0.296015,-0.023653,20.084391,46.214022,0.0


In [13]:
# Persist the assembled training table as CSV in the current working directory.
# NOTE(review): the file name says 'traing' (likely 'training'); it is left
# unchanged here because other code may read this exact path.
df.to_csv(path_or_buf='20202021_traing.csv')