In [1]:
# load all the json files

import json
from pprint import pprint
import numpy as np
import time
import glob

data = {}

# the .json files are crawled by https://github.com/FranGoitia/basketball_reference
path = [
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2003-2004/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2004-2005/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2005-2006/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2006-2007/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2007-2008/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2008-2009/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2009-2010/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2010-2011/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2011-2012/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2012-2013/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2013-2014/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2014-2015/*.json",
    "/Users/hty/Google Drive/codes/basketball_reference/matches/united_states/nba/2015-2016/*.json",
    ]

start = time.time()
for season in path:
    yearPos = season.find('20')
    year = season[yearPos : yearPos + 9] # e.g. year = '2003-2004'
    data[year] = {}
    for filename in glob.glob(season):
        with open(filename, 'r') as f:
            game = filename[-1 - 4 - 12: -1 - 4]
            data[year][game] = json.load(f)            
            
print ("time used:", time.time() - start, 'seconds')

time used: 19.892848014831543 seconds


In [3]:
# find out the names of all teams

team_names = set()
for season in (data.keys()):
    for key in (data[season].keys()):
        team_names.add(data[season][key]['home']['name'])

team_name = list(team_names)

print(team_name)

['Toronto Raptors', 'Philadelphia 76ers', 'Houston Rockets', 'Atlanta Hawks', 'Sacramento Kings', 'Chicago Bulls', 'New Orleans Hornets', 'New Orleans/Oklahoma City Hornets', 'Charlotte Hornets', 'Utah Jazz', 'New York Knicks', 'New Orleans Pelicans', 'Los Angeles Clippers', 'Portland Trail Blazers', 'Phoenix Suns', 'Minnesota Timberwolves', 'Denver Nuggets', 'San Antonio Spurs', 'Memphis Grizzlies', 'New Jersey Nets', 'Miami Heat', 'Los Angeles Lakers', 'Cleveland Cavaliers', 'Brooklyn Nets', 'Washington Wizards', 'Boston Celtics', 'Dallas Mavericks', 'Seattle SuperSonics', 'Golden State Warriors', 'Charlotte Bobcats', 'Milwaukee Bucks', 'Oklahoma City Thunder', 'Orlando Magic', 'Indiana Pacers', 'Detroit Pistons']


In [4]:
# construct features matrix (X matrix)
# this block takes a few minutes to run

import pandas as pd

X = {}

# counting wins for each team (win: True; loss: False)
win_counts = {}

num_of_game_used_to_cal_ave = 82

start = time.time()
for name in team_name:
    X[name] = pd.DataFrame()
    win_counts[name] = {}
    for season in sorted(data.keys()):
    #     sas_data[season] = {}
        tmp_df = pd.DataFrame()
        win_counts[name][season] = []
        for game_id in (data[season].keys()):
            if data[season][game_id]['home']['name'] == name:
                
                df_self = pd.DataFrame(data[season][game_id]['home']['totals'], index=[game_id])
                df_opponent = pd.DataFrame(data[season][game_id]['away']['totals'], index=[game_id])
                
                # get the difference of the stats between two teams
                tmp_df = tmp_df.append(df_self - df_opponent)
                
                win_counts[name][season].append( 
                    data[season][game_id]['home']['totals']['PTS'] > data[season][game_id]['away']['totals']['PTS']
                )
                
            elif data[season][game_id]['away']['name'] == name:

                df_self = pd.DataFrame(data[season][game_id]['away']['totals'], index=[game_id])
                df_opponent = pd.DataFrame(data[season][game_id]['home']['totals'], index=[game_id])
                
                tmp_df = tmp_df.append(df_self -df_opponent)

                win_counts[name][season].append( 
                    data[season][game_id]['home']['totals']['PTS'] < data[season][game_id]['away']['totals']['PTS']
                )
            
    
        if season != '2015-2016':
            get_ave_stat = tmp_df.mean()
        else:
            # select the first X games to calculate the average stats for the season that we are predicting
            get_ave_stat = tmp_df[0:num_of_game_used_to_cal_ave].mean()
        
        temp = get_ave_stat.to_frame(season)
        get_ave_stat = temp.transpose()
        X[name] = X[name].append(get_ave_stat)

print(f"Features: {X[team_name[0]].columns}")
print ("time used:", time.time() - start)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Features: Index(['STL%', 'FT', '2PA', 'FG', 'DRB', 'ORB%', 'AST', '3PAr', 'PF', 'FGA',
       'DRBr', '2P', 'ORBr', 'TOV%', 'AST%', 'FTAr', 'FIC', 'eFG%', 'FG%',
       '2PAr', '+/-', 'USG%', 'DRtg', '2P%', 'DRB%', 'ORtg', 'TRB%', 'ORB',
       '3P', 'TOV', 'STL/TOV', 'TSA', 'AST/TOV', '3PA', 'BLK%', 'FT%', 'PTS',
       'HOB', 'STL', 'TRB', 'FTA', 'BLK', 'FTr', 'TS%', 'FT/FGA', '3P%'],
      dtype='object')
time used: 237.86986994743347


In [5]:
# transfer data to new team names

X['Brooklyn Nets'][0:(len(path) - 4)] = X['New Jersey Nets'][0:(len(path) - 4)]
X['New Orleans Hornets'][(len(path) - 11):(len(path) - 9)] = X['New Orleans/Oklahoma City Hornets'][(len(path) - 11):(len(path) - 9)]
X['New Orleans Pelicans'][0:(len(path) - 3)] = X['New Orleans Hornets'][0:(len(path) - 3)]
X['Charlotte Hornets'][0:(len(path) - 2)] = X['Charlotte Bobcats'][0:(len(path) - 2)]
X['Oklahoma City Thunder'][0:(len(path) - 8)] = X['Seattle SuperSonics'][0:(len(path) - 8)]

In [6]:
# delete old teams

X.pop('New Jersey Nets')
X.pop('New Orleans/Oklahoma City Hornets')
X.pop('New Orleans Hornets')
X.pop('Charlotte Bobcats')
X.pop('Seattle SuperSonics')


Unnamed: 0,+/-,2P,2P%,2PA,2PAr,3P,3P%,3PA,3PAr,AST,...,STL%,STL/TOV,TOV,TOV%,TRB,TRB%,TS%,TSA,USG%,eFG%
2003-2004,-1.268293,-3.390244,-0.001535,-7.0,-0.081923,2.890244,0.025147,6.609756,0.081923,0.658537,...,0.429268,0.032219,-0.365854,-0.095122,-3.329268,-4.1,0.006176,-1.710244,0.0,0.013072
2004-2005,3.634409,-2.602151,-0.009855,-4.086022,-0.062777,2.0,0.008637,5.172043,0.062777,-2.602151,...,0.701075,0.02982,-0.139785,-0.272043,2.989247,3.770968,0.003989,1.067097,0.0,-0.001061
2005-2006,-6.04878,-2.292683,-0.031924,-0.865854,-0.02636,0.768293,-0.011175,2.268293,0.02636,-3.378049,...,0.307317,0.025207,0.231707,0.110976,-0.890244,-0.978049,-0.019874,0.592195,0.0,-0.02266
2006-2007,-5.333333,-0.950617,-0.023289,0.703704,0.000242,0.111111,0.007493,0.111111,-0.000242,-2.111111,...,0.607407,0.032294,0.148148,0.192593,-1.333333,-1.801235,-0.012408,-0.363951,0.0,-0.015805
2007-2008,-17.512195,1.597561,-0.018845,6.5,0.085912,-3.585366,-0.055416,-7.768293,-0.085912,-3.036585,...,-2.203659,-0.254663,2.646341,2.365854,0.52439,0.52561,-0.032072,-2.244878,0.0,-0.034689
2008-2009,,,,,,,,,,,...,,,,,,,,,,
2009-2010,,,,,,,,,,,...,,,,,,,,,,
2010-2011,,,,,,,,,,,...,,,,,,,,,,
2011-2012,,,,,,,,,,,...,,,,,,,,,,
2012-2013,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# construct label(Y vector)

win_rate = {} # key: team name, value: pandas data frame

for name in (win_counts.keys()):
    win_rate[name] = pd.DataFrame()
    for season in sorted(win_counts[name].keys()):
        if len(win_counts[name][season]) > 0:
            temp = pd.DataFrame(
                [((win_counts[name][season].count(True))/float(len(win_counts[name][season])))], index=[season]
            )
            win_rate[name] = win_rate[name].append(temp)


In [8]:
# combine win rates of teams with different names

win_rate['Brooklyn Nets'] = win_rate['New Jersey Nets'].append(win_rate['Brooklyn Nets'])

temp = win_rate['New Orleans Hornets'][0:2].append(win_rate['New Orleans/Oklahoma City Hornets'])
win_rate['New Orleans Hornets'] = temp.append(win_rate['New Orleans Hornets'][-6:])
win_rate['New Orleans Pelicans'] = win_rate['New Orleans Hornets'].append(win_rate['New Orleans Pelicans'])

win_rate['Charlotte Hornets'] = win_rate['Charlotte Bobcats'].append(win_rate['Charlotte Hornets'])
win_rate['Oklahoma City Thunder'] = win_rate['Seattle SuperSonics'].append(win_rate['Oklahoma City Thunder'])

In [9]:
# delete old teams

win_rate.pop('New Jersey Nets')
win_rate.pop('New Orleans Hornets')
win_rate.pop('Charlotte Bobcats')
win_rate.pop('Seattle SuperSonics')
win_rate.pop('New Orleans/Oklahoma City Hornets')

Unnamed: 0,0
2005-2006,0.463415
2006-2007,0.475


In [40]:
# training - ground work - finding the best l1 ratio


from sklearn.linear_model import ElasticNetCV

estimator = {}

num_of_train_data = len(X['San Antonio Spurs']) - 1

# find out the best l1 ratio
l1r=[]
l1RatiosToBeTested = [.1, .3, .5, .7, .9, .95, 1] # 0: ridge, 1: lasso

for team in sorted(win_rate.keys()):
    regressor = ElasticNetCV(
        l1_ratio = l1RatiosToBeTested
        , max_iter=5000
        , cv = 3
    )
    
    #     handle missing value in Charlotte Hornets
    if not team == 'Charlotte Hornets':
        regressor.fit(
            X[team][0:num_of_train_data], 
            win_rate[team][0][0:num_of_train_data])
    else:
        regressor.fit(
            X[team][1:num_of_train_data], 
            win_rate[team][0][1:num_of_train_data]) 
    
    l1r.append(regressor.l1_ratio_)
    print(f"Current team: {team:25}, l1 ratio: {regressor.l1_ratio_}, penalization: {regressor.alpha_}")


Current team: Atlanta Hawks            , l1 ratio: 1.0, penalization: 0.023343294156901322
Current team: Boston Celtics           , l1 ratio: 0.1, penalization: 0.21016362050491025
Current team: Brooklyn Nets            , l1 ratio: 0.7, penalization: 0.056820437518547186
Current team: Charlotte Hornets        , l1 ratio: 0.1, penalization: 2.8033537349440376
Current team: Chicago Bulls            , l1 ratio: 0.1, penalization: 0.5449872088239386
Current team: Cleveland Cavaliers      , l1 ratio: 0.1, penalization: 0.021134557806466697
Current team: Dallas Mavericks         , l1 ratio: 1.0, penalization: 0.024679346016331174
Current team: Denver Nuggets           , l1 ratio: 1.0, penalization: 0.011623260088912624
Current team: Detroit Pistons          , l1 ratio: 0.1, penalization: 0.2832535496127874
Current team: Golden State Warriors    , l1 ratio: 1.0, penalization: 0.0577361379744123
Current team: Houston Rockets          , l1 ratio: 1.0, penalization: 0.0568569739527746
Current te

In [11]:
# check which l1 ratio is used the most
best_count = 0
for n in l1RatiosToBeTested:
    print (n, l1r.count(n))    
    if l1r.count(n) > best_count:
        best_count = l1r.count(n)
        best_l1_ratio = n

print(f"The l1 ratio used by most teams is {best_l1_ratio}, with count being {best_count}")

0.1 10
0.3 3
0.5 0
0.7 1
0.9 0
0.95 0
1 16
The l1 ratio used by most teams is 1, with count being 16


In [27]:
# training

for team in sorted(win_rate.keys()):
    print ("Processing: ", team)

    estimator[team] = ElasticNetCV(
        l1_ratio = best_l1_ratio
        , max_iter=5000
        , cv = 3
    )
    
    if team != 'Charlotte Hornets':
        estimator[team].fit(X[team][0:num_of_train_data], win_rate[team][0][0:num_of_train_data])
    else:
        estimator[team].fit(X[team][1:num_of_train_data], win_rate[team][0][1:num_of_train_data])


Processing:  Atlanta Hawks
Processing:  Boston Celtics
Processing:  Brooklyn Nets
Processing:  Charlotte Hornets
Processing:  Chicago Bulls
Processing:  Cleveland Cavaliers
Processing:  Dallas Mavericks
Processing:  Denver Nuggets
Processing:  Detroit Pistons
Processing:  Golden State Warriors
Processing:  Houston Rockets
Processing:  Indiana Pacers
Processing:  Los Angeles Clippers
Processing:  Los Angeles Lakers
Processing:  Memphis Grizzlies
Processing:  Miami Heat
Processing:  Milwaukee Bucks
Processing:  Minnesota Timberwolves
Processing:  New Orleans Pelicans
Processing:  New York Knicks
Processing:  Oklahoma City Thunder
Processing:  Orlando Magic
Processing:  Philadelphia 76ers
Processing:  Phoenix Suns
Processing:  Portland Trail Blazers
Processing:  Sacramento Kings
Processing:  San Antonio Spurs
Processing:  Toronto Raptors
Processing:  Utah Jazz
Processing:  Washington Wizards


In [28]:
#  prediction: win rate in 15-16
actual_win_rate = {}
predicted_win_rate = {}
for team in sorted(win_rate.keys()):
    actual_win_rate[team] = (win_rate[team][0][-1])
    predicted_win_rate[team] = estimator[team].predict(X[team].iloc[-1].values.reshape(1,-1))[0]


In [29]:
# preditcted standing
from operator import itemgetter

WEST_TEAMS = {
    'Golden State Warriors', 'New Orleans Pelicans', 'San Antonio Spurs',
    'Oklahoma City Thunder', 'Los Angeles Clippers', 'Portland Trail Blazers',
    'Dallas Mavericks', 'Memphis Grizzlies', 'Houston Rockets',
    'Utah Jazz', 'Sacramento Kings', 'Denver Nuggets',
    'Phoenix Suns', 'Los Angeles Lakers', 'Minnesota Timberwolves'    
}


def get_east_and_west_predicted_rankings(overall_predicted_ranking):
    pred_ranking = pd.DataFrame(sorted(overall_predicted_ranking.items(), key=itemgetter(1), reverse=True))

    west_standing_pred = pd.DataFrame()
    east_standing_pred = pd.DataFrame()

    for i in range(len(pred_ranking[0])):    
        if pred_ranking[0][i] in WEST_TEAMS:
            west_standing_pred = west_standing_pred.append(pred_ranking.iloc[i])
        else:
            east_standing_pred = east_standing_pred.append(pred_ranking.iloc[i])
    
    return west_standing_pred, east_standing_pred


west_standing_pred, east_standing_pred = get_east_and_west_predicted_rankings(predicted_win_rate)

In [30]:
# actual standing
actual_ranking = pd.DataFrame(sorted(actual_win_rate.items(), key=itemgetter(1), reverse=True))

west_standing_actual = pd.DataFrame()
east_standing_actual = pd.DataFrame()

for i in range(len(actual_ranking[0])):
    if actual_ranking[0][i] in WEST_TEAMS:
        west_standing_actual = west_standing_actual.append(actual_ranking.iloc[i])
    else:
        east_standing_actual = east_standing_actual.append(actual_ranking.iloc[i])

In [31]:
# calculate Spwerman's Rank-Order Correlation coefficient

from scipy.stats import spearmanr

def calc_spearman(actual, prediction):
    prediction[2] = pd.Series(range(1,16), index = prediction.index)
    actual[2] = pd.Series(range(1,16), index = actual.index)

    arr_pred = prediction.sort_values(by=0)[2]
    arr_actual = actual.sort_values(by=0)[2]

    return spearmanr(arr_actual, arr_pred)

print(f'West conference: {calc_spearman(west_standing_actual, west_standing_pred)}')
print(f'East conference: {calc_spearman(east_standing_actual, east_standing_pred)}')

West conference: SpearmanrResult(correlation=0.9428571428571427, pvalue=1.4247676947645743e-07)
East conference: SpearmanrResult(correlation=0.8785714285714284, pvalue=1.6315280286544268e-05)


In [32]:
from sklearn.metrics import mean_squared_error

def calc_MSE(actual, prediction):
    return mean_squared_error(
            list(actual.values()), 
            list(prediction.values())
            )

print (f'out of sample (MSE): {calc_MSE(actual_win_rate, predicted_win_rate)}')

out of sample (MSE): 0.0027850311380836395


In [24]:

def print_ranking(actual, prediction):
    print(f'   actual                ,    prediction')
    print('-'*50)
    for actual, pred in zip(actual[0], prediction[0]):
        print(f'{actual:25}, {pred}')

print_ranking(west_standing_actual, west_standing_pred)
print_ranking(east_standing_actual, east_standing_pred)

   actual                ,    prediction
--------------------------------------------------
Golden State Warriors    , Golden State Warriors
San Antonio Spurs        , San Antonio Spurs
Oklahoma City Thunder    , Oklahoma City Thunder
Los Angeles Clippers     , Los Angeles Clippers
Portland Trail Blazers   , Utah Jazz
Dallas Mavericks         , Portland Trail Blazers
Memphis Grizzlies        , Houston Rockets
Houston Rockets          , Dallas Mavericks
Utah Jazz                , Sacramento Kings
Sacramento Kings         , Memphis Grizzlies
Denver Nuggets           , Denver Nuggets
New Orleans Pelicans     , New Orleans Pelicans
Minnesota Timberwolves   , Minnesota Timberwolves
Phoenix Suns             , Phoenix Suns
Los Angeles Lakers       , Los Angeles Lakers
   actual                ,    prediction
--------------------------------------------------
Cleveland Cavaliers      , Toronto Raptors
Toronto Raptors          , Cleveland Cavaliers
Atlanta Hawks            , Atlanta Hawks
Charl

In [None]:
# output prediction as excel file

writer2 = pd.ExcelWriter('test predicted standing use diff as stats %s %s.xlsx'%(num_of_game_used_to_cal_ave, num_of_train_data))
west_standing_pred.to_excel(writer2, sheet_name='west')
east_standing_pred.to_excel(writer2, sheet_name='east')
writer2.save()


In [None]:
# output actual as excel file

writer = pd.ExcelWriter('test actual standing.xlsx')
west_standing_actual.to_excel(writer, sheet_name='west')
east_standing_actual.to_excel(writer, sheet_name='east')
writer.save()


In [35]:
# another prediction with random forest

from sklearn.ensemble import RandomForestRegressor

estimator_rf = {}

for team in sorted(win_rate.keys()):
    print ("Processing: ", team)

    estimator_rf[team] = RandomForestRegressor()
    
    if team != 'Charlotte Hornets':
        estimator_rf[team].fit(X[team][0:num_of_train_data], win_rate[team][0][0:num_of_train_data])
    else:
        estimator_rf[team].fit(X[team][1:num_of_train_data], win_rate[team][0][1:num_of_train_data])

    
#  prediction: win rate in 15-16
predicted_win_rate_rf = {}
for team in sorted(win_rate.keys()):
    predicted_win_rate_rf[team] = estimator_rf[team].predict(X[team].iloc[-1].values.reshape(1,-1))[0]
    
west_standing_pred_rf, east_standing_pred_rf = get_east_and_west_predicted_rankings(predicted_win_rate_rf)



Processing:  Atlanta Hawks
Processing:  Boston Celtics
Processing:  Brooklyn Nets
Processing:  Charlotte Hornets
Processing:  Chicago Bulls
Processing:  Cleveland Cavaliers
Processing:  Dallas Mavericks
Processing:  Denver Nuggets
Processing:  Detroit Pistons
Processing:  Golden State Warriors
Processing:  Houston Rockets
Processing:  Indiana Pacers
Processing:  Los Angeles Clippers
Processing:  Los Angeles Lakers
Processing:  Memphis Grizzlies
Processing:  Miami Heat
Processing:  Milwaukee Bucks
Processing:  Minnesota Timberwolves
Processing:  New Orleans Pelicans
Processing:  New York Knicks
Processing:  Oklahoma City Thunder
Processing:  Orlando Magic
Processing:  Philadelphia 76ers
Processing:  Phoenix Suns
Processing:  Portland Trail Blazers




Processing:  Sacramento Kings
Processing:  San Antonio Spurs
Processing:  Toronto Raptors
Processing:  Utah Jazz
Processing:  Washington Wizards




In [36]:
print(f'West conference: {calc_spearman(west_standing_actual, west_standing_pred_rf)}')
print(f'East conference: {calc_spearman(east_standing_actual, east_standing_pred_rf)}')

print (f'out of sample (MSE): {calc_MSE(actual_win_rate, predicted_win_rate_rf)}')

print_ranking(west_standing_actual, west_standing_pred_rf)
print_ranking(east_standing_actual, east_standing_pred_rf)

West conference: SpearmanrResult(correlation=0.9178571428571428, pvalue=1.4176968515006258e-06)
East conference: SpearmanrResult(correlation=0.8107142857142855, pvalue=0.0002460079779722063)
out of sample (MSE): 0.0064140612691833436
   actual                ,    prediction
--------------------------------------------------
Golden State Warriors    , Golden State Warriors
San Antonio Spurs        , San Antonio Spurs
Oklahoma City Thunder    , Oklahoma City Thunder
Los Angeles Clippers     , Los Angeles Clippers
Portland Trail Blazers   , Dallas Mavericks
Dallas Mavericks         , Portland Trail Blazers
Memphis Grizzlies        , Utah Jazz
Houston Rockets          , Memphis Grizzlies
Utah Jazz                , Houston Rockets
Sacramento Kings         , Denver Nuggets
Denver Nuggets           , Los Angeles Lakers
New Orleans Pelicans     , Phoenix Suns
Minnesota Timberwolves   , Sacramento Kings
Phoenix Suns             , New Orleans Pelicans
Los Angeles Lakers       , Minnesota Timberw