In [1]:
import pandas as pd

In this notebook, we analyze whether there is any model drift across seasons. Does the model's accuracy change as the game evolves? If so, we should look into training the model with 'Season' as an input, or simply removing older games from the training set altogether.

In [2]:
# Load the per-game test-set predictions of each model together with the actual result.
results = pd.read_csv('binary_test_results')

In [3]:
# Inspect the loaded frame (GameID, one prediction column per model, and Result).
results

Unnamed: 0,GameID,Linreg Prediction,GB Prediction,RF Prediction,NN Predictions,Result,Stacked Prediction
0,2014-12-22 00:00:00 Clippers @ San Antonio,1,0,1,0,0,1.0
1,2016-03-16 00:00:00 New York @ Golden State,1,0,1,0,0,1.0
2,2018-11-06 00:00:00 Atlanta @ Charlotte,0,0,0,0,1,0.0
3,2019-02-02 00:00:00 Chicago @ Charlotte,0,0,0,0,0,0.0
4,2019-02-05 00:00:00 Detroit @ New York,1,1,1,1,1,1.0
...,...,...,...,...,...,...,...
2856,2015-03-07 00:00:00 Portland Trail @ Minnesota,1,1,1,1,0,1.0
2857,2015-04-13 00:00:00 Detroit @ Cleveland,1,1,1,0,0,1.0
2858,2018-02-04 00:00:00 Milwaukee @ Brooklyn,0,1,1,0,1,1.0
2859,2015-02-22 00:00:00 Denver @ Oklahoma City,0,0,0,0,0,0.0


In [4]:
# Sanity check: GameID starts with "YYYY-MM-DD", so characters 5-6 are the two-digit month.
results['GameID'][0][5:7]

'12'

In [5]:
# Split the date prefix of each GameID ("YYYY-MM-DD ...") into its year and
# month parts. The vectorized .str slices operate on the values themselves, so
# they do not rely on the frame having a default 0..n-1 index the way
# per-row results['GameID'][x] label lookups do.
# Both are kept as plain lists of strings, one entry per row.
year = results['GameID'].str[0:4].tolist()
month = results['GameID'].str[5:7].tolist()

In [6]:
# Check how many month values were extracted.
len(month)

2861

In [7]:
# Attach the year strings parsed from GameID as a new column.
results['year'] = year

In [8]:
# Attach the month strings parsed from GameID as a new column.
results['month'] = month

In [9]:
# Inspect the frame again to confirm the new year and month columns.
results

Unnamed: 0,GameID,Linreg Prediction,GB Prediction,RF Prediction,NN Predictions,Result,Stacked Prediction,year,month
0,2014-12-22 00:00:00 Clippers @ San Antonio,1,0,1,0,0,1.0,2014,12
1,2016-03-16 00:00:00 New York @ Golden State,1,0,1,0,0,1.0,2016,03
2,2018-11-06 00:00:00 Atlanta @ Charlotte,0,0,0,0,1,0.0,2018,11
3,2019-02-02 00:00:00 Chicago @ Charlotte,0,0,0,0,0,0.0,2019,02
4,2019-02-05 00:00:00 Detroit @ New York,1,1,1,1,1,1.0,2019,02
...,...,...,...,...,...,...,...,...,...
2856,2015-03-07 00:00:00 Portland Trail @ Minnesota,1,1,1,1,0,1.0,2015,03
2857,2015-04-13 00:00:00 Detroit @ Cleveland,1,1,1,0,0,1.0,2015,04
2858,2018-02-04 00:00:00 Milwaukee @ Brooklyn,0,1,1,0,1,1.0,2018,02
2859,2015-02-22 00:00:00 Denver @ Oklahoma City,0,0,0,0,0,0.0,2015,02


In [10]:
# GameID has already been split into year/month, and the stacked prediction is
# not used by the per-model analysis in this notebook. Rebinding the name
# instead of using inplace=True, together with errors='ignore', lets this cell
# be re-run without raising KeyError once the columns are already gone.
results = results.drop(columns=['GameID', 'Stacked Prediction'], errors='ignore')

In [11]:
# Preview the first rows after dropping the unused columns.
results.head()

Unnamed: 0,Linreg Prediction,GB Prediction,RF Prediction,NN Predictions,Result,year,month
0,1,0,1,0,0,2014,12
1,1,0,1,0,0,2016,3
2,0,0,0,0,1,2018,11
3,0,0,0,0,0,2019,2
4,1,1,1,1,1,2019,2


In [12]:
# Bring the NN column in line with the '<model> Prediction' naming used by the
# other prediction columns, so every model can be addressed by the same pattern.
results = results.rename(columns={'NN Predictions': 'NN Prediction'})

In [13]:
# Column-name prefixes of the models under analysis; they are used to build the
# '<prefix> Prediction' and '<prefix> Results' column names.
models = ['Linreg', 'GB', 'RF', 'NN']

In [14]:
for model in models:
    # A pick counts as a win ('W') when the model's prediction equals the
    # actual Result, otherwise as a loss ('L'). Comparing the two columns as a
    # whole avoids per-row label lookups, which only work on a default
    # 0..n-1 index and are slow on larger frames.
    correct = results['{} Prediction'.format(model)] == results['Result']
    results['{} Results'.format(model)] = correct.map({True: 'W', False: 'L'})

In [15]:
# The year values were sliced out of a string; cast them to int so they can be
# compared against integer years.
results['year'] = results['year'].astype(int)

In [16]:
# The month values were sliced out of a string (e.g. '03'); cast them to int so
# they can be compared against integer month numbers.
results['month'] = results['month'].astype(int)

In [17]:
def year_analysis(model, results, years=range(2011, 2022)):
    """Print the model's win rate for each calendar year.

    Parameters
    ----------
    model : str
        Model prefix; the outcomes are read from the '<model> Results'
        column, where 'W' marks a correct prediction.
    results : pandas.DataFrame
        Frame with an integer 'year' column and the '<model> Results' column.
    years : iterable of int, optional
        Years to report, in output order. Defaults to 2011 through 2021.

    Prints
    ------
    A list with one win rate (wins / games) per entry of ``years``. A year
    with no games is reported as 0 instead of raising ZeroDivisionError,
    which matches how month_analysis reports months without games.
    """
    is_win = results['{} Results'.format(model)] == 'W'

    year_win_rate = []
    for year in years:
        in_year = results['year'] == year
        # Convert to plain ints so the printed rates are plain Python floats.
        games = int(in_year.sum())
        wins = int((in_year & is_win).sum())
        # Guard the division: not every year is guaranteed to have games.
        year_win_rate.append(wins / games if games > 0 else 0)

    print(year_win_rate)

In [18]:
# Print the per-year win rates, one list per model, in the order of `models`.
for model in models:
    year_analysis(model, results)

[0.8333333333333334, 0.6377245508982036, 0.639871382636656, 0.625, 0.6109324758842444, 0.6006600660066007, 0.631578947368421, 0.5993377483443708, 0.6237942122186495, 0.672, 0.6134453781512605]
[0.6111111111111112, 0.6437125748502994, 0.6334405144694534, 0.631578947368421, 0.5916398713826366, 0.6006600660066007, 0.6085526315789473, 0.5993377483443708, 0.6270096463022508, 0.648, 0.6134453781512605]
[0.6111111111111112, 0.6287425149700598, 0.6141479099678456, 0.6414473684210527, 0.6463022508038585, 0.6270627062706271, 0.6348684210526315, 0.5927152317880795, 0.6109324758842444, 0.632, 0.6176470588235294]
[0.8333333333333334, 0.6287425149700598, 0.6430868167202572, 0.5953947368421053, 0.6141479099678456, 0.6237623762376238, 0.5855263157894737, 0.6192052980132451, 0.6430868167202572, 0.616, 0.6176470588235294]


It doesn't look like the model gets any less accurate over the years. Next, we investigate whether the model gets more accurate as the season goes on. The expectation is that it does, as average stats stabilize and give a better representation of each team. Keep in mind that the season usually starts in October and ends in May.

In [19]:
def month_analysis(model, results):
    """Print the model's win rate for each calendar month (January..December).

    `model` is the prefix of the '<model> Results' column in `results`, where
    'W' marks a correct prediction; `results` must also carry an integer
    'month' column. The printed list has twelve entries; a month without any
    games is reported as 0.
    """
    outcome_col = '{} Results'.format(model)

    month_win_rate = []
    for month in range(1, 13):
        in_month = results['month'] == month
        is_win = results[outcome_col] == 'W'
        played = len(results[in_month])
        won = len(results[in_month & is_win])
        # Off-season months have no games; report 0 rather than dividing by zero.
        month_win_rate.append(won / played if played > 0 else 0)

    print(month_win_rate)

In [20]:
# Print the per-month win rates (January..December), one list per model, in the order of `models`.
for model in models:
    month_analysis(model, results)

[0.6611721611721612, 0.5595505617977528, 0.5622641509433962, 0.5359712230215827, 0.5142857142857142, 0, 0, 0, 0, 0.8899082568807339, 0.6733780760626398, 0.6645435244161358]
[0.6355311355311355, 0.5640449438202247, 0.560377358490566, 0.5575539568345323, 0.5142857142857142, 0, 0, 0, 0, 0.8256880733944955, 0.6890380313199105, 0.643312101910828]
[0.6263736263736264, 0.5955056179775281, 0.569811320754717, 0.5575539568345323, 0.6, 0, 0, 0, 0, 0.8256880733944955, 0.6756152125279642, 0.6560509554140127]
[0.6043956043956044, 0.5640449438202247, 0.5867924528301887, 0.5359712230215827, 0.6571428571428571, 0, 0, 0, 0, 0.926605504587156, 0.6733780760626398, 0.6560509554140127]


The model looks pretty consistent up until January. Then, from February through April, it does significantly worse (May has a very small sample size). We think this might be due to an increase in injuries towards the end of the season, as well as good teams resting star players before the playoffs and bad teams purposefully tanking to get higher lottery picks in the upcoming draft. Because of this discrepancy, we will try to incorporate month as an input variable.