In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Usable Code

In [2]:
!pwd

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [3]:
files = sorted([f for f in os.listdir('csv') if '.csv' in f])

In [4]:
files

['07-08.csv',
 '08-09.csv',
 '09-10.csv',
 '10-11.csv',
 '11-12.csv',
 '12-13.csv',
 '13-14.csv',
 '14-15.csv',
 '15-16.csv',
 '16-17.csv',
 '17-18.csv',
 '18-19.csv']

In [5]:
def clean_odds(file):
    '''
    cleans csv file and returns dataframe object that contains the over/under lines (target)
    '''
    df = pd.read_csv(file)
    df = df.iloc[:,:13] #some files contained unnamed extra rows
    df.dropna(axis=0,inplace=True) #and others columns
    
    #makes a list of game_id's equivalent to how many games were played that season
    mylist = []
    for i in range(1, int(df.shape[0]/2 + 1)):
        mylist.append(i)
        mylist.append(i)
    df['game_id'] = mylist #maps game_id's to games (spread across 2 rows)
    
    df = df.merge(df, on='game_id', suffixes=(None,'_2')) #gets games to be in one row (creates 4/game)
    df = df[1::4] #gets correct mapping
    df.reset_index(drop=True, inplace=True)
    
    #make dates include year so we can join with another dataset later
    df['Date'] = df['Date'].astype('int')
    df['Date'] = df['Date'].astype('str')
    #df['Date'] = df['Date'].str.strip('.0')
    dates = []
    for date in df['Date']:
        if int(date) > 1000:
            date = '20'+file[:2]+date #GET FILENAMES AND USE HERE
            dates.append(date)
        else:
            date = '20'+file[3:5]+'0'+date #GET FILENAMES AND USE HERE
            dates.append(date)
    df['Date'] = dates
    
    #take out pick-em's and replace with zero for later transformation
    df['Open'].replace(['PK','pk'],'0',inplace=True)
    df['Close'].replace(['PK','pk'],'0',inplace=True)
    df['Open_2'].replace(['PK','pk'],'0',inplace=True)
    df['Close_2'].replace(['PK','pk'],'0',inplace=True)
    df.replace('197.5u10','197.5',inplace=True) #one unique occurrence
    #map strings as floats for comparison
    df['Open'] = df['Open'].astype('float')
    df['Open_2'] = df['Open_2'].astype('float')
    df['Close'] = df['Close'].astype('float')
    df['Close_2'] = df['Close_2'].astype('float')
    #get correct over/under line at opening and closing of sportsbook
    df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
    df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
    
    #rename columns to correct home/visitor 
    df['Visitor'] = df['Team']
    df['Home'] = df['Team_2']
    
    #add season column
    df['Season'] = file[:2] + file[3:5]
    
    #drop unneeded info
    df = df[['Date','Home','Visitor','O/U_open','O/U_close','Season']]
    return df

Use this clean_odds function to generate our dataset to combine with our scraped game data.

In [6]:
cd csv

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv


In [7]:
lines = pd.concat([clean_odds(f) for f in files])

In [8]:
lines

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season
0,20071030,SanAntonio,Portland,184.0,189.5,0708
1,20071030,GoldenState,Utah,214.5,212.0,0708
2,20071030,LALakers,Houston,191.0,199.0,0708
3,20071031,Toronto,Philadelphia,190.0,191.0,0708
4,20071031,Indiana,Washington,200.0,203.5,0708
...,...,...,...,...,...,...
1307,20190602,Toronto,GoldenState,216.0,213.5,1819
1308,20190605,GoldenState,Toronto,214.0,209.5,1819
1309,20190607,GoldenState,Toronto,216.0,215.0,1819
1310,20190610,Toronto,GoldenState,212.0,217.0,1819


In [9]:
lines.reset_index(drop=True,inplace=True)
lines

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season
0,20071030,SanAntonio,Portland,184.0,189.5,0708
1,20071030,GoldenState,Utah,214.5,212.0,0708
2,20071030,LALakers,Houston,191.0,199.0,0708
3,20071031,Toronto,Philadelphia,190.0,191.0,0708
4,20071031,Indiana,Washington,200.0,203.5,0708
...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,1819
15516,20190605,GoldenState,Toronto,214.0,209.5,1819
15517,20190607,GoldenState,Toronto,216.0,215.0,1819
15518,20190610,Toronto,GoldenState,212.0,217.0,1819


Let's pickle this initial processing of the csv's and subsequent DataFrame. We can always comeback to this if we need it.

In [10]:
with open('lines.pickle', 'wb') as to_write:
    pickle.dump(lines, to_write)

In [11]:
pwd

'/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv'

In [12]:
cd ..

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [13]:
with open('game_df.pickle','rb') as read_file:
    game_df = pickle.load(read_file)
    
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.538,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.512,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.485,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.552,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.577,.301,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.493,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.523,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.527,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.542,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4


In [14]:
team_abbrev = game_df['home'].unique()
team_abbrev

array(['SAS', 'GSW', 'LAL', 'IND', 'ORL', 'TOR', 'NJN', 'CLE', 'MEM',
       'NOH', 'DEN', 'MIA', 'UTA', 'SEA', 'CHA', 'ATL', 'BOS', 'MIN',
       'CHI', 'LAC', 'PHO', 'PHI', 'WAS', 'DAL', 'HOU', 'MIL', 'NYK',
       'DET', 'SAC', 'POR', 'OKC', 'BRK', 'NOP', 'CHO'], dtype=object)

In [15]:
len(team_abbrev)

34

In [16]:
lines['Home'].unique()

array(['SanAntonio', 'GoldenState', 'LALakers', 'Toronto', 'Indiana',
       'Orlando', 'NewJersey', 'Cleveland', 'Memphis', 'NewOrleans',
       'Denver', 'Miami', 'Utah', 'Seattle', 'Charlotte', 'Atlanta',
       'Boston', 'Minnesota', 'Chicago', 'Phoenix', 'LAClippers',
       'Philadelphia', 'Washington', 'Milwaukee', 'Houston', 'Dallas',
       'NewYork', 'Detroit', 'Sacramento', 'Portland', 'OklahomaCity',
       'Brooklyn'], dtype=object)

In [17]:
team_dict = {
    'SAS': 'SanAntonio',
    'GSW': 'GoldenState',
    'LAL': 'LALakers',
    'TOR': 'Toronto',
    'IND': 'Indiana',
    'ORL': 'Orlando',
    'NJN': 'NewJersey',
    'CLE': 'Cleveland',
    'MEM': 'Memphis',
    'NOH': 'NewOrleans',
    'NOP': 'NewOrleans',
    'DEN': 'Denver',
    'MIA': 'Miami',
    'UTA': 'Utah',
    'SEA': 'Seattle',
    'CHA': 'Charlotte',
    'CHO': 'Charlotte',
    'ATL': 'Atlanta',
    'BOS': 'Boston',
    'MIN': 'Minnesota',
    'CHI': 'Chicago',
    'PHO': 'Phoenix',
    'LAC': 'LAClippers',
    'PHI': 'Philadelphia',
    'WAS': 'Washington',
    'MIL': 'Milwaukee',
    'HOU': 'Houston',
    'DAL': 'Dallas',
    'NYK': 'NewYork',
    'DET': 'Detroit',
    'SAC': 'Sacramento',
    'POR': 'Portland',
    'OKC': 'OklahomaCity',
    'BRK': 'Brooklyn'
}

In [18]:
game_df['home'].map(team_dict)

0         SanAntonio
1        GoldenState
2           LALakers
3            Indiana
4            Orlando
            ...     
15516        Toronto
15517    GoldenState
15518    GoldenState
15519        Toronto
15520    GoldenState
Name: home, Length: 15521, dtype: object

In [19]:
game_df['home_team'] = game_df['home'].map(team_dict)

In [20]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.301,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto


In [21]:
game_df['game_date'] = game_df['game_id'].apply(lambda x: x[:8])

In [22]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610


In [23]:
game_df['id'] = game_df['game_date'] + game_df['home_team']

In [24]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date,id
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030,20071030SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030,20071030GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030,20071030LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031,20071031Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando,20071031,20071031Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602,20190602Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605,20190605GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607,20190607GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610,20190610Toronto


In [25]:
lines['id'] = lines['Date'] + lines['Home']

In [26]:
lines

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season,id
0,20071030,SanAntonio,Portland,184.0,189.5,0708,20071030SanAntonio
1,20071030,GoldenState,Utah,214.5,212.0,0708,20071030GoldenState
2,20071030,LALakers,Houston,191.0,199.0,0708,20071030LALakers
3,20071031,Toronto,Philadelphia,190.0,191.0,0708,20071031Toronto
4,20071031,Indiana,Washington,200.0,203.5,0708,20071031Indiana
...,...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,1819,20190602Toronto
15516,20190605,GoldenState,Toronto,214.0,209.5,1819,20190605GoldenState
15517,20190607,GoldenState,Toronto,216.0,215.0,1819,20190607GoldenState
15518,20190610,Toronto,GoldenState,212.0,217.0,1819,20190610Toronto


In [27]:
lines.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15520 entries, 0 to 15519
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       15520 non-null  object 
 1   Home       15520 non-null  object 
 2   Visitor    15520 non-null  object 
 3   O/U_open   15520 non-null  float64
 4   O/U_close  15520 non-null  float64
 5   Season     15520 non-null  object 
 6   id         15520 non-null  object 
dtypes: float64(2), object(5)
memory usage: 848.9+ KB


In [28]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15521 entries, 0 to 15520
Data columns (total 71 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       15521 non-null  object
 1   total         15521 non-null  int64 
 2   away          15521 non-null  object
 3   home          15521 non-null  object
 4   pace_v        15521 non-null  object
 5   pace_h        15521 non-null  object
 6   eFg_v         15521 non-null  object
 7   eFg_h         15521 non-null  object
 8   tov_v         15521 non-null  object
 9   tov_h         15521 non-null  object
 10  orb_v         15521 non-null  object
 11  orb_h         15521 non-null  object
 12  ft_fga_v      15521 non-null  object
 13  ft_fga_h      15521 non-null  object
 14  ortg_v        15521 non-null  object
 15  ortg_h        15521 non-null  object
 16  fg_v          15521 non-null  object
 17  fga_v         15521 non-null  object
 18  fg_per_v      15521 non-null  object
 19  thre

Now both databases have a column to merge on. The 'id' column.

In [29]:
df = pd.merge(lines,game_df,on='id')

In [30]:
df

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season,id,game_id,total,away,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,20071030,SanAntonio,Portland,184.0,189.5,0708,20071030SanAntonio,200710300SAS,203,POR,...,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030
1,20071030,GoldenState,Utah,214.5,212.0,0708,20071030GoldenState,200710300GSW,213,UTA,...,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030
2,20071030,LALakers,Houston,191.0,199.0,0708,20071030LALakers,200710300LAL,188,HOU,...,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030
3,20071031,Toronto,Philadelphia,190.0,191.0,0708,20071031Toronto,200710310TOR,203,PHI,...,.200,64.3,44.6,56.1,8.8,6.2,100.0,106.8,Toronto,20071031
4,20071031,Indiana,Washington,200.0,203.5,0708,20071031Indiana,200710310IND,229,WAS,...,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,Toronto,GoldenState,216.0,213.5,1819,20190602Toronto,201906020TOR,213,GSW,...,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602
15011,20190605,GoldenState,Toronto,214.0,209.5,1819,20190605GoldenState,201906050GSW,232,TOR,...,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605
15012,20190607,GoldenState,Toronto,216.0,215.0,1819,20190607GoldenState,201906070GSW,197,TOR,...,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607
15013,20190610,Toronto,GoldenState,212.0,217.0,1819,20190610Toronto,201906100TOR,211,GSW,...,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610


In [31]:
df.shape

(15015, 77)

Drop columns that share the same information.

In [32]:
df.drop(columns=['Home','Visitor','game_id','home_team','game_date'],inplace=True)

In [33]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,20071030,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,.538,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7
1,20071030,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,.512,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3
2,20071030,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,.485,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1
3,20071031,190.0,191.0,0708,20071031Toronto,203,PHI,TOR,90.9,90.9,...,.573,.188,.200,64.3,44.6,56.1,8.8,6.2,100.0,106.8
4,20071031,200.0,203.5,0708,20071031Indiana,229,WAS,IND,98.4,98.4,...,.552,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,1819,20190602Toronto,213,GSW,TOR,100.0,100.0,...,.493,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0
15011,20190605,214.0,209.5,1819,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,.523,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5
15012,20190607,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,.527,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7
15013,20190610,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,.542,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4


Getting the difference between the line and the outcome of a particular game in order to classify our game as an "Over" or "Under".

In [34]:
df['ou1'] = df['total'] - df['O/U_open']
df['ou2'] = df['total'] - df['O/U_close']

Function to label our target based off information we have in our DataFrame, and then applying that function to our newly created columns that currently classify our games based on a positive or negative value. This simply puts a more general categorical label over that more granular scalar indicator.

In [35]:
def over_under(ou):
    if ou > 0:
        return 1
    elif ou == 0:
        return 2
    else:
        return 0

In [36]:
df['Over/Under_open'] = df['ou1'].apply(over_under)
df['Over/Under_close'] = df['ou2'].apply(over_under)

CRITICAL ASSUMPTION: Making the decision to drop "pushes". A "push" is when a betting line is hit exactly, and no money exchanges hands. In the context of this project, a total could be at 212 points, and if the game finishes with exactly 212 points, then every bettor gets their money back, on both sides (over/under 212), and the sportsbook doesn't collect anything.

In [37]:
df = df[df['Over/Under_open'] != 2]
df = df[df['Over/Under_close'] != 2]

About 400 instances of a "push", or about 2.7% of our original dataset. This number can be remembered for sampling or simulation purposes later.

Convert date to pd.datetime object. Makes plotting the time series compatible with matplotlib, and building season dictionary later.

In [38]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

### Stat building (feature engineering)

We want to ultimately end up with a model that takes both teams' recent performances and can make a prediction on the total, so we can make a new column to hold home/visitor every other row, and then rename the _h and _v columns for self, opponent.

In [39]:
d = df.reindex(df.index.repeat(2)).reset_index(drop=True)

Now that we have every game repeated, let's create a column to indicate which team these self/opponent stats represent. We can do this by creating an empty column and selectively copy from the 'home' and 'away' columns that repeat across the 2 rows. Then we'll do that same for the opponent.

In [40]:
d['team'] = np.nan
#home team will be all even indexes of this dataset
d['team'][::2] = d['home'][::2]
#away team will be all odd indexes of this dataset
d['team'][1::2] = d['away'][1::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['team'][::2] = d['home'][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['team'][1::2] = d['away'][1::2]


In [41]:
d['opp'] = np.nan
d['opp'][::2] = d['away'][::2]
d['opp'][1::2] = d['home'][1::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['opp'][::2] = d['away'][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['opp'][1::2] = d['home'][1::2]


Let's keep track of home/away, can't hurt. We'll use the same process as we just used. Note: this can also be used as a sanity check to make sure that our splitting worked correctly. If it did, we should see 'H', 'V' every other row.

In [42]:
d['home/away'] = np.nan
#home team will be all even indexes of this dataset
d['home/away'][::2] = 'H'
#away team will be all odd indexes of this dataset
d['home/away'][1::2] = 'V'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['home/away'][::2] = 'H'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['home/away'][1::2] = 'V'


Now we have to map the _v and _h stats appropriate to new columns for self or opponent stats.

In [43]:
stats = ['pace','eFg','tov','orb','ft_fga','ortg','fg','fga','fg_per','threes','threes_att','threes_per',
         'ft','fta','ft_per','drb','trb','ast','stl','blk','to','fouls','ts_per','threes_ar','ft_ar',
         'drb_per','trb_per','ast_per','stl_per','blk_per','user_per','drtg']
for stat in stats:
    d['{}'.format(stat)] = ""
    d['{}'.format(stat)][::2] = d['{}_h'.format(stat)][::2]
    d['{}'.format(stat)][1::2] = d['{}_v'.format(stat)][1::2]
    d['{}_opp'.format(stat)] = ""
    d['{}_opp'.format(stat)][::2] = d['{}_v'.format(stat)][::2]
    d['{}_opp'.format(stat)][1::2] = d['{}_h'.format(stat)][1::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['{}'.format(stat)][::2] = d['{}_h'.format(stat)][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['{}'.format(stat)][1::2] = d['{}_v'.format(stat)][1::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['{}_opp'.format(stat)][::2] = d['{}_v'.format(stat)][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

Then we can drop all of our columns with '_v' & '_h' since they contain extra, and now, redundant information.

In [44]:
mylist = []
for stat in stats:
    mylist.append('{}_v'.format(stat))
    mylist.append('{}_h'.format(stat))
#print(mylist)
d = d.drop(columns=mylist)

Now our DataFrame (d) has stats for each team and their opponent for every game. Now let's get some rolling averages. First we'll create an empty column that we'll populate with a rolling count for every time that team has appeared. This will get us the number of games each team has played up INCLUDING that game for each season. We can use that info to incorporate some rolling averages.

In [45]:
d['team_season'] = d['team'] + d['Season']

In [46]:
d['game_num'] = d.groupby('team_season').cumcount()+1

Now we are ready to compute some rolling average's in a certain season. Remember, we want to get a snapshot of how both team's have perfromed recently (last 5 games), and get a classification from this info. So let's go back to our method of creating empty columns we can assign data to.

In [47]:
for stat in d.iloc[:,15:79]:
    d['{}_rolling'.format(stat)] = np.nan

Cast our stat columns as floats to perform some operations on them.

In [48]:
for col in d.iloc[:,15:79].columns:
    d[col] = d[col].astype('float')

This may not be the most pythonic way to populate these new columns, but it works, and doesn't take long. At a high level, we are going team by team, then season by season, for that team, and applying 5 game rolling averages to their stats and opponent stats.

In [49]:
for team in d['team'].unique():
    mask = d['team'] == team
    d4 = d[mask]
    for season in d4['Season'].unique():
        mask = d4['Season'] == '{}'.format(season)
        d5 = d4[mask]
        for stat in list(d5.iloc[:,15:79].columns):
            d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)
        d.update(d5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)


IMPORTANT NOTE: this DataFrame contains rows with NaN's. Every game that is in the first 5 of the season has no rolling averages, this can be problematic when we take this DataFrame and try to feed it into a sk-learn ML model. We can address this now, by simply dropping these games(~5% of our dataset), or apply the previous season's median rolling average. This is where some decisions need to be made, and I would advise just dropping the games for 2 reasons. One, taking out 5 games won't change the application our model will have as there are 67-75 games left in the year we can apply this model to once in production. Two, a median of the rolling averages, or any other kind of congregation statistic being applied over different seasons is mostly likely not a sound choice in light of team personnel turnover, from retirings, trades, the draft, and free agent moves; not to mention new coaching staffs. This could be applicable to a few teams (ones that experience little of this roster turnover), but not enough to apply that thinking across the entire dataset.

In [50]:
d

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203.0,POR,SAS,19.0,13.5,...,,,,,,,,,,
1,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203.0,POR,SAS,19.0,13.5,...,,,,,,,,,,
2,2007-10-30,214.5,212.0,0708,20071030GoldenState,213.0,UTA,GSW,-1.5,1.0,...,,,,,,,,,,
3,2007-10-30,214.5,212.0,0708,20071030GoldenState,213.0,UTA,GSW,-1.5,1.0,...,,,,,,,,,,
4,2007-10-30,191.0,199.0,0708,20071030LALakers,188.0,HOU,LAL,-3.0,-11.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,59.84,76.36,8.86,7.18,10.56,9.24,100.0,100.0,108.36,116.22
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224.0,TOR,GSW,13.0,12.5,...,77.92,58.72,6.60,8.50,8.90,11.10,100.0,100.0,115.12,108.88


We won't cut off these rows in this notebook, but can easily do so in our modeling notebook by using the following mask and declaring this filtered d as our new d.

In [51]:
d[d['game_num']>5]

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
129,2007-11-09,188.0,187.5,0708,20071109Philadelphia,208.0,TOR,PHI,20.0,20.5,...,59.00,65.50,8.76,4.34,6.32,9.14,100.0,100.0,104.40,106.62
135,2007-11-09,198.5,201.5,0708,20071109NewYork,214.0,ORL,NYK,15.5,12.5,...,57.58,52.62,4.88,5.94,6.50,8.06,100.0,100.0,105.86,111.16
140,2007-11-09,188.0,186.0,0708,20071109NewOrleans,182.0,SAS,NOH,-6.0,-4.0,...,57.90,59.48,8.08,5.96,6.76,6.48,100.0,100.0,99.96,110.74
141,2007-11-09,188.0,186.0,0708,20071109NewOrleans,182.0,SAS,NOH,-6.0,-4.0,...,58.94,56.28,8.92,6.30,8.36,7.42,100.0,100.0,98.98,105.48
143,2007-11-09,210.5,209.5,0708,20071109Washington,210.0,DEN,WAS,-0.5,0.5,...,60.84,62.20,8.48,11.20,9.70,7.28,100.0,100.0,104.26,101.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,59.84,76.36,8.86,7.18,10.56,9.24,100.0,100.0,108.36,116.22
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224.0,TOR,GSW,13.0,12.5,...,77.92,58.72,6.60,8.50,8.90,11.10,100.0,100.0,115.12,108.88


The final thing we have to do before we can begin classifying our games is make each row one game. Our features will include rolling averages for each of our team's and opponent's stats. From here we can get a baseline sense if our logic, of recent past performances, is good basis for classification.

In [59]:
pd.set_option("display.max_columns", 300)
pd.set_option("display.max_rows", 101)

In [61]:
df = d.merge(d, on='id', suffixes=(None,'_v')) #gets games to be in one row (creates 4/game)
#df.tail(8)
df = df[1::4] #gets correct mapping
df.reset_index(drop=True, inplace=True)

In [62]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,Over/Under_open,Over/Under_close,team,opp,home/away,pace,pace_opp,eFg,eFg_opp,tov,tov_opp,orb,orb_opp,ft_fga,ft_fga_opp,ortg,ortg_opp,fg,fg_opp,fga,fga_opp,fg_per,fg_per_opp,threes,threes_opp,threes_att,threes_att_opp,threes_per,threes_per_opp,ft,ft_opp,fta,fta_opp,ft_per,ft_per_opp,drb,drb_opp,trb,trb_opp,ast,ast_opp,stl,stl_opp,blk,blk_opp,to,to_opp,fouls,fouls_opp,ts_per,ts_per_opp,threes_ar,threes_ar_opp,ft_ar,ft_ar_opp,drb_per,drb_per_opp,trb_per,trb_per_opp,ast_per,ast_per_opp,stl_per,stl_per_opp,blk_per,blk_per_opp,user_per,user_per_opp,drtg,drtg_opp,team_season,game_num,pace_rolling,pace_opp_rolling,eFg_rolling,eFg_opp_rolling,tov_rolling,tov_opp_rolling,orb_rolling,orb_opp_rolling,ft_fga_rolling,ft_fga_opp_rolling,ortg_rolling,ortg_opp_rolling,fg_rolling,fg_opp_rolling,fga_rolling,fga_opp_rolling,fg_per_rolling,fg_per_opp_rolling,threes_rolling,threes_opp_rolling,threes_att_rolling,threes_att_opp_rolling,threes_per_rolling,threes_per_opp_rolling,ft_rolling,ft_opp_rolling,fta_rolling,fta_opp_rolling,ft_per_rolling,ft_per_opp_rolling,drb_rolling,drb_opp_rolling,trb_rolling,trb_opp_rolling,ast_rolling,ast_opp_rolling,stl_rolling,stl_opp_rolling,blk_rolling,blk_opp_rolling,to_rolling,to_opp_rolling,fouls_rolling,fouls_opp_rolling,ts_per_rolling,ts_per_opp_rolling,threes_ar_rolling,threes_ar_opp_rolling,ft_ar_rolling,ft_ar_opp_rolling,drb_per_rolling,drb_per_opp_rolling,trb_per_rolling,trb_per_opp_rolling,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling,Date_v,O/U_open_v,O/U_close_v,Season_v,total_v,away_v,home_v,ou1_v,ou2_v,Over/Under_open_v,Over/Under_close_v,team_v,opp_v,home/away_v,pace_v,pace_opp_v,eFg_v,eFg_opp_v,tov_v,tov_opp_v,orb_v,orb_opp_v,ft_fga_v,ft_fga_opp_v,ortg_v,ortg_opp_v,fg_v,fg_opp_v,fga_v,fga_opp_v,fg_per_v,fg_per_opp_v,threes_v,threes_opp_v,threes_att_v,threes_att_opp_v,threes_per_v,threes_per_opp_v,ft_v,ft_opp_v,fta_v,fta_opp_v,ft_per_v,ft_per_opp_v,drb_v,drb_opp_v,trb_v,trb_opp_v,ast_v,ast_opp_v,stl_v,stl_opp_v,blk_v,blk_opp_v,to_v,to_opp_v,fouls_v,fouls_opp_v,ts_per_v,ts_per_opp_v,threes_ar_v,threes_ar_opp_v,ft_ar_v,ft_ar_opp_v,drb_per_v,drb_per_opp_v,trb_per_v,trb_per_opp_v,ast_per_v,ast_per_opp_v,stl_per_v,stl_per_opp_v,blk_per_v,blk_per_opp_v,user_per_v,user_per_opp_v,drtg_v,drtg_opp_v,team_season_v,game_num_v,pace_rolling_v,pace_opp_rolling_v,eFg_rolling_v,eFg_opp_rolling_v,tov_rolling_v,tov_opp_rolling_v,orb_rolling_v,orb_opp_rolling_v,ft_fga_rolling_v,ft_fga_opp_rolling_v,ortg_rolling_v,ortg_opp_rolling_v,fg_rolling_v,fg_opp_rolling_v,fga_rolling_v,fga_opp_rolling_v,fg_per_rolling_v,fg_per_opp_rolling_v,threes_rolling_v,threes_opp_rolling_v,threes_att_rolling_v,threes_att_opp_rolling_v,threes_per_rolling_v,threes_per_opp_rolling_v,ft_rolling_v,ft_opp_rolling_v,fta_rolling_v,fta_opp_rolling_v,ft_per_rolling_v,ft_per_opp_rolling_v,drb_rolling_v,drb_opp_rolling_v,trb_rolling_v,trb_opp_rolling_v,ast_rolling_v,ast_opp_rolling_v,stl_rolling_v,stl_opp_rolling_v,blk_rolling_v,blk_opp_rolling_v,to_rolling_v,to_opp_rolling_v,fouls_rolling_v,fouls_opp_rolling_v,ts_per_rolling_v,ts_per_opp_rolling_v,threes_ar_rolling_v,threes_ar_opp_rolling_v,ft_ar_rolling_v,ft_ar_opp_rolling_v,drb_per_rolling_v,drb_per_opp_rolling_v,trb_per_rolling_v,trb_per_opp_rolling_v,ast_per_rolling_v,ast_per_opp_rolling_v,stl_per_rolling_v,stl_per_opp_rolling_v,blk_per_rolling_v,blk_per_opp_rolling_v,user_per_rolling_v,user_per_opp_rolling_v,drtg_rolling_v,drtg_opp_rolling_v
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203.0,POR,SAS,19.0,13.5,1.0,1.0,SAS,POR,H,91.8,91.8,0.506,0.538,7.5,15.8,27.3,22.2,0.207,0.167,115.5,105.7,240.0,240.0,41.0,39.0,87.0,78.0,0.471,0.500,6.0,6.0,24.0,13.0,0.250,0.462,18.0,13.0,26.0,17.0,12.0,8.0,28.0,32.0,40.0,40.0,21.0,15.0,8.0,1.0,4.0,4.0,8.0,16.0,0.538,0.567,0.276,0.167,0.299,0.218,77.8,72.7,50.0,50.0,51.2,38.5,8.7,1.1,6.2,6.3,100.0,100.0,105.7,115.5,SAS0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-10-30,184.0,189.5,0708,203.0,POR,SAS,19.0,13.5,1.0,1.0,POR,SAS,V,91.8,91.8,0.538,0.506,15.8,7.5,22.2,27.3,0.167,0.207,105.7,115.5,240.0,240.0,39.0,41.0,78.0,87.0,0.500,0.471,6.0,6.0,13.0,24.0,0.462,0.250,13.0,18.0,17.0,26.0,8.0,12.0,32.0,28.0,40.0,40.0,15.0,21.0,1.0,8.0,4.0,4.0,16.0,8.0,0.567,0.538,0.167,0.276,0.218,0.299,72.7,77.8,50.0,50.0,38.5,51.2,1.1,8.7,6.3,6.2,100.0,100.0,115.5,105.7,POR0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2007-10-30,214.5,212.0,0708,20071030GoldenState,213.0,UTA,GSW,-1.5,1.0,0.0,1.0,GSW,UTA,H,105.1,105.1,0.455,0.483,17.6,15.2,14.9,34.8,0.338,0.333,91.3,111.3,240.0,240.0,32.0,41.0,77.0,90.0,0.416,0.456,6.0,5.0,23.0,11.0,0.261,0.455,26.0,30.0,38.0,36.0,7.0,16.0,30.0,40.0,37.0,56.0,19.0,24.0,8.0,9.0,9.0,7.0,20.0,19.0,0.512,0.553,0.299,0.122,0.494,0.400,65.2,85.1,39.8,60.2,59.4,58.5,7.6,8.6,11.4,13.0,100.0,100.0,111.3,91.3,GSW0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-10-30,214.5,212.0,0708,213.0,UTA,GSW,-1.5,1.0,0.0,1.0,UTA,GSW,V,105.1,105.1,0.483,0.455,15.2,17.6,34.8,14.9,0.333,0.338,111.3,91.3,240.0,240.0,41.0,32.0,90.0,77.0,0.456,0.416,5.0,6.0,11.0,23.0,0.455,0.261,30.0,26.0,36.0,38.0,16.0,7.0,40.0,30.0,56.0,37.0,24.0,19.0,9.0,8.0,7.0,9.0,19.0,20.0,0.553,0.512,0.122,0.299,0.400,0.494,85.1,65.2,60.2,39.8,58.5,59.4,8.6,7.6,13.0,11.4,100.0,100.0,91.3,111.3,UTA0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2007-10-30,191.0,199.0,0708,20071030LALakers,188.0,HOU,LAL,-3.0,-11.0,0.0,0.0,LAL,HOU,H,93.0,93.0,0.434,0.500,11.1,17.0,22.9,31.6,0.355,0.284,99.9,102.1,240.0,240.0,32.0,34.0,76.0,74.0,0.421,0.459,2.0,6.0,8.0,22.0,0.250,0.273,27.0,21.0,45.0,31.0,11.0,12.0,26.0,37.0,37.0,49.0,18.0,23.0,16.0,10.0,3.0,5.0,12.0,18.0,0.485,0.542,0.105,0.297,0.592,0.419,68.4,77.1,43.0,57.0,56.3,67.6,17.2,10.7,5.8,7.4,100.0,100.0,102.1,99.9,LAL0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-10-30,191.0,199.0,0708,188.0,HOU,LAL,-3.0,-11.0,0.0,0.0,HOU,LAL,V,93.0,93.0,0.500,0.434,17.0,11.1,31.6,22.9,0.284,0.355,102.1,99.9,240.0,240.0,34.0,32.0,74.0,76.0,0.459,0.421,6.0,2.0,22.0,8.0,0.273,0.250,21.0,27.0,31.0,45.0,12.0,11.0,37.0,26.0,49.0,37.0,23.0,18.0,10.0,16.0,5.0,3.0,18.0,12.0,0.542,0.485,0.297,0.105,0.419,0.592,77.1,68.4,57.0,43.0,67.6,56.3,10.7,17.2,7.4,5.8,100.0,100.0,99.9,102.1,HOU0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2007-10-31,190.0,191.0,0708,20071031Toronto,203.0,PHI,TOR,13.0,12.0,1.0,1.0,TOR,PHI,H,90.9,90.9,0.535,0.506,9.8,15.6,24.4,35.7,0.176,0.171,116.7,106.8,240.0,240.0,41.0,38.0,85.0,82.0,0.482,0.463,9.0,7.0,16.0,17.0,0.563,0.412,15.0,14.0,17.0,23.0,10.0,15.0,27.0,31.0,37.0,46.0,23.0,22.0,8.0,3.0,4.0,9.0,10.0,17.0,0.573,0.526,0.188,0.207,0.200,0.280,64.3,75.6,44.6,55.4,56.1,57.9,8.8,3.3,6.2,13.0,100.0,100.0,106.8,116.7,TOR0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-10-31,190.0,191.0,0708,203.0,PHI,TOR,13.0,12.0,1.0,1.0,PHI,TOR,V,90.9,90.9,0.506,0.535,15.6,9.8,35.7,24.4,0.171,0.176,106.8,116.7,240.0,240.0,38.0,41.0,82.0,85.0,0.463,0.482,7.0,9.0,17.0,16.0,0.412,0.563,14.0,15.0,23.0,17.0,15.0,10.0,31.0,27.0,46.0,37.0,22.0,23.0,3.0,8.0,9.0,4.0,17.0,10.0,0.526,0.573,0.207,0.188,0.280,0.200,75.6,64.3,55.4,44.6,57.9,56.1,3.3,8.8,13.0,6.2,100.0,100.0,116.7,106.8,PHI0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,2007-10-31,200.0,203.5,0708,20071031Indiana,229.0,WAS,IND,29.0,25.5,1.0,1.0,IND,WAS,H,98.4,98.4,0.484,0.394,13.6,13.2,29.8,35.4,0.326,0.323,109.5,101.2,265.0,265.0,38.0,36.0,92.0,99.0,0.413,0.364,13.0,6.0,29.0,20.0,0.448,0.300,30.0,32.0,36.0,45.0,14.0,23.0,42.0,33.0,56.0,56.0,20.0,15.0,12.0,6.0,6.0,6.0,17.0,18.0,0.552,0.463,0.315,0.202,0.391,0.455,64.6,70.2,50.0,50.0,52.6,41.7,11.0,5.5,7.6,9.5,100.0,100.0,101.2,109.5,IND0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2007-10-31,200.0,203.5,0708,229.0,WAS,IND,29.0,25.5,1.0,1.0,WAS,IND,V,98.4,98.4,0.394,0.484,13.2,13.6,35.4,29.8,0.323,0.326,101.2,109.5,265.0,265.0,36.0,38.0,99.0,92.0,0.364,0.413,6.0,13.0,20.0,29.0,0.300,0.448,32.0,30.0,45.0,36.0,23.0,14.0,33.0,42.0,56.0,56.0,15.0,20.0,6.0,12.0,6.0,6.0,18.0,17.0,0.463,0.552,0.202,0.315,0.455,0.391,70.2,64.6,50.0,50.0,41.7,52.6,5.5,11.0,9.5,7.6,100.0,100.0,109.5,101.2,WAS0708,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14617,2019-06-02,216.0,213.5,1819,20190602Toronto,213.0,GSW,TOR,-3.0,-0.5,0.0,0.0,TOR,GSW,H,100.0,100.0,0.431,0.543,12.5,14.0,29.4,15.0,0.245,0.244,104.0,109.0,240.0,240.0,35.0,38.0,94.0,82.0,0.372,0.463,11.0,13.0,38.0,34.0,0.289,0.382,23.0,20.0,26.0,23.0,15.0,6.0,34.0,36.0,49.0,42.0,17.0,34.0,8.0,7.0,2.0,5.0,15.0,15.0,0.493,0.592,0.404,0.415,0.277,0.280,85.0,70.6,53.8,46.2,48.6,89.5,8.0,7.0,4.2,8.9,100.0,100.0,109.0,104.0,TOR1819,97.0,93.46,93.46,0.5324,0.4954,9.88,12.56,18.28,21.68,0.2694,0.2498,115.90,106.54,250.0,250.0,37.4,35.4,84.8,83.8,0.4448,0.4248,14.8,11.8,37.8,35.0,0.3952,0.3390,22.6,20.6,27.6,27.8,8.2,9.8,35.4,37.4,43.6,47.2,24.8,25.2,7.8,7.4,5.8,4.6,10.8,14.0,0.5822,0.5406,0.4436,0.4176,0.3294,0.3346,78.32,81.72,48.20,51.80,65.80,71.20,7.92,7.34,11.68,9.66,100.0,100.0,106.54,115.90,2019-06-02,216.0,213.5,1819,213.0,GSW,TOR,-3.0,-0.5,0.0,0.0,GSW,TOR,V,100.0,100.0,0.543,0.431,14.0,12.5,15.0,29.4,0.244,0.245,109.0,104.0,240.0,240.0,38.0,35.0,82.0,94.0,0.463,0.372,13.0,11.0,34.0,38.0,0.382,0.289,20.0,23.0,23.0,26.0,6.0,15.0,36.0,34.0,42.0,49.0,34.0,17.0,7.0,8.0,5.0,2.0,15.0,15.0,0.592,0.493,0.415,0.404,0.280,0.277,70.6,85.0,46.2,53.8,89.5,48.6,7.0,8.0,8.9,4.2,100.0,100.0,104.0,109.0,GSW1819,98.0,95.60,95.60,0.5484,0.5136,13.42,12.10,26.20,21.24,0.2394,0.2360,116.52,110.58,245.0,245.0,41.0,37.6,85.4,85.8,0.4802,0.4378,11.6,13.0,31.6,35.2,0.3656,0.3620,20.0,19.6,23.6,24.8,10.8,9.8,36.2,30.0,47.0,39.8,29.2,24.4,8.4,6.6,5.8,4.2,14.8,13.4,0.5942,0.5586,0.3696,0.4098,0.2812,0.2976,78.76,73.80,54.02,45.98,71.84,65.06,8.58,6.78,11.38,7.94,100.0,100.0,110.58,116.52
14618,2019-06-05,214.0,209.5,1819,20190605GoldenState,232.0,TOR,GSW,18.0,22.5,1.0,1.0,GSW,TOR,H,99.6,99.6,0.462,0.628,11.8,13.3,27.1,15.2,0.275,0.244,109.5,123.5,240.0,240.0,36.0,43.0,91.0,82.0,0.396,0.524,12.0,17.0,36.0,38.0,0.333,0.447,25.0,20.0,30.0,21.0,13.0,5.0,28.0,35.0,41.0,40.0,25.0,30.0,8.0,9.0,3.0,10.0,14.0,14.0,0.523,0.674,0.396,0.463,0.330,0.256,84.8,72.9,50.6,49.4,69.4,69.8,8.0,9.0,6.8,18.2,100.0,100.0,123.5,109.5,GSW1819,99.0,96.54,96.54,0.5368,0.5190,13.58,11.02,23.96,20.32,0.2524,0.2200,113.96,111.64,245.0,245.0,40.2,38.6,85.0,88.0,0.4728,0.4400,10.8,13.8,31.8,37.2,0.3390,0.3698,21.0,18.8,24.6,23.8,9.8,9.6,37.2,30.6,47.0,40.2,30.0,23.6,7.2,7.2,5.2,3.6,15.0,12.2,0.5864,0.5600,0.3740,0.4232,0.2944,0.2784,79.68,76.04,53.82,46.18,75.46,60.78,7.26,7.34,10.26,6.82,100.0,100.0,111.64,113.96,2019-06-05,214.0,209.5,1819,232.0,TOR,GSW,18.0,22.5,1.0,1.0,TOR,GSW,V,99.6,99.6,0.628,0.462,13.3,11.8,15.2,27.1,0.244,0.275,123.5,109.5,240.0,240.0,43.0,36.0,82.0,91.0,0.524,0.396,17.0,12.0,38.0,36.0,0.447,0.333,20.0,25.0,21.0,30.0,5.0,13.0,35.0,28.0,40.0,41.0,30.0,25.0,9.0,8.0,10.0,3.0,14.0,14.0,0.674,0.523,0.463,0.396,0.256,0.330,72.9,84.8,49.4,50.6,69.8,69.4,9.0,8.0,18.2,6.8,100.0,100.0,109.5,123.5,TOR1819,98.0,93.54,93.54,0.5236,0.5158,9.78,12.44,21.40,20.34,0.2772,0.2554,117.08,109.72,240.0,240.0,36.4,35.4,83.2,79.8,0.4408,0.4428,13.6,11.6,36.4,33.0,0.3774,0.3518,23.0,20.2,27.6,25.8,9.6,8.4,32.8,34.6,42.4,43.0,22.6,27.6,7.2,6.0,4.2,4.6,10.4,13.0,0.5768,0.5628,0.4362,0.4144,0.3338,0.3258,79.66,78.60,49.64,50.36,61.52,77.52,7.70,6.42,9.08,9.68,100.0,100.0,109.72,117.08
14619,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,0.0,0.0,GSW,TOR,H,94.8,94.8,0.500,0.477,16.3,8.5,20.0,17.1,0.179,0.267,97.0,110.7,240.0,240.0,35.0,36.0,78.0,86.0,0.449,0.419,8.0,10.0,27.0,32.0,0.296,0.313,14.0,23.0,21.0,24.0,8.0,7.0,34.0,32.0,42.0,39.0,26.0,22.0,6.0,12.0,6.0,4.0,17.0,9.0,0.527,0.544,0.346,0.372,0.269,0.279,82.9,80.0,51.9,48.1,74.3,61.1,6.3,12.7,11.1,7.8,100.0,100.0,110.7,97.0,GSW1819,100.0,97.34,97.34,0.5158,0.5352,13.02,11.24,23.22,19.20,0.2562,0.2292,112.00,113.10,245.0,245.0,39.0,39.6,86.8,87.2,0.4496,0.4564,11.4,13.6,33.2,37.0,0.3436,0.3668,21.8,19.4,25.2,24.4,10.0,8.6,35.2,32.2,45.2,40.8,29.0,25.0,8.0,7.4,4.0,5.4,14.6,12.4,0.5696,0.5766,0.3824,0.4252,0.2946,0.2878,80.80,76.78,52.44,47.56,75.06,62.64,8.02,7.46,7.80,10.08,100.0,100.0,113.10,112.00,2019-06-07,216.0,215.0,1819,197.0,TOR,GSW,-19.0,-18.0,0.0,0.0,TOR,GSW,V,94.8,94.8,0.477,0.500,8.5,16.3,17.1,20.0,0.267,0.179,110.7,97.0,240.0,240.0,36.0,35.0,86.0,78.0,0.419,0.449,10.0,8.0,32.0,27.0,0.313,0.296,23.0,14.0,24.0,21.0,7.0,8.0,32.0,34.0,39.0,42.0,22.0,26.0,12.0,6.0,4.0,6.0,9.0,17.0,0.544,0.527,0.372,0.346,0.279,0.269,80.0,82.9,48.1,51.9,61.1,74.3,12.7,6.3,7.8,11.1,100.0,100.0,97.0,110.7,TOR1819,99.0,94.28,94.28,0.5388,0.5020,10.44,12.48,19.78,22.34,0.2708,0.2678,116.76,110.36,240.0,240.0,36.8,35.2,82.2,82.0,0.4514,0.4294,14.2,11.8,35.8,33.2,0.3986,0.3556,22.2,21.8,26.4,26.6,8.6,9.6,33.0,33.6,41.6,43.2,22.2,26.6,7.2,6.0,5.2,3.8,11.0,13.4,0.5902,0.5558,0.4346,0.4060,0.3230,0.3268,77.66,80.22,49.04,50.96,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
14620,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,0.0,0.0,TOR,GSW,H,94.3,94.3,0.494,0.585,11.8,14.5,29.5,16.7,0.247,0.122,111.3,112.4,240.0,240.0,38.0,38.0,85.0,82.0,0.447,0.463,8.0,20.0,32.0,42.0,0.250,0.476,21.0,10.0,27.0,14.0,13.0,6.0,30.0,31.0,43.0,37.0,19.0,27.0,6.0,5.0,5.0,7.0,13.0,15.0,0.542,0.601,0.376,0.512,0.318,0.171,83.3,70.5,53.8,46.3,50.0,71.1,6.4,5.3,12.5,13.2,100.0,100.0,112.4,111.3,TOR1819,100.0,94.72,94.72,0.5390,0.4996,10.98,13.60,19.56,22.62,0.2646,0.2726,116.22,108.36,240.0,240.0,37.8,34.6,82.6,80.8,0.4614,0.4288,12.6,11.4,33.6,32.4,0.3774,0.3502,21.8,22.0,25.0,27.2,8.0,9.6,32.4,31.4,40.4,41.0,22.8,26.6,8.4,6.8,5.2,4.6,11.6,14.6,0.5914,0.5534,0.4066,0.4014,0.3050,0.3378,77.38,80.44,49.48,50.52,59.84,76.36,8.86,7.18,10.56,9.24,100.0,100.0,108.36,116.22,2019-06-10,212.0,217.0,1819,211.0,GSW,TOR,-1.0,-6.0,0.0,0.0,GSW,TOR,V,94.3,94.3,0.585,0.494,14.5,11.8,16.7,29.5,0.122,0.247,112.4,111.3,240.0,240.0,38.0,38.0,82.0,85.0,0.463,0.447,20.0,8.0,42.0,32.0,0.476,0.250,10.0,21.0,14.0,27.0,6.0,13.0,31.0,30.0,37.0,43.0,27.0,19.0,5.0,6.0,7.0,5.0,15.0,13.0,0.601,0.542,0.512,0.376,0.171,0.318,70.5,83.3,46.3,53.8,71.1,50.0,5.3,6.4,13.2,12.5,100.0,100.0,111.3,112.4,GSW1819,101.0,96.66,96.66,0.5086,0.5376,13.86,10.94,23.54,18.78,0.2444,0.2356,109.00,115.08,245.0,245.0,37.8,40.0,85.6,87.4,0.4418,0.4602,11.4,13.4,33.4,36.4,0.3412,0.3666,20.6,20.0,24.8,22.6,10.2,8.0,33.6,32.4,43.8,40.4,28.8,25.0,7.4,8.0,4.2,5.6,15.4,12.0,0.5582,0.5860,0.3896,0.4172,0.2936,0.2660,81.22,76.46,51.94,48.06,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00


Now that our data is fully processed, we are ready to process this in a classification algorithm

In [63]:
with open('d_rolling.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

## Scratch Work

In [None]:
df = pd.read_csv(files[2])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df = df.iloc[:,:13]

In [None]:
df.info()

In [None]:
df.shape[0]

In [None]:
mylist = []
for i in range(1, int(df.shape[0]/2 + 1)):
    mylist.append(i)
    mylist.append(i)
print(mylist[0],mylist[-1])

In [None]:
len(range(1,int(df.shape[0]/2 + 1)))

In [None]:
df['game_id'] = mylist

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df.info()

In [None]:
df = df.merge(df, on='game_id', suffixes=(None,'_2'))

In [None]:
df

In [None]:
df = df[1::4]
df.reset_index(drop=True, inplace=True)

In [None]:
df

In [None]:
#sanity check that every game only appears once
df['game_id']

In [None]:
df['Date'] = df['Date'].astype('int')
df['Date']

In [None]:
df['Date'] = df['Date'].astype('str')
df['Date']

In [None]:
df['Date'] = df['Date'].str.strip('.0')

In [None]:
df['Date'].head

In [None]:
dates = []
for date in df['Date']:
    if int(date) > 1000:
        date = '20'+files[2][:2]+date
        dates.append(date)
    else:
        date = '20'+files[2][3:5]+'0'+date
        dates.append(date)
df['Date'] = dates

In [None]:
df['Date']

In [None]:
df

In [None]:
df['Visitor'] = df['Team']
df['Home'] = df['Team_2']
df

In [None]:
df['Open'].replace('pk','0',inplace=True)
df['Close'].replace('pk','0',inplace=True)
df['Open_2'].replace('pk','0',inplace=True)
df['Close_2'].replace('pk','0',inplace=True)
df.replace('197.5u10','197.5',inplace=True)

In [None]:
mask = df['Open'].values == 'pk'
df[mask]

In [None]:
df['Open'] = df['Open'].astype('float')
df['Open_2'] = df['Open_2'].astype('float')
df['Close'] = df['Close'].astype('float')
df['Close_2'] = df['Close_2'].astype('float')

In [None]:
df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
df

In [None]:
df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
df

Sanity check that opening and closing lines operations worked successfully.
So long as numbers are around 200, we know we are ok.

In [None]:
df['O/U_open'].min()

In [None]:
df['O/U_close'].min()

In [None]:
df = df[['Date','Home','Visitor','O/U_open','O/U_close']]
df

### Processing our lines df

First, and most importantly, we must create our target labels. This will be done by comparing the total the the opening and closing lines, and mapping that result to one of three categories: Over, Under, or Push. This will represent the winning result of that game.

In [None]:
df

In [None]:
df['ou1'] = df['total'] - df['O/U_open']
df['ou2'] = df['total'] - df['O/U_close']

In [None]:
df

In [None]:
def over_under(ou):
    if ou > 0:
        return 1
    elif ou == 0:
        return 2
    else:
        return 0

In [None]:
df['Over/Under_open'] = df['ou1'].apply(over_under)
df['Over/Under_close'] = df['ou2'].apply(over_under)

In [None]:
df

Making the decision to drop pushes as no money exchanges hands in this scenerio.

In [None]:
df = df[df['Over/Under_open'] != 2]
df = df[df['Over/Under_close'] != 2]

In [None]:
df

About 400 instances of a push, or about 2.7% of our original dataset. This number can be remembered for sampling or simulation purposes later.

### Stat Building

We have the same set of stats for both teams in any one game, so we can build offense/defense for both teams.

In [None]:
'''map stats accordingly:
1. Get 2 "sets" of stats per game:

visitor: offense - _v stat avgs heading into the game; defense - _h stat avgs heading into the game
home: offese - _h stat avgs heading into the game; defense - _v stat avgs heading into the game

2. Map visitor/home stats to respective teams

3. Build dictionary of team's seasons to be further processed.

{1415: {GSW: {..game_35:{offense/defense stats},game_36:{..}

4. Process dictionary to have more stats/potential model features:

{GSW: {..game_35:{offense/defense stats averaged through 34 games},game_36:{..}

5. ?

'''

In [None]:
df.info()

Convert date to pd.datetime object. May help with building season dictionary later.

In [None]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

In [None]:
df['Date']

In [None]:
d = df.reindex(df.index.repeat(2)).reset_index(drop=True)

Now that we have every game twice, we can make a new column to hold home/visitor every other row, and then rename the _h and _v columns for self, opponent.

In [None]:
d

Let's create the column we want to place our values, then selectively copy.

In [None]:
d['team'] = np.nan

In [None]:
#home team will be all even indexes of this dataset
d['team'][::2] = d['home'][::2]

In [None]:
#away team will be all odd indexes of this dataset
d['team'][1::2] = d['away'][1::2]

In [None]:
d['opp'] = np.nan

In [None]:
d['opp'][::2] = d['away'][::2]
d['opp'][1::2] = d['home'][1::2]

In [None]:
d[['team','opp']]

Let's keep track of home/away, can't hurt. We'll use the same process as we just used.

In [None]:
d['home/away'] = np.nan
#home team will be all even indexes of this dataset
d['home/away'][::2] = 'H'
#away team will be all odd indexes of this dataset
d['home/away'][1::2] = 'V'

In [None]:
d.info()

Now we have to map the _v and _h stats appropriate to new columns for self or opponent stats.

In [None]:
stats = ['pace','eFg','tov','orb','ft_fga','ortg','fg','fga','fg_per','threes','threes_att','threes_per',
         'ft','fta','ft_per','drb','trb','ast','stl','blk','to','fouls','ts_per','threes_ar','ft_ar',
         'drb_per','trb_per','ast_per','stl_per','blk_per','user_per','drtg']
for stat in stats:
    d['{}'.format(stat)] = ""
    d['{}'.format(stat)][::2] = d['{}_h'.format(stat)][::2]
    d['{}'.format(stat)][1::2] = d['{}_v'.format(stat)][1::2]
    d['{}_opp'.format(stat)] = ""
    d['{}_opp'.format(stat)][::2] = d['{}_v'.format(stat)][::2]
    d['{}_opp'.format(stat)][1::2] = d['{}_h'.format(stat)][1::2]

In [None]:
mylist = []
for stat in stats:
    mylist.append('{}_v'.format(stat))
    mylist.append('{}_h'.format(stat))
#print(mylist)
d = d.drop(columns=mylist)

In [None]:
d.info()

Now our DataFrame (d) has stats for each team and their opponent for every game. Now let's get some rolling averages.

First we'll create an empty column that we'll populate with a rolling count for every time that team has appeared. This will get us the number of games each team has played up INCLUDING that game for each season. We can use that info to incorporate some rolling averages.

In [None]:
d['team_season'] = d['team'] + d['Season']

In [None]:
d

In [None]:
d['game_num'] = d.groupby('team_season').cumcount()+1
d

In [None]:
d.info()

Cast our stat columns as floats to perform some operations on them.

In [None]:
for col in d.iloc[:,15:79].columns:
    d[col] = d[col].astype('float')

In [None]:
d.info()

In [None]:
d['team_season_game_num'] = d['team_season'] + d['game_num'].astype('str')

In [None]:
d['team_season_game_num']

In [None]:
for stat in d.iloc[:,15:79]:
    d['{}_rolling'.format(stat)] = np.nan

In [None]:
d.info()

In [None]:
for team in d['team'].unique():
    mask = d['team'] == team
    d4 = d[mask]
    for season in d4['Season'].unique():
        mask = d4['Season'] == '{}'.format(season)
        d5 = d4[mask]
        for stat in list(d5.iloc[:,15:79].columns):
            d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)
        d.update(d5)

In [None]:
d[d['game_num']>5]

For each team, get a single season, calculate rolling averages, then insert into our DataFrame (d). Necessary since every season has potentially different number of games

In [None]:
for season in d4['Season'].unique():
    mask = d4['Season'] == '{}'.format(season)
    d5 = d4[mask]
    d5
    for stat in list(d5.iloc[:,15:79].columns):
        d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)
    #print(len(d4[d4['Season'] == '{}'.format(season)]))

In [None]:
d5.columns[82:146]

In [None]:
d5.index.values

In [None]:
d5

In [None]:
d.update(d5)

In [None]:
d.iloc[28947:28970,:]

In [None]:
stat_list = list(d3.iloc[:,5:].columns)

In [None]:
for stat in stat_list:
    d3['{}_rolling'.format(stat)] = d3.rolling(window=5)['{}'.format(stat)].mean().shift(1)

In [None]:
d3

## Scratch Work Graveyard

Processing to merge datasets graveyard

In [None]:
#tried self-merging on game_id, but this is only applicable if in different columns
#df1 = df1.merge(
#            right=df1[opp_pull_cols],
#            left_on=["game_id", "team"],
#            right_on=["game_id", "opp"],
#            suffixes=[None, "_opp"],
#        )

In [None]:
#tried to use drop_duplicates method but can only keep first or last
#df.drop_duplicates(subset=['game_id'], keep='second')

Rolling average graveyard

In [None]:
#found online, talks about rolling average based on multiple columns, never tried
#df.loc[:, 'value_sma_10'] = df.groupby(by='object')[['object', 'period']].rolling(window=10, min_periods=1, on='period').mean().reset_index(level='object')['value']

In [None]:
#found online, got it to work, but not quite applicable to this situation
#span = 5
#sma = d2.rolling(window=span, min_periods=span).mean()[:span]
#rest = d2[span:]
#pd.concat([sma, rest]).ewm(span=span, adjust=False).mean()

In [None]:
#didn't work - ValueError: cannot reindex from a duplicate axis
#d2['eFg_rolling'] = d2.groupby(['team_season','game_num'])['eFg'].rolling(10).mean().droplevel(level=[0])

In [None]:
#df1 = d.copy()
#df1

In [None]:
#df1 = d.groupby(['team_season','game_num']).rolling(5)['eFg'].mean().reset_index(drop=True)

In [None]:
#df1['pace_rolling'] = d.groupby(['team_season','game_num'])[5:,'pace'].transform(lambda x: x.rolling(10, 10).mean())