In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Usable Code

In [2]:
!pwd

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [3]:
files = sorted([f for f in os.listdir('csv') if '.csv' in f])

In [4]:
files

['07-08.csv',
 '08-09.csv',
 '09-10.csv',
 '10-11.csv',
 '11-12.csv',
 '12-13.csv',
 '13-14.csv',
 '14-15.csv',
 '15-16.csv',
 '16-17.csv',
 '17-18.csv',
 '18-19.csv']

In [170]:
def clean_odds(file):
    '''
    cleans csv file and returns dataframe object that contains the over/under lines (target)
    '''
    df = pd.read_csv(file)
    df = df.iloc[:,:13] #some files contained unnamed extra rows
    df.dropna(axis=0,inplace=True) #and others columns
    
    #makes a list of game_id's equivalent to how many games were played that season
    mylist = []
    for i in range(1, int(df.shape[0]/2 + 1)):
        mylist.append(i)
        mylist.append(i)
    df['game_id'] = mylist #maps game_id's to games (spread across 2 rows)
    
    df = df.merge(df, on='game_id', suffixes=(None,'_2')) #gets games to be in one row (creates 4/game)
    df = df[1::4] #gets correct mapping
    df.reset_index(drop=True, inplace=True)
    
    #make dates include year so we can join with another dataset later
    df['Date'] = df['Date'].astype('int')
    df['Date'] = df['Date'].astype('str')
    #df['Date'] = df['Date'].str.strip('.0')
    dates = []
    for date in df['Date']:
        if int(date) > 1000:
            date = '20'+file[:2]+date #GET FILENAMES AND USE HERE
            dates.append(date)
        else:
            date = '20'+file[3:5]+'0'+date #GET FILENAMES AND USE HERE
            dates.append(date)
    df['Date'] = dates
    
    #take out pick-em's and replace with zero for later transformation
    df['Open'].replace(['PK','pk'],'0',inplace=True)
    df['Close'].replace(['PK','pk'],'0',inplace=True)
    df['Open_2'].replace(['PK','pk'],'0',inplace=True)
    df['Close_2'].replace(['PK','pk'],'0',inplace=True)
    df.replace('197.5u10','197.5',inplace=True) #one unique occurrence
    #map strings as floats for comparison
    df['Open'] = df['Open'].astype('float')
    df['Open_2'] = df['Open_2'].astype('float')
    df['Close'] = df['Close'].astype('float')
    df['Close_2'] = df['Close_2'].astype('float')
    #get correct over/under line at opening and closing of sportsbook
    df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
    df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
    
    #rename columns to correct home/visitor 
    df['Visitor'] = df['Team']
    df['Home'] = df['Team_2']
    
    #add season column
    df['Season'] = file[:2] + file[3:5]
    
    #drop unneeded info
    df = df[['Date','Home','Visitor','O/U_open','O/U_close','Season']]
    return df

Use this clean_odds function to generate our dataset to combine with our scraped game data.

In [165]:
for file in files:
    print(file[:2]+file[3:5])

0708
0809
0910
1011
1112
1213
1314
1415
1516
1617
1718
1819


In [171]:
cd csv

[Errno 2] No such file or directory: 'csv'
/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv


In [172]:
totals = pd.concat([clean_odds(f) for f in files])

In [173]:
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season
0,20071030,SanAntonio,Portland,184.0,189.5,0708
1,20071030,GoldenState,Utah,214.5,212.0,0708
2,20071030,LALakers,Houston,191.0,199.0,0708
3,20071031,Toronto,Philadelphia,190.0,191.0,0708
4,20071031,Indiana,Washington,200.0,203.5,0708
...,...,...,...,...,...,...
1307,20190602,Toronto,GoldenState,216.0,213.5,1819
1308,20190605,GoldenState,Toronto,214.0,209.5,1819
1309,20190607,GoldenState,Toronto,216.0,215.0,1819
1310,20190610,Toronto,GoldenState,212.0,217.0,1819


In [174]:
totals.reset_index(drop=True,inplace=True)
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season
0,20071030,SanAntonio,Portland,184.0,189.5,0708
1,20071030,GoldenState,Utah,214.5,212.0,0708
2,20071030,LALakers,Houston,191.0,199.0,0708
3,20071031,Toronto,Philadelphia,190.0,191.0,0708
4,20071031,Indiana,Washington,200.0,203.5,0708
...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,1819
15516,20190605,GoldenState,Toronto,214.0,209.5,1819
15517,20190607,GoldenState,Toronto,216.0,215.0,1819
15518,20190610,Toronto,GoldenState,212.0,217.0,1819


Let's pickle this initial processing of the csv's and subsequent DataFrame. We can always comeback to this if we need it.

In [175]:
with open('lines.pickle', 'wb') as to_write:
    pickle.dump(totals, to_write)

In [14]:
pwd

'/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv'

In [15]:
cd ..

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [16]:
with open('game_df.pickle','rb') as read_file:
    game_df = pickle.load(read_file)
    
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.538,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.512,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.485,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.552,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.577,.301,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.493,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.523,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.527,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.542,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4


In [19]:
team_abbrev = game_df['home'].unique()
team_abbrev

array(['SAS', 'GSW', 'LAL', 'IND', 'ORL', 'TOR', 'NJN', 'CLE', 'MEM',
       'NOH', 'DEN', 'MIA', 'UTA', 'SEA', 'CHA', 'ATL', 'BOS', 'MIN',
       'CHI', 'LAC', 'PHO', 'PHI', 'WAS', 'DAL', 'HOU', 'MIL', 'NYK',
       'DET', 'SAC', 'POR', 'OKC', 'BRK', 'NOP', 'CHO'], dtype=object)

In [20]:
len(team_abbrev)

34

In [21]:
totals['Home'].unique()

array(['SanAntonio', 'GoldenState', 'LALakers', 'Toronto', 'Indiana',
       'Orlando', 'NewJersey', 'Cleveland', 'Memphis', 'NewOrleans',
       'Denver', 'Miami', 'Utah', 'Seattle', 'Charlotte', 'Atlanta',
       'Boston', 'Minnesota', 'Chicago', 'Phoenix', 'LAClippers',
       'Philadelphia', 'Washington', 'Milwaukee', 'Houston', 'Dallas',
       'NewYork', 'Detroit', 'Sacramento', 'Portland', 'OklahomaCity',
       'Brooklyn'], dtype=object)

In [22]:
team_dict = {
    'SAS': 'SanAntonio',
    'GSW': 'GoldenState',
    'LAL': 'LALakers',
    'TOR': 'Toronto',
    'IND': 'Indiana',
    'ORL': 'Orlando',
    'NJN': 'NewJersey',
    'CLE': 'Cleveland',
    'MEM': 'Memphis',
    'NOH': 'NewOrleans',
    'NOP': 'NewOrleans',
    'DEN': 'Denver',
    'MIA': 'Miami',
    'UTA': 'Utah',
    'SEA': 'Seattle',
    'CHA': 'Charlotte',
    'CHO': 'Charlotte',
    'ATL': 'Atlanta',
    'BOS': 'Boston',
    'MIN': 'Minnesota',
    'CHI': 'Chicago',
    'PHO': 'Phoenix',
    'LAC': 'LAClippers',
    'PHI': 'Philadelphia',
    'WAS': 'Washington',
    'MIL': 'Milwaukee',
    'HOU': 'Houston',
    'DAL': 'Dallas',
    'NYK': 'NewYork',
    'DET': 'Detroit',
    'SAC': 'Sacramento',
    'POR': 'Portland',
    'OKC': 'OklahomaCity',
    'BRK': 'Brooklyn'
}

In [23]:
game_df['home'].map(team_dict)

0         SanAntonio
1        GoldenState
2           LALakers
3            Indiana
4            Orlando
            ...     
15516        Toronto
15517    GoldenState
15518    GoldenState
15519        Toronto
15520    GoldenState
Name: home, Length: 15521, dtype: object

In [24]:
game_df['home_team'] = game_df['home'].map(team_dict)

In [25]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.301,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto


In [26]:
game_df['game_date'] = game_df['game_id'].apply(lambda x: x[:8])

In [27]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610


In [28]:
game_df['id'] = game_df['game_date'] + game_df['home_team']

In [29]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date,id
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030,20071030SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030,20071030GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030,20071030LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031,20071031Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando,20071031,20071031Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602,20190602Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605,20190605GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607,20190607GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610,20190610Toronto


In [176]:
totals['id'] = totals['Date'] + totals['Home']

In [177]:
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season,id
0,20071030,SanAntonio,Portland,184.0,189.5,0708,20071030SanAntonio
1,20071030,GoldenState,Utah,214.5,212.0,0708,20071030GoldenState
2,20071030,LALakers,Houston,191.0,199.0,0708,20071030LALakers
3,20071031,Toronto,Philadelphia,190.0,191.0,0708,20071031Toronto
4,20071031,Indiana,Washington,200.0,203.5,0708,20071031Indiana
...,...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,1819,20190602Toronto
15516,20190605,GoldenState,Toronto,214.0,209.5,1819,20190605GoldenState
15517,20190607,GoldenState,Toronto,216.0,215.0,1819,20190607GoldenState
15518,20190610,Toronto,GoldenState,212.0,217.0,1819,20190610Toronto


In [178]:
totals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15520 entries, 0 to 15519
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       15520 non-null  object 
 1   Home       15520 non-null  object 
 2   Visitor    15520 non-null  object 
 3   O/U_open   15520 non-null  float64
 4   O/U_close  15520 non-null  float64
 5   Season     15520 non-null  object 
 6   id         15520 non-null  object 
dtypes: float64(2), object(5)
memory usage: 848.9+ KB


In [33]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15521 entries, 0 to 15520
Data columns (total 71 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       15521 non-null  object
 1   total         15521 non-null  int64 
 2   away          15521 non-null  object
 3   home          15521 non-null  object
 4   pace_v        15521 non-null  object
 5   pace_h        15521 non-null  object
 6   eFg_v         15521 non-null  object
 7   eFg_h         15521 non-null  object
 8   tov_v         15521 non-null  object
 9   tov_h         15521 non-null  object
 10  orb_v         15521 non-null  object
 11  orb_h         15521 non-null  object
 12  ft_fga_v      15521 non-null  object
 13  ft_fga_h      15521 non-null  object
 14  ortg_v        15521 non-null  object
 15  ortg_h        15521 non-null  object
 16  fg_v          15521 non-null  object
 17  fga_v         15521 non-null  object
 18  fg_per_v      15521 non-null  object
 19  thre

Now both databases have a column to merge on. The 'id' column.

In [179]:
df = pd.merge(totals,game_df,on='id')

In [180]:
df

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,Season,id,game_id,total,away,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,20071030,SanAntonio,Portland,184.0,189.5,0708,20071030SanAntonio,200710300SAS,203,POR,...,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030
1,20071030,GoldenState,Utah,214.5,212.0,0708,20071030GoldenState,200710300GSW,213,UTA,...,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030
2,20071030,LALakers,Houston,191.0,199.0,0708,20071030LALakers,200710300LAL,188,HOU,...,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030
3,20071031,Toronto,Philadelphia,190.0,191.0,0708,20071031Toronto,200710310TOR,203,PHI,...,.200,64.3,44.6,56.1,8.8,6.2,100.0,106.8,Toronto,20071031
4,20071031,Indiana,Washington,200.0,203.5,0708,20071031Indiana,200710310IND,229,WAS,...,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,Toronto,GoldenState,216.0,213.5,1819,20190602Toronto,201906020TOR,213,GSW,...,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602
15011,20190605,GoldenState,Toronto,214.0,209.5,1819,20190605GoldenState,201906050GSW,232,TOR,...,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605
15012,20190607,GoldenState,Toronto,216.0,215.0,1819,20190607GoldenState,201906070GSW,197,TOR,...,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607
15013,20190610,Toronto,GoldenState,212.0,217.0,1819,20190610Toronto,201906100TOR,211,GSW,...,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610


In [181]:
df.shape

(15015, 77)

Drop columns that share the same information.

In [182]:
df.drop(columns=['Home','Visitor','game_id','home_team','game_date'],inplace=True)

In [183]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,20071030,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,.538,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7
1,20071030,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,.512,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3
2,20071030,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,.485,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1
3,20071031,190.0,191.0,0708,20071031Toronto,203,PHI,TOR,90.9,90.9,...,.573,.188,.200,64.3,44.6,56.1,8.8,6.2,100.0,106.8
4,20071031,200.0,203.5,0708,20071031Indiana,229,WAS,IND,98.4,98.4,...,.552,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,1819,20190602Toronto,213,GSW,TOR,100.0,100.0,...,.493,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0
15011,20190605,214.0,209.5,1819,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,.523,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5
15012,20190607,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,.527,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7
15013,20190610,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,.542,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4


Getting the difference between the line and the outcome of a particular game in order to classify our game as an "Over" or "Under".

In [None]:
df['ou1'] = df['total'] - df['O/U_open']
df['ou2'] = df['total'] - df['O/U_close']

Function to label our target based off information we have in our DataFrame, and then applying that function to our newly created columns that currently classify our games based on a positive or negative value. This simply puts a more general categorical label over that more granular scalar indicator.

In [None]:
def over_under(ou):
    if ou > 0:
        return 1
    elif ou == 0:
        return 2
    else:
        return 0

In [None]:
df['Over/Under_open'] = df['ou1'].apply(over_under)
df['Over/Under_close'] = df['ou2'].apply(over_under)

CRITICAL ASSUMPTION: Making the decision to drop "pushes". A "push" is when a betting line is hit exactly, and no money exchanges hands. In the context of this project, a total could be at 212 points, and if the game finishes with exactly 212 points, then every bettor gets their money back, on both sides (over/under 212), and the sportsbook doesn't collect anything.

In [None]:
df = df[df['Over/Under_open'] != 2]
df = df[df['Over/Under_close'] != 2]

About 400 instances of a "push", or about 2.7% of our original dataset. This number can be remembered for sampling or simulation purposes later.

Convert date to pd.datetime object. Makes plotting the time series compatible with matplotlib, and building season dictionary later.

In [None]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

### Stat building (feature engineering)

We want to ultimately end up with a model that takes both teams' recent performances and can make a prediction on the total, so we can make a new column to hold home/visitor every other row, and then rename the _h and _v columns for self, opponent.

In [None]:
d = df.reindex(df.index.repeat(2)).reset_index(drop=True)

Now that we have every game repeated, let's create a column to indicate which team these self/opponent stats represent. We can do this by creating an empty column and selectively copy from the 'home' and 'away' columns that repeat across the 2 rows. Then we'll do that same for the opponent.

In [None]:
d['team'] = np.nan
#home team will be all even indexes of this dataset
d['team'][::2] = d['home'][::2]
#away team will be all odd indexes of this dataset
d['team'][1::2] = d['away'][1::2]

In [None]:
d['opp'] = np.nan
d['opp'][::2] = d['away'][::2]
d['opp'][1::2] = d['home'][1::2]

Let's keep track of home/away, can't hurt. We'll use the same process as we just used. Note: this can also be used as a sanity check to make sure that our splitting worked correctly. If it did, we should see 'H', 'V' every other row.

In [None]:
d['home/away'] = np.nan
#home team will be all even indexes of this dataset
d['home/away'][::2] = 'H'
#away team will be all odd indexes of this dataset
d['home/away'][1::2] = 'V'

Now we have to map the _v and _h stats appropriate to new columns for self or opponent stats.

In [None]:
stats = ['pace','eFg','tov','orb','ft_fga','ortg','fg','fga','fg_per','threes','threes_att','threes_per',
         'ft','fta','ft_per','drb','trb','ast','stl','blk','to','fouls','ts_per','threes_ar','ft_ar',
         'drb_per','trb_per','ast_per','stl_per','blk_per','user_per','drtg']
for stat in stats:
    d['{}'.format(stat)] = ""
    d['{}'.format(stat)][::2] = d['{}_h'.format(stat)][::2]
    d['{}'.format(stat)][1::2] = d['{}_v'.format(stat)][1::2]
    d['{}_opp'.format(stat)] = ""
    d['{}_opp'.format(stat)][::2] = d['{}_v'.format(stat)][::2]
    d['{}_opp'.format(stat)][1::2] = d['{}_h'.format(stat)][1::2]

Then we can drop all of our columns with '_v' & '_h' since they contain extra, and now, redundant information.

In [None]:
mylist = []
for stat in stats:
    mylist.append('{}_v'.format(stat))
    mylist.append('{}_h'.format(stat))
#print(mylist)
d = d.drop(columns=mylist)

Now our DataFrame (d) has stats for each team and their opponent for every game. Now let's get some rolling averages. First we'll create an empty column that we'll populate with a rolling count for every time that team has appeared. This will get us the number of games each team has played up INCLUDING that game for each season. We can use that info to incorporate some rolling averages.

In [None]:
d['team_season'] = d['team'] + d['Season']

In [None]:
d['game_num'] = d.groupby('team_season').cumcount()+1

Now we are ready to compute some rolling average's in a certain season. Remember, we want to get a snapshot of how both team's have perfromed recently (last 5 games), and get a classification from this info. So let's go back to our method of creating empty columns we can assign data to.

In [None]:
for stat in d.iloc[:,15:79]:
    d['{}_rolling'.format(stat)] = np.nan

Cast our stat columns as floats to perform some operations on them.

In [None]:
for col in d.iloc[:,15:79].columns:
    d[col] = d[col].astype('float')

This may not be the most pythonic way to populate these new columns, but it works, and doesn't take long. At a high level, we are going team by team, then season by season, for that team, and applying 5 game rolling averages to their stats and opponent stats.

In [None]:
for team in d['team'].unique():
    mask = d['team'] == team
    d4 = d[mask]
    for season in d4['Season'].unique():
        mask = d4['Season'] == '{}'.format(season)
        d5 = d4[mask]
        for stat in list(d5.iloc[:,15:79].columns):
            d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)
        d.update(d5)

IMPORTANT NOTE: this DataFrame contains rows with NaN's. Every game that is in the first 5 of the season has no rolling averages, this can be problematic when we take this DataFrame and try to feed it into a sk-learn ML model. We can address this now, by simply dropping these games(~5% of our dataset), or apply the previous season's median rolling average. This is where some decisions need to be made, and I would advise just dropping the games for 2 reasons. One, taking out 5 games won't change the application our model will have as there are 67-75 games left in the year we can apply this model to once in production. Two, a median of the rolling averages, or any other kind of congregation statistic being applied over different seasons is mostly likely not a sound choice in light of team personnel turnover, from retirings, trades, the draft, and free agent moves; not to mention new coaching staffs. This could be applicable to a few teams (ones that experience little of this roster turnover), but not enough to apply that thinking across the entire dataset.

In [None]:
d[d['game_num']>5]

Now that our data is fully processed, we are ready to process this in a classification algorithm

In [None]:
with open('df.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

## Scratch Work

In [None]:
df = pd.read_csv(files[2])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df = df.iloc[:,:13]

In [None]:
df.info()

In [None]:
pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 101)

In [None]:
df.shape[0]

In [None]:
mylist = []
for i in range(1, int(df.shape[0]/2 + 1)):
    mylist.append(i)
    mylist.append(i)
print(mylist[0],mylist[-1])

In [None]:
len(range(1,int(df.shape[0]/2 + 1)))

In [None]:
df['game_id'] = mylist

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df.info()

In [None]:
df = df.merge(df, on='game_id', suffixes=(None,'_2'))

In [None]:
df

In [None]:
df = df[1::4]
df.reset_index(drop=True, inplace=True)

In [None]:
df

In [None]:
#sanity check that every game only appears once
df['game_id']

In [None]:
df['Date'] = df['Date'].astype('int')
df['Date']

In [None]:
df['Date'] = df['Date'].astype('str')
df['Date']

In [None]:
df['Date'] = df['Date'].str.strip('.0')

In [None]:
df['Date'].head

In [None]:
dates = []
for date in df['Date']:
    if int(date) > 1000:
        date = '20'+files[2][:2]+date
        dates.append(date)
    else:
        date = '20'+files[2][3:5]+'0'+date
        dates.append(date)
df['Date'] = dates

In [None]:
df['Date']

In [None]:
df

In [None]:
df['Visitor'] = df['Team']
df['Home'] = df['Team_2']
df

In [None]:
df['Open'].replace('pk','0',inplace=True)
df['Close'].replace('pk','0',inplace=True)
df['Open_2'].replace('pk','0',inplace=True)
df['Close_2'].replace('pk','0',inplace=True)
df.replace('197.5u10','197.5',inplace=True)

In [None]:
mask = df['Open'].values == 'pk'
df[mask]

In [None]:
df['Open'] = df['Open'].astype('float')
df['Open_2'] = df['Open_2'].astype('float')
df['Close'] = df['Close'].astype('float')
df['Close_2'] = df['Close_2'].astype('float')

In [None]:
df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
df

In [None]:
df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
df

Sanity check that opening and closing lines operations worked successfully.
So long as numbers are around 200, we know we are ok.

In [None]:
df['O/U_open'].min()

In [None]:
df['O/U_close'].min()

In [None]:
df = df[['Date','Home','Visitor','O/U_open','O/U_close']]
df

### Processing our lines df

First, and most importantly, we must create our target labels. This will be done by comparing the total the the opening and closing lines, and mapping that result to one of three categories: Over, Under, or Push. This will represent the winning result of that game.

In [184]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,20071030,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,.538,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7
1,20071030,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,.512,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3
2,20071030,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,.485,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1
3,20071031,190.0,191.0,0708,20071031Toronto,203,PHI,TOR,90.9,90.9,...,.573,.188,.200,64.3,44.6,56.1,8.8,6.2,100.0,106.8
4,20071031,200.0,203.5,0708,20071031Indiana,229,WAS,IND,98.4,98.4,...,.552,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,1819,20190602Toronto,213,GSW,TOR,100.0,100.0,...,.493,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0
15011,20190605,214.0,209.5,1819,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,.523,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5
15012,20190607,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,.527,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7
15013,20190610,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,.542,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4


In [185]:
df['ou1'] = df['total'] - df['O/U_open']
df['ou2'] = df['total'] - df['O/U_close']

In [186]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,ou1,ou2
0,20071030,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,19.0,13.5
1,20071030,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,-1.5,1.0
2,20071030,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,-3.0,-11.0
3,20071031,190.0,191.0,0708,20071031Toronto,203,PHI,TOR,90.9,90.9,...,.200,64.3,44.6,56.1,8.8,6.2,100.0,106.8,13.0,12.0
4,20071031,200.0,203.5,0708,20071031Indiana,229,WAS,IND,98.4,98.4,...,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,29.0,25.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,1819,20190602Toronto,213,GSW,TOR,100.0,100.0,...,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,-3.0,-0.5
15011,20190605,214.0,209.5,1819,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,18.0,22.5
15012,20190607,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,-19.0,-18.0
15013,20190610,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,-1.0,-6.0


In [187]:
def over_under(ou):
    if ou > 0:
        return 1
    elif ou == 0:
        return 2
    else:
        return 0

In [188]:
df['Over/Under_open'] = df['ou1'].apply(over_under)
df['Over/Under_close'] = df['ou2'].apply(over_under)

In [189]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,ou1,ou2,Over/Under_open,Over/Under_close
0,20071030,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,50.0,51.2,8.7,6.2,100.0,105.7,19.0,13.5,1,1
1,20071030,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,39.8,59.4,7.6,11.4,100.0,111.3,-1.5,1.0,0,1
2,20071030,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,43.0,56.3,17.2,5.8,100.0,102.1,-3.0,-11.0,0,0
3,20071031,190.0,191.0,0708,20071031Toronto,203,PHI,TOR,90.9,90.9,...,44.6,56.1,8.8,6.2,100.0,106.8,13.0,12.0,1,1
4,20071031,200.0,203.5,0708,20071031Indiana,229,WAS,IND,98.4,98.4,...,50.0,52.6,11.0,7.6,100.0,101.2,29.0,25.5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,1819,20190602Toronto,213,GSW,TOR,100.0,100.0,...,53.8,48.6,8.0,4.2,100.0,109.0,-3.0,-0.5,0,0
15011,20190605,214.0,209.5,1819,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,50.6,69.4,8.0,6.8,100.0,123.5,18.0,22.5,1,1
15012,20190607,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,51.9,74.3,6.3,11.1,100.0,110.7,-19.0,-18.0,0,0
15013,20190610,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,53.8,50.0,6.4,12.5,100.0,112.4,-1.0,-6.0,0,0


Making the decision to drop pushes as no money exchanges hands in this scenerio.

In [190]:
df = df[df['Over/Under_open'] != 2]
df = df[df['Over/Under_close'] != 2]

In [191]:
df

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,ou1,ou2,Over/Under_open,Over/Under_close
0,20071030,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,50.0,51.2,8.7,6.2,100.0,105.7,19.0,13.5,1,1
1,20071030,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,39.8,59.4,7.6,11.4,100.0,111.3,-1.5,1.0,0,1
2,20071030,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,43.0,56.3,17.2,5.8,100.0,102.1,-3.0,-11.0,0,0
3,20071031,190.0,191.0,0708,20071031Toronto,203,PHI,TOR,90.9,90.9,...,44.6,56.1,8.8,6.2,100.0,106.8,13.0,12.0,1,1
4,20071031,200.0,203.5,0708,20071031Indiana,229,WAS,IND,98.4,98.4,...,50.0,52.6,11.0,7.6,100.0,101.2,29.0,25.5,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,1819,20190602Toronto,213,GSW,TOR,100.0,100.0,...,53.8,48.6,8.0,4.2,100.0,109.0,-3.0,-0.5,0,0
15011,20190605,214.0,209.5,1819,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,50.6,69.4,8.0,6.8,100.0,123.5,18.0,22.5,1,1
15012,20190607,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,51.9,74.3,6.3,11.1,100.0,110.7,-19.0,-18.0,0,0
15013,20190610,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,53.8,50.0,6.4,12.5,100.0,112.4,-1.0,-6.0,0,0


About 400 instances of a push, or about 2.7% of our original dataset. This number can be remembered for sampling or simulation purposes later.

### Stat Building

We have the same set of stats for both teams in any one game, so we can build offense/defense for both teams.

In [None]:
'''map stats accordingly:
1. Get 2 "sets" of stats per game:

visitor: offense - _v stat avgs heading into the game; defense - _h stat avgs heading into the game
home: offese - _h stat avgs heading into the game; defense - _v stat avgs heading into the game

2. Map visitor/home stats to respective teams

3. Build dictionary of team's seasons to be further processed.

{1415: {GSW: {..game_35:{offense/defense stats},game_36:{..}

4. Process dictionary to have more stats/potential model features:

{GSW: {..game_35:{offense/defense stats averaged through 34 games},game_36:{..}

5. ?

'''

In [192]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14622 entries, 0 to 15014
Data columns (total 76 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Date              14622 non-null  object 
 1   O/U_open          14622 non-null  float64
 2   O/U_close         14622 non-null  float64
 3   Season            14622 non-null  object 
 4   id                14622 non-null  object 
 5   total             14622 non-null  int64  
 6   away              14622 non-null  object 
 7   home              14622 non-null  object 
 8   pace_v            14622 non-null  object 
 9   pace_h            14622 non-null  object 
 10  eFg_v             14622 non-null  object 
 11  eFg_h             14622 non-null  object 
 12  tov_v             14622 non-null  object 
 13  tov_h             14622 non-null  object 
 14  orb_v             14622 non-null  object 
 15  orb_h             14622 non-null  object 
 16  ft_fga_v          14622 non-null  object

Convert date to pd.datetime object. May help with building season dictionary later.

In [201]:
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

In [202]:
df['Date']

0       2007-10-30
1       2007-10-30
2       2007-10-30
3       2007-10-31
4       2007-10-31
           ...    
15010   2019-06-02
15011   2019-06-05
15012   2019-06-07
15013   2019-06-10
15014   2019-06-13
Name: Date, Length: 14622, dtype: datetime64[ns]

In [461]:
d = df.reindex(df.index.repeat(2)).reset_index(drop=True)

Now that we have every game twice, we can make a new column to hold home/visitor every other row, and then rename the _h and _v columns for self, opponent.

In [462]:
d

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,pace_v,pace_h,...,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,ou1,ou2,Over/Under_open,Over/Under_close
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,50.0,51.2,8.7,6.2,100.0,105.7,19.0,13.5,1,1
1,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,50.0,51.2,8.7,6.2,100.0,105.7,19.0,13.5,1,1
2,2007-10-30,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,39.8,59.4,7.6,11.4,100.0,111.3,-1.5,1.0,0,1
3,2007-10-30,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,39.8,59.4,7.6,11.4,100.0,111.3,-1.5,1.0,0,1
4,2007-10-30,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,93.0,93.0,...,43.0,56.3,17.2,5.8,100.0,102.1,-3.0,-11.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,51.9,74.3,6.3,11.1,100.0,110.7,-19.0,-18.0,0,0
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,53.8,50.0,6.4,12.5,100.0,112.4,-1.0,-6.0,0,0
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,94.3,94.3,...,53.8,50.0,6.4,12.5,100.0,112.4,-1.0,-6.0,0,0
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224,TOR,GSW,94.6,94.6,...,51.9,71.8,9.5,12.2,100.0,120.5,13.0,12.5,1,1


Let's create the column we want to place our values, then selectively copy.

In [463]:
d['team'] = np.nan

In [464]:
#home team will be all even indexes of this dataset
d['team'][::2] = d['home'][::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['team'][::2] = d['home'][::2]


In [465]:
#away team will be all odd indexes of this dataset
d['team'][1::2] = d['away'][1::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['team'][1::2] = d['away'][1::2]


In [466]:
d['opp'] = np.nan

In [467]:
d['opp'][::2] = d['away'][::2]
d['opp'][1::2] = d['home'][1::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['opp'][::2] = d['away'][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['opp'][1::2] = d['home'][1::2]


In [468]:
d[['team','opp']]

Unnamed: 0,team,opp
0,SAS,POR
1,POR,SAS
2,GSW,UTA
3,UTA,GSW
4,LAL,HOU
...,...,...
29239,TOR,GSW
29240,TOR,GSW
29241,GSW,TOR
29242,GSW,TOR


Let's keep track of home/away, can't hurt. We'll use the same process as we just used.

In [469]:
d['home/away'] = np.nan
#home team will be all even indexes of this dataset
d['home/away'][::2] = 'H'
#away team will be all odd indexes of this dataset
d['home/away'][1::2] = 'V'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['home/away'][::2] = 'H'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['home/away'][1::2] = 'V'


In [470]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Data columns (total 79 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              29244 non-null  datetime64[ns]
 1   O/U_open          29244 non-null  float64       
 2   O/U_close         29244 non-null  float64       
 3   Season            29244 non-null  object        
 4   id                29244 non-null  object        
 5   total             29244 non-null  int64         
 6   away              29244 non-null  object        
 7   home              29244 non-null  object        
 8   pace_v            29244 non-null  object        
 9   pace_h            29244 non-null  object        
 10  eFg_v             29244 non-null  object        
 11  eFg_h             29244 non-null  object        
 12  tov_v             29244 non-null  object        
 13  tov_h             29244 non-null  object        
 14  orb_v             2924

Now we have to map the _v and _h stats appropriate to new columns for self or opponent stats.

In [471]:
stats = ['pace','eFg','tov','orb','ft_fga','ortg','fg','fga','fg_per','threes','threes_att','threes_per',
         'ft','fta','ft_per','drb','trb','ast','stl','blk','to','fouls','ts_per','threes_ar','ft_ar',
         'drb_per','trb_per','ast_per','stl_per','blk_per','user_per','drtg']
for stat in stats:
    d['{}'.format(stat)] = ""
    d['{}'.format(stat)][::2] = d['{}_h'.format(stat)][::2]
    d['{}'.format(stat)][1::2] = d['{}_v'.format(stat)][1::2]
    d['{}_opp'.format(stat)] = ""
    d['{}_opp'.format(stat)][::2] = d['{}_v'.format(stat)][::2]
    d['{}_opp'.format(stat)][1::2] = d['{}_h'.format(stat)][1::2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['{}'.format(stat)][::2] = d['{}_h'.format(stat)][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['{}'.format(stat)][1::2] = d['{}_v'.format(stat)][1::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d['{}_opp'.format(stat)][::2] = d['{}_v'.format(stat)][::2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-c

In [472]:
mylist = []
for stat in stats:
    mylist.append('{}_v'.format(stat))
    mylist.append('{}_h'.format(stat))
#print(mylist)
d = d.drop(columns=mylist)

In [473]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Data columns (total 79 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              29244 non-null  datetime64[ns]
 1   O/U_open          29244 non-null  float64       
 2   O/U_close         29244 non-null  float64       
 3   Season            29244 non-null  object        
 4   id                29244 non-null  object        
 5   total             29244 non-null  int64         
 6   away              29244 non-null  object        
 7   home              29244 non-null  object        
 8   ou1               29244 non-null  float64       
 9   ou2               29244 non-null  float64       
 10  Over/Under_open   29244 non-null  int64         
 11  Over/Under_close  29244 non-null  int64         
 12  team              29244 non-null  object        
 13  opp               29244 non-null  object        
 14  home/away         2924

Now our DataFrame (d) has stats for each team and their opponent for every game. Now let's get some rolling averages.

First we'll create an empty column that we'll populate with a rolling count for every time that team has appeared. This will get us the number of games each team has played up INCLUDING that game for each season. We can use that info to incorporate some rolling averages.

In [474]:
d['team_season'] = d['team'] + d['Season']

In [475]:
d

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_opp,stl_per,stl_per_opp,blk_per,blk_per_opp,user_per,user_per_opp,drtg,drtg_opp,team_season
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,19.0,13.5,...,38.5,8.7,1.1,6.2,6.3,100.0,100.0,105.7,115.5,SAS0708
1,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,19.0,13.5,...,51.2,1.1,8.7,6.3,6.2,100.0,100.0,115.5,105.7,POR0708
2,2007-10-30,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,-1.5,1.0,...,58.5,7.6,8.6,11.4,13.0,100.0,100.0,111.3,91.3,GSW0708
3,2007-10-30,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,-1.5,1.0,...,59.4,8.6,7.6,13.0,11.4,100.0,100.0,91.3,111.3,UTA0708
4,2007-10-30,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,-3.0,-11.0,...,67.6,17.2,10.7,5.8,7.4,100.0,100.0,102.1,99.9,LAL0708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,-19.0,-18.0,...,74.3,12.7,6.3,7.8,11.1,100.0,100.0,97.0,110.7,TOR1819
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,-1.0,-6.0,...,71.1,6.4,5.3,12.5,13.2,100.0,100.0,112.4,111.3,TOR1819
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,-1.0,-6.0,...,50.0,5.3,6.4,13.2,12.5,100.0,100.0,111.3,112.4,GSW1819
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224,TOR,GSW,13.0,12.5,...,64.1,9.5,8.5,12.2,4.1,100.0,100.0,120.5,116.3,GSW1819


In [476]:
d['game_num'] = d.groupby('team_season').cumcount()+1
d

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,stl_per,stl_per_opp,blk_per,blk_per_opp,user_per,user_per_opp,drtg,drtg_opp,team_season,game_num
0,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,19.0,13.5,...,8.7,1.1,6.2,6.3,100.0,100.0,105.7,115.5,SAS0708,1
1,2007-10-30,184.0,189.5,0708,20071030SanAntonio,203,POR,SAS,19.0,13.5,...,1.1,8.7,6.3,6.2,100.0,100.0,115.5,105.7,POR0708,1
2,2007-10-30,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,-1.5,1.0,...,7.6,8.6,11.4,13.0,100.0,100.0,111.3,91.3,GSW0708,1
3,2007-10-30,214.5,212.0,0708,20071030GoldenState,213,UTA,GSW,-1.5,1.0,...,8.6,7.6,13.0,11.4,100.0,100.0,91.3,111.3,UTA0708,1
4,2007-10-30,191.0,199.0,0708,20071030LALakers,188,HOU,LAL,-3.0,-11.0,...,17.2,10.7,5.8,7.4,100.0,100.0,102.1,99.9,LAL0708,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197,TOR,GSW,-19.0,-18.0,...,12.7,6.3,7.8,11.1,100.0,100.0,97.0,110.7,TOR1819,99
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,-1.0,-6.0,...,6.4,5.3,12.5,13.2,100.0,100.0,112.4,111.3,TOR1819,100
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211,GSW,TOR,-1.0,-6.0,...,5.3,6.4,13.2,12.5,100.0,100.0,111.3,112.4,GSW1819,101
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224,TOR,GSW,13.0,12.5,...,9.5,8.5,12.2,4.1,100.0,100.0,120.5,116.3,GSW1819,102


In [477]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Data columns (total 81 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              29244 non-null  datetime64[ns]
 1   O/U_open          29244 non-null  float64       
 2   O/U_close         29244 non-null  float64       
 3   Season            29244 non-null  object        
 4   id                29244 non-null  object        
 5   total             29244 non-null  int64         
 6   away              29244 non-null  object        
 7   home              29244 non-null  object        
 8   ou1               29244 non-null  float64       
 9   ou2               29244 non-null  float64       
 10  Over/Under_open   29244 non-null  int64         
 11  Over/Under_close  29244 non-null  int64         
 12  team              29244 non-null  object        
 13  opp               29244 non-null  object        
 14  home/away         2924

Cast our stat columns as floats to perform some operations on them.

In [478]:
for col in d.iloc[:,15:79].columns:
    d[col] = d[col].astype('float')

In [479]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Data columns (total 81 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date              29244 non-null  datetime64[ns]
 1   O/U_open          29244 non-null  float64       
 2   O/U_close         29244 non-null  float64       
 3   Season            29244 non-null  object        
 4   id                29244 non-null  object        
 5   total             29244 non-null  int64         
 6   away              29244 non-null  object        
 7   home              29244 non-null  object        
 8   ou1               29244 non-null  float64       
 9   ou2               29244 non-null  float64       
 10  Over/Under_open   29244 non-null  int64         
 11  Over/Under_close  29244 non-null  int64         
 12  team              29244 non-null  object        
 13  opp               29244 non-null  object        
 14  home/away         2924

In [480]:
d['team_season_game_num'] = d['team_season'] + d['game_num'].astype('str')

In [481]:
d['team_season_game_num']

0          SAS07081
1          POR07081
2          GSW07081
3          UTA07081
4          LAL07081
            ...    
29239     TOR181999
29240    TOR1819100
29241    GSW1819101
29242    GSW1819102
29243    TOR1819101
Name: team_season_game_num, Length: 29244, dtype: object

In [498]:
for stat in d.iloc[:,15:79]:
    d['{}_rolling'.format(stat)] = np.nan

In [499]:
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Columns: 146 entries, Date to drtg_opp_rolling
dtypes: datetime64[ns](1), float64(132), int64(4), object(9)
memory usage: 32.6+ MB


In [509]:
for team in d['team'].unique():
    mask = d['team'] == team
    d4 = d[mask]
    for season in d4['Season'].unique():
        mask = d4['Season'] == '{}'.format(season)
        d5 = d4[mask]
        for stat in list(d5.iloc[:,15:79].columns):
            d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)
        d.update(d5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)


In [512]:
d[d['game_num']>5]

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
129,2007-11-09,188.0,187.5,0708,20071109Philadelphia,208.0,TOR,PHI,20.0,20.5,...,59.00,65.50,8.76,4.34,6.32,9.14,100.0,100.0,104.40,106.62
135,2007-11-09,198.5,201.5,0708,20071109NewYork,214.0,ORL,NYK,15.5,12.5,...,57.58,52.62,4.88,5.94,6.50,8.06,100.0,100.0,105.86,111.16
140,2007-11-09,188.0,186.0,0708,20071109NewOrleans,182.0,SAS,NOH,-6.0,-4.0,...,57.90,59.48,8.08,5.96,6.76,6.48,100.0,100.0,99.96,110.74
141,2007-11-09,188.0,186.0,0708,20071109NewOrleans,182.0,SAS,NOH,-6.0,-4.0,...,58.94,56.28,8.92,6.30,8.36,7.42,100.0,100.0,98.98,105.48
143,2007-11-09,210.5,209.5,0708,20071109Washington,210.0,DEN,WAS,-0.5,0.5,...,60.84,62.20,8.48,11.20,9.70,7.28,100.0,100.0,104.26,101.44
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29239,2019-06-07,216.0,215.0,1819,20190607GoldenState,197.0,TOR,GSW,-19.0,-18.0,...,59.88,75.18,7.62,6.36,10.50,8.00,100.0,100.0,110.36,116.76
29240,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,59.84,76.36,8.86,7.18,10.56,9.24,100.0,100.0,108.36,116.22
29241,2019-06-10,212.0,217.0,1819,20190610Toronto,211.0,GSW,TOR,-1.0,-6.0,...,76.74,61.92,7.44,8.16,8.02,10.60,100.0,100.0,115.08,109.00
29242,2019-06-13,211.0,211.5,1819,20190613GoldenState,224.0,TOR,GSW,13.0,12.5,...,77.92,58.72,6.60,8.50,8.90,11.10,100.0,100.0,115.12,108.88


For each team, get a single season, calculate rolling averages, then insert into our DataFrame (d). Necessary since every season has potentially different number of games

In [448]:
for season in d4['Season'].unique():
    mask = d4['Season'] == '{}'.format(season)
    d5 = d4[mask]
    d5
    for stat in list(d5.iloc[:,15:79].columns):
        d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)
    #print(len(d4[d4['Season'] == '{}'.format(season)]))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d5['{}_rolling'.format(stat)] = d5.rolling(window=5)['{}'.format(stat)].mean().shift(1)


In [494]:
d5.columns[82:146]

Index(['eFg_rolling', 'tov_rolling', 'orb_rolling', 'ft_fga_rolling',
       'pace_rolling', 'pace_opp_rolling', 'eFg_opp_rolling',
       'tov_opp_rolling', 'orb_opp_rolling', 'ft_fga_opp_rolling',
       'ortg_rolling', 'ortg_opp_rolling', 'fg_rolling', 'fg_opp_rolling',
       'fga_rolling', 'fga_opp_rolling', 'fg_per_rolling',
       'fg_per_opp_rolling', 'threes_rolling', 'threes_opp_rolling',
       'threes_att_rolling', 'threes_att_opp_rolling', 'threes_per_rolling',
       'threes_per_opp_rolling', 'ft_rolling', 'ft_opp_rolling', 'fta_rolling',
       'fta_opp_rolling', 'ft_per_rolling', 'ft_per_opp_rolling',
       'drb_rolling', 'drb_opp_rolling', 'trb_rolling', 'trb_opp_rolling',
       'ast_rolling', 'ast_opp_rolling', 'stl_rolling', 'stl_opp_rolling',
       'blk_rolling', 'blk_opp_rolling', 'to_rolling', 'to_opp_rolling',
       'fouls_rolling', 'fouls_opp_rolling', 'ts_per_rolling',
       'ts_per_opp_rolling', 'threes_ar_rolling', 'threes_ar_opp_rolling',
       'ft_ar_

In [456]:
d5.index.values

array([26684, 26711, 26737, 26757, 26789, 26808, 26825, 26864, 26892,
       26924, 26968, 27007, 27039, 27069, 27126, 27148, 27176, 27219,
       27253, 27260, 27282, 27312, 27344, 27385, 27406, 27443, 27476,
       27500, 27516, 27570, 27594, 27665, 27686, 27711, 27738, 27760,
       27799, 27827, 27859, 27919, 27939, 27978, 28002, 28019, 28071,
       28097, 28132, 28159, 28182, 28196, 28238, 28257, 28299, 28325,
       28367, 28384, 28408, 28436, 28460, 28499, 28526, 28566, 28590,
       28617, 28645, 28693, 28723, 28752, 28782, 28806, 28827, 28846,
       28899, 28929, 28947, 28969, 29021, 29045, 29072])

In [507]:
d5

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
26684,2018-10-17,217.0,222.0,1819,20181017Charlotte,225,MIL,CHO,8.0,3.0,...,,,,,,,,,,
26711,2018-10-19,217.0,215.5,1819,20181019Orlando,208,CHO,ORL,-9.0,-7.5,...,,,,,,,,,,
26737,2018-10-20,216.5,220.5,1819,20181020Miami,225,CHO,MIA,8.5,4.5,...,,,,,,,,,,
26757,2018-10-22,221.0,225.0,1819,20181022Toronto,233,CHO,TOR,12.0,8.0,...,,,,,,,,,,
26789,2018-10-24,227.0,229.5,1819,20181024Chicago,222,CHO,CHI,-5.0,-7.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28947,2019-04-01,218.0,218.0,1819,20190401Utah,213,CHO,UTA,-5.0,-5.0,...,69.40,66.70,8.96,6.34,6.84,13.36,100.0,100.0,124.18,114.84
28969,2019-04-03,231.0,229.5,1819,20190403NewOrleans,224,CHO,NOP,-7.0,-5.5,...,65.36,68.38,9.96,6.54,5.74,11.56,100.0,100.0,122.98,110.46
29021,2019-04-07,214.5,217.5,1819,20190407Detroit,195,CHO,DET,-19.5,-22.5,...,59.76,68.00,9.86,6.58,6.84,8.74,100.0,100.0,119.10,107.52
29045,2019-04-09,220.5,220.5,1819,20190409Cleveland,221,CHO,CLE,0.5,0.5,...,58.82,68.86,9.34,6.52,6.58,9.48,100.0,100.0,118.26,107.94


In [502]:
d.update(d5)

In [508]:
d.iloc[28947:28970,:]

Unnamed: 0,Date,O/U_open,O/U_close,Season,id,total,away,home,ou1,ou2,...,ast_per_rolling,ast_per_opp_rolling,stl_per_rolling,stl_per_opp_rolling,blk_per_rolling,blk_per_opp_rolling,user_per_rolling,user_per_opp_rolling,drtg_rolling,drtg_opp_rolling
28947,2019-04-01,218.0,218.0,1819,20190401Utah,213.0,CHO,UTA,-5.0,-5.0,...,69.4,66.7,8.96,6.34,6.84,13.36,100.0,100.0,124.18,114.84
28948,2019-04-01,221.5,224.0,1819,20190401Phoenix,235.0,CLE,PHO,13.5,11.0,...,,,,,,,,,,
28949,2019-04-01,221.5,224.0,1819,20190401Phoenix,235.0,CLE,PHO,13.5,11.0,...,,,,,,,,,,
28950,2019-04-02,224.0,226.0,1819,20190402OklahomaCity,222.0,LAL,OKC,-2.0,-4.0,...,,,,,,,,,,
28951,2019-04-02,224.0,226.0,1819,20190402OklahomaCity,222.0,LAL,OKC,-2.0,-4.0,...,,,,,,,,,,
28952,2019-04-02,227.0,230.0,1819,20190402SanAntonio,228.0,ATL,SAS,1.0,-2.0,...,,,,,,,,,,
28953,2019-04-02,227.0,230.0,1819,20190402SanAntonio,228.0,ATL,SAS,1.0,-2.0,...,,,,,,,,,,
28954,2019-04-02,227.0,226.0,1819,20190402Sacramento,235.0,HOU,SAC,8.0,9.0,...,,,,,,,,,,
28955,2019-04-02,227.0,226.0,1819,20190402Sacramento,235.0,HOU,SAC,8.0,9.0,...,,,,,,,,,,
28956,2019-04-02,220.5,220.0,1819,20190402GoldenState,218.0,DEN,GSW,-2.5,-2.0,...,,,,,,,,,,


In [417]:
stat_list = list(d3.iloc[:,5:].columns)

In [435]:
for stat in stat_list:
    d3['{}_rolling'.format(stat)] = d3.rolling(window=5)['{}'.format(stat)].mean().shift(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d3['{}_rolling'.format(stat)] = d3.rolling(window=5)['{}'.format(stat)].mean().shift(1)


In [436]:
d3

Unnamed: 0,team_season_game_num,team_season,Season,team,game_num,eFg,tov,orb,ft_fga,eFg_rolling,tov_rolling,orb_rolling,ft_fga_rolling
0,GSW07081,GSW0708,0708,GSW,1,0.455,17.6,14.9,0.338,,,,
1,GSW07082,GSW0708,0708,GSW,2,0.505,8.7,22.4,0.202,,,,
2,GSW07083,GSW0708,0708,GSW,3,0.589,18.3,29.7,0.215,,,,
3,GSW07084,GSW0708,0708,GSW,4,0.473,9.7,19.1,0.198,,,,
4,GSW07085,GSW0708,0708,GSW,5,0.556,6.4,17.0,0.180,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,GSW070877,GSW0708,0708,GSW,77,0.581,12.8,42.5,0.465,0.4766,12.64,26.40,0.1830
77,GSW070878,GSW0708,0708,GSW,78,0.450,14.0,32.8,0.150,0.4916,13.72,27.92,0.2102
78,GSW070879,GSW0708,0708,GSW,79,0.522,7.8,20.8,0.297,0.4976,14.18,30.48,0.1992
79,GSW070880,GSW0708,0708,GSW,80,0.490,7.4,26.9,0.135,0.5206,13.00,29.94,0.2214


## Scratch Work Graveyard

Processing to merge datasets graveyard

In [None]:
#tried self-merging on game_id, but this is only applicable if in different columns
#df1 = df1.merge(
#            right=df1[opp_pull_cols],
#            left_on=["game_id", "team"],
#            right_on=["game_id", "opp"],
#            suffixes=[None, "_opp"],
#        )

In [None]:
#tried to use drop_duplicates method but can only keep first or last
#df.drop_duplicates(subset=['game_id'], keep='second')

Rolling average graveyard

In [None]:
#found online, talks about rolling average based on multiple columns, never tried
#df.loc[:, 'value_sma_10'] = df.groupby(by='object')[['object', 'period']].rolling(window=10, min_periods=1, on='period').mean().reset_index(level='object')['value']

In [453]:
#found online, got it to work, but not quite applicable to this situation
#span = 5
#sma = d2.rolling(window=span, min_periods=span).mean()[:span]
#rest = d2[span:]
#pd.concat([sma, rest]).ewm(span=span, adjust=False).mean()

In [452]:
#didn't work - ValueError: cannot reindex from a duplicate axis
#d2['eFg_rolling'] = d2.groupby(['team_season','game_num'])['eFg'].rolling(10).mean().droplevel(level=[0])

In [451]:
#df1 = d.copy()
#df1

In [371]:
#df1 = d.groupby(['team_season','game_num']).rolling(5)['eFg'].mean().reset_index(drop=True)

In [450]:
#df1['pace_rolling'] = d.groupby(['team_season','game_num'])[5:,'pace'].transform(lambda x: x.rolling(10, 10).mean())