In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Usable Code

In [2]:
!pwd

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [3]:
# Collect the per-season odds files in chronological (name) order.
# endswith() rather than a substring test, so stray names such as
# '07-08.csv.bak' or '07-08.csv~' are not mistaken for data files.
files = sorted(f for f in os.listdir('csv') if f.endswith('.csv'))

In [4]:
files

['07-08.csv',
 '08-09.csv',
 '09-10.csv',
 '10-11.csv',
 '11-12.csv',
 '12-13.csv',
 '13-14.csv',
 '14-15.csv',
 '15-16.csv',
 '16-17.csv',
 '17-18.csv',
 '18-19.csv']

In [5]:
def clean_odds(file):
    '''
    Clean one season's odds csv and return one row per game with its over/under lines (target).

    The csv is expected to list every game on two consecutive rows (visitor first,
    home second) and to carry the columns 'Date', 'Team', 'Open' and 'Close' within
    its first 13 columns.

    Parameters
    ----------
    file : str or path-like
        Path to the csv. Its base name must start with the two season years as
        'YY-YY' (e.g. '07-08.csv'); they turn the file's MMDD dates into YYYYMMDD.

    Returns
    -------
    pandas.DataFrame
        Columns 'Date' (YYYYMMDD string), 'Home', 'Visitor', 'O/U_open' and
        'O/U_close' (floats), indexed 0..n_games-1.

    Raises
    ------
    ValueError
        If the cleaned file has an odd number of rows, or an Open/Close value
        cannot be parsed as a number.
    '''
    raw = pd.read_csv(file)
    raw = raw.iloc[:, :13]    # some files contain unnamed extra columns
    raw = raw.dropna(axis=0)  # and others contain incomplete extra rows

    if len(raw) % 2:
        raise ValueError(
            f'{file}: expected two rows per game, found {len(raw)} rows after cleaning'
        )

    # each game is spread across 2 consecutive rows: visitor first, then home
    visitor = raw.iloc[0::2].reset_index(drop=True)
    home = raw.iloc[1::2].reset_index(drop=True)

    # make dates include year so we can join with another dataset later.
    # Only the base name is inspected so that a path with directories still works.
    season = os.path.basename(file)
    start_year = '20' + season[:2]
    end_year = '20' + season[3:5]
    # going through int first means a float column never leaves a '.0' suffix
    month_days = visitor['Date'].astype('int').astype('str')
    dates = [
        # 4-digit MMDD (Oct-Dec) belongs to the season's first year;
        # 3-digit MDD (Jan onwards) to the second and needs its leading zero back
        start_year + md if int(md) > 1000 else end_year + '0' + md
        for md in month_days
    ]

    def _as_line(col):
        # 'PK'/'pk' is a pick-em: treat as zero so the column can be compared numerically.
        # '197.5u10' is one unique malformed occurrence in the data.
        return col.replace(['PK', 'pk'], '0').replace('197.5u10', '197.5').astype('float')

    open_v, open_h = _as_line(visitor['Open']), _as_line(home['Open'])
    close_v, close_h = _as_line(visitor['Close']), _as_line(home['Close'])

    # of a game's two rows, the larger number is taken as the over/under line
    # at the opening and at the closing of the sportsbook
    return pd.DataFrame({
        'Date': dates,
        'Home': home['Team'],
        'Visitor': visitor['Team'],
        'O/U_open': np.where(open_v > open_h, open_v, open_h),
        'O/U_close': np.where(close_v > close_h, close_v, close_h),
    })

Use this clean_odds function to generate our dataset to combine with our scraped game data.

In [6]:
cd csv

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv


In [7]:
# Stack every season's cleaned odds into one frame. Each per-season frame is
# indexed from 0, so the concatenated index repeats across seasons (the last
# rows displayed below are numbered 13xx rather than 155xx).
totals = pd.concat([clean_odds(f) for f in files])

In [8]:
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close
0,20071030,SanAntonio,Portland,184.0,189.5
1,20071030,GoldenState,Utah,214.5,212.0
2,20071030,LALakers,Houston,191.0,199.0
3,20071031,Toronto,Philadelphia,190.0,191.0
4,20071031,Indiana,Washington,200.0,203.5
...,...,...,...,...,...
1307,20190602,Toronto,GoldenState,216.0,213.5
1308,20190605,GoldenState,Toronto,214.0,209.5
1309,20190607,GoldenState,Toronto,216.0,215.0
1310,20190610,Toronto,GoldenState,212.0,217.0


In [9]:
totals.reset_index(drop=True,inplace=True)
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close
0,20071030,SanAntonio,Portland,184.0,189.5
1,20071030,GoldenState,Utah,214.5,212.0
2,20071030,LALakers,Houston,191.0,199.0
3,20071031,Toronto,Philadelphia,190.0,191.0
4,20071031,Indiana,Washington,200.0,203.5
...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5
15516,20190605,GoldenState,Toronto,214.0,209.5
15517,20190607,GoldenState,Toronto,216.0,215.0
15518,20190610,Toronto,GoldenState,212.0,217.0


Let's pickle the DataFrame produced by this initial processing of the CSVs. We can always come back to it if we need it.

In [13]:
with open('lines.pickle', 'wb') as to_write:
    pickle.dump(totals, to_write)

In [14]:
pwd

'/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv'

In [15]:
cd ..

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [16]:
with open('game_df.pickle','rb') as read_file:
    game_df = pickle.load(read_file)
    
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.538,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.512,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.485,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.552,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.577,.301,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.493,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.523,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.527,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.542,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4


In [19]:
team_abbrev = game_df['home'].unique()
team_abbrev

array(['SAS', 'GSW', 'LAL', 'IND', 'ORL', 'TOR', 'NJN', 'CLE', 'MEM',
       'NOH', 'DEN', 'MIA', 'UTA', 'SEA', 'CHA', 'ATL', 'BOS', 'MIN',
       'CHI', 'LAC', 'PHO', 'PHI', 'WAS', 'DAL', 'HOU', 'MIL', 'NYK',
       'DET', 'SAC', 'POR', 'OKC', 'BRK', 'NOP', 'CHO'], dtype=object)

In [20]:
len(team_abbrev)

34

In [21]:
totals['Home'].unique()

array(['SanAntonio', 'GoldenState', 'LALakers', 'Toronto', 'Indiana',
       'Orlando', 'NewJersey', 'Cleveland', 'Memphis', 'NewOrleans',
       'Denver', 'Miami', 'Utah', 'Seattle', 'Charlotte', 'Atlanta',
       'Boston', 'Minnesota', 'Chicago', 'Phoenix', 'LAClippers',
       'Philadelphia', 'Washington', 'Milwaukee', 'Houston', 'Dallas',
       'NewYork', 'Detroit', 'Sacramento', 'Portland', 'OklahomaCity',
       'Brooklyn'], dtype=object)

In [22]:
# Maps the 3-letter team abbreviations found in game_df['home'] to the team
# names used in the odds data (totals['Home']), so both frames can share a key.
# game_df has 34 distinct abbreviations but the odds data only 32 names, so the
# mapping is many-to-one in two places (marked below).
team_dict = {
    'SAS': 'SanAntonio',
    'GSW': 'GoldenState',
    'LAL': 'LALakers',
    'TOR': 'Toronto',
    'IND': 'Indiana',
    'ORL': 'Orlando',
    'NJN': 'NewJersey',
    'CLE': 'Cleveland',
    'MEM': 'Memphis',
    'NOH': 'NewOrleans',
    'NOP': 'NewOrleans',  # second abbreviation for the same odds-data name
    'DEN': 'Denver',
    'MIA': 'Miami',
    'UTA': 'Utah',
    'SEA': 'Seattle',
    'CHA': 'Charlotte',
    'CHO': 'Charlotte',  # second abbreviation for the same odds-data name
    'ATL': 'Atlanta',
    'BOS': 'Boston',
    'MIN': 'Minnesota',
    'CHI': 'Chicago',
    'PHO': 'Phoenix',
    'LAC': 'LAClippers',
    'PHI': 'Philadelphia',
    'WAS': 'Washington',
    'MIL': 'Milwaukee',
    'HOU': 'Houston',
    'DAL': 'Dallas',
    'NYK': 'NewYork',
    'DET': 'Detroit',
    'SAC': 'Sacramento',
    'POR': 'Portland',
    'OKC': 'OklahomaCity',
    'BRK': 'Brooklyn',
}

In [23]:
game_df['home'].map(team_dict)

0         SanAntonio
1        GoldenState
2           LALakers
3            Indiana
4            Orlando
            ...     
15516        Toronto
15517    GoldenState
15518    GoldenState
15519        Toronto
15520    GoldenState
Name: home, Length: 15521, dtype: object

In [24]:
game_df['home_team'] = game_df['home'].map(team_dict)

In [25]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.276,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.299,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.105,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.315,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.301,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.404,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.396,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.346,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.376,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto


In [26]:
# The first 8 characters of game_id are the game's date as YYYYMMDD
# (e.g. '200710300SAS' -> '20071030'); slice them with the .str accessor.
game_df['game_date'] = game_df['game_id'].str[:8]

In [27]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.299,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.494,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.592,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.391,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.479,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.277,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.330,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.269,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.318,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610


In [28]:
game_df['id'] = game_df['game_date'] + game_df['home_team']

In [29]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date,id
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,77.8,50.0,51.2,8.7,6.2,100.0,105.7,SanAntonio,20071030,20071030SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,65.2,39.8,59.4,7.6,11.4,100.0,111.3,GoldenState,20071030,20071030GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,68.4,43.0,56.3,17.2,5.8,100.0,102.1,LALakers,20071030,20071030LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,64.6,50.0,52.6,11.0,7.6,100.0,101.2,Indiana,20071031,20071031Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,64.7,46.7,46.9,2.3,12.9,100.0,94.2,Orlando,20071031,20071031Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,85.0,53.8,48.6,8.0,4.2,100.0,109.0,Toronto,20190602,20190602Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,84.8,50.6,69.4,8.0,6.8,100.0,123.5,GoldenState,20190605,20190605GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,82.9,51.9,74.3,6.3,11.1,100.0,110.7,GoldenState,20190607,20190607GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,83.3,53.8,50.0,6.4,12.5,100.0,112.4,Toronto,20190610,20190610Toronto


In [30]:
totals['id'] = totals['Date'] + totals['Home']

In [31]:
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,id
0,20071030,SanAntonio,Portland,184.0,189.5,20071030SanAntonio
1,20071030,GoldenState,Utah,214.5,212.0,20071030GoldenState
2,20071030,LALakers,Houston,191.0,199.0,20071030LALakers
3,20071031,Toronto,Philadelphia,190.0,191.0,20071031Toronto
4,20071031,Indiana,Washington,200.0,203.5,20071031Indiana
...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,20190602Toronto
15516,20190605,GoldenState,Toronto,214.0,209.5,20190605GoldenState
15517,20190607,GoldenState,Toronto,216.0,215.0,20190607GoldenState
15518,20190610,Toronto,GoldenState,212.0,217.0,20190610Toronto


In [32]:
totals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15520 entries, 0 to 15519
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       15520 non-null  object 
 1   Home       15520 non-null  object 
 2   Visitor    15520 non-null  object 
 3   O/U_open   15520 non-null  float64
 4   O/U_close  15520 non-null  float64
 5   id         15520 non-null  object 
dtypes: float64(2), object(4)
memory usage: 727.6+ KB


In [33]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15521 entries, 0 to 15520
Data columns (total 71 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       15521 non-null  object
 1   total         15521 non-null  int64 
 2   away          15521 non-null  object
 3   home          15521 non-null  object
 4   pace_v        15521 non-null  object
 5   pace_h        15521 non-null  object
 6   eFg_v         15521 non-null  object
 7   eFg_h         15521 non-null  object
 8   tov_v         15521 non-null  object
 9   tov_h         15521 non-null  object
 10  orb_v         15521 non-null  object
 11  orb_h         15521 non-null  object
 12  ft_fga_v      15521 non-null  object
 13  ft_fga_h      15521 non-null  object
 14  ortg_v        15521 non-null  object
 15  ortg_h        15521 non-null  object
 16  fg_v          15521 non-null  object
 17  fga_v         15521 non-null  object
 18  fg_per_v      15521 non-null  object
 19  thre

Now both DataFrames have a column to merge on: the 'id' column.

In [None]:
# pd.merge defaults to an inner join, so a row whose 'id' appears in only one
# of the two frames is silently dropped. The info() outputs above report 15520
# rows in totals and 15521 in game_df, so the merged frame may be smaller than
# either input -- compare df.shape against both before relying on it.
df = pd.merge(totals,game_df,on='id')

In [None]:
df

In [None]:
df.shape

In [None]:
df.drop(columns=['Home','Visitor','game_id','home_team','game_date'],inplace=True)

In [None]:
df

In [None]:
df.shape

In [None]:
with open('df.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

## Scratch Work

In [None]:
df = pd.read_csv(files[2])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df = df.iloc[:,:13]

In [None]:
df.info()

In [None]:
pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 101)

In [None]:
df.shape[0]

In [None]:
mylist = []
for i in range(1, int(df.shape[0]/2 + 1)):
    mylist.append(i)
    mylist.append(i)
print(mylist[0],mylist[-1])

In [None]:
len(range(1,int(df.shape[0]/2 + 1)))

In [None]:
df['game_id'] = mylist

In [None]:
df.dropna(axis=0,inplace=True)

In [None]:
df.info()

In [None]:
df = df.merge(df, on='game_id', suffixes=(None,'_2'))

In [None]:
df

In [None]:
df = df[1::4]
df.reset_index(drop=True, inplace=True)

In [None]:
df

In [None]:
#sanity check that every game only appears once
df['game_id']

In [None]:
df['Date'] = df['Date'].astype('int')
df['Date']

In [None]:
df['Date'] = df['Date'].astype('str')
df['Date']

In [None]:
# str.strip('.0') treats its argument as a set of characters and removes any
# leading/trailing '.' or '0', so '1030' would become '103'. Remove only a
# literal trailing '.0' instead.
df['Date'] = df['Date'].str.replace(r'\.0$', '', regex=True)

In [None]:
# call head() -- without parentheses the cell only shows the bound method
df['Date'].head()

In [None]:
dates = []
for date in df['Date']:
    if int(date) > 1000:
        date = '20'+files[2][:2]+date
        dates.append(date)
    else:
        date = '20'+files[2][3:5]+'0'+date
        dates.append(date)
df['Date'] = dates

In [None]:
df['Date']

In [None]:
df

In [None]:
df['Visitor'] = df['Team']
df['Home'] = df['Team_2']
df

In [None]:
# Take out pick-em's and replace with zero for later transformation.
# Both spellings are handled, matching clean_odds. The result is assigned back
# to the frame because an in-place replace on a selected column
# (df['Open'].replace(..., inplace=True)) acts on a temporary copy under
# pandas copy-on-write and would leave df unchanged.
line_cols = ['Open','Close','Open_2','Close_2']
df[line_cols] = df[line_cols].replace(['PK','pk'],'0')
df.replace('197.5u10','197.5',inplace=True) #one unique occurrence

In [None]:
mask = df['Open'].values == 'pk'
df[mask]

In [None]:
df['Open'] = df['Open'].astype('float')
df['Open_2'] = df['Open_2'].astype('float')
df['Close'] = df['Close'].astype('float')
df['Close_2'] = df['Close_2'].astype('float')

In [None]:
df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
df

In [None]:
df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
df

Sanity check that opening and closing lines operations worked successfully.
So long as numbers are around 200, we know we are ok.

In [None]:
df['O/U_open'].min()

In [None]:
df['O/U_close'].min()

In [None]:
df = df[['Date','Home','Visitor','O/U_open','O/U_close']]
df

### Processing our lines df

In [34]:
lines = totals

### Stat Building

We have the same set of stats for both teams in any one game, so we can build offense/defense for both teams.

In [None]:
'''map stats accordingly:
1. Get 2 "sets" of stats per game:

visitor: offense - _v stat avgs heading into the game; defense - _h stat avgs heading into the game
home: offense - _h stat avgs heading into the game; defense - _v stat avgs heading into the game

2. Map visitor/home stats to respective teams

3. Build dictionary of team's season to be further processed.

{GSW: {..game_35:{offense/defense stats},game_36:{..}

4. Process dictionary to have more stats/potential model features:

{GSW: {..game_35:{offense/defense stats averaged through 34 games},game_36:{..}

5. ?

'''



## Scratch Work Graveyard

In [None]:
#tried self-merging on game_id, but this is only applicable if in different columns
#df1 = df1.merge(
#            right=df1[opp_pull_cols],
#            left_on=["game_id", "team"],
#            right_on=["game_id", "opp"],
#            suffixes=[None, "_opp"],
#        )

In [None]:
#tried to use drop_duplicates method but can only keep first or last
#df.drop_duplicates(subset=['game_id'], keep='second')