In [21]:
import os
import glob
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

# Usable Code

In [3]:
!pwd

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [4]:
files = sorted([f for f in os.listdir('csv') if '.csv' in f])

In [5]:
files

['07-08.csv',
 '08-09.csv',
 '09-10.csv',
 '10-11.csv',
 '11-12.csv',
 '12-13.csv',
 '13-14.csv',
 '14-15.csv',
 '15-16.csv',
 '16-17.csv',
 '17-18.csv',
 '18-19.csv']

In [7]:
def clean_odds(file):
    '''
    cleans csv file and returns dataframe object that contains the over/under lines (target)
    '''
    df = pd.read_csv(file)
    df = df.iloc[:,:13] #some files contained unnamed extra rows
    df.dropna(axis=0,inplace=True) #and others columns
    
    #makes a list of game_id's equivalent to how many games were played that season
    mylist = []
    for i in range(1, int(df.shape[0]/2 + 1)):
        mylist.append(i)
        mylist.append(i)
    df['game_id'] = mylist #maps game_id's to games (spread across 2 rows)
    
    df = df.merge(df, on='game_id', suffixes=(None,'_2')) #gets games to be in one row (creates 4/game)
    df = df[1::4] #gets correct mapping
    df.reset_index(drop=True, inplace=True)
    
    #make dates include year so we can join with another dataset later
    df['Date'] = df['Date'].astype('int')
    df['Date'] = df['Date'].astype('str')
    #df['Date'] = df['Date'].str.strip('.0')
    dates = []
    for date in df['Date']:
        if int(date) > 1000:
            date = '20'+file[:2]+date #GET FILENAMES AND USE HERE
            dates.append(date)
        else:
            date = '20'+file[3:5]+'0'+date #GET FILENAMES AND USE HERE
            dates.append(date)
    df['Date'] = dates
    
    #take out pick-em's and replace with zero for later transformation
    df['Open'].replace(['PK','pk'],'0',inplace=True)
    df['Close'].replace(['PK','pk'],'0',inplace=True)
    df['Open_2'].replace(['PK','pk'],'0',inplace=True)
    df['Close_2'].replace(['PK','pk'],'0',inplace=True)
    df.replace('197.5u10','197.5',inplace=True)
    #map strings as floats for comparison
    df['Open'] = df['Open'].astype('float')
    df['Open_2'] = df['Open_2'].astype('float')
    df['Close'] = df['Close'].astype('float')
    df['Close_2'] = df['Close_2'].astype('float')
    #get correct over/under line at opening and closing of sportsbook
    df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
    df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
    
    #rename columns to correct home/visitor 
    df['Visitor'] = df['Team']
    df['Home'] = df['Team_2']
    
    #drop unneeded info
    df = df[['Date','Home','Visitor','O/U_open','O/U_close']]
    return df

Use this clean_odds function to generate our dataset to combine with our scraped game data.

In [8]:
totals = pd.concat([clean_odds(f) for f in files])

In [15]:
totals[:1316]

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close
0,20071030,SanAntonio,Portland,184.0,189.5
1,20071030,GoldenState,Utah,214.5,212.0
2,20071030,LALakers,Houston,191.0,199.0
3,20071031,Toronto,Philadelphia,190.0,191.0
4,20071031,Indiana,Washington,200.0,203.5
...,...,...,...,...,...
1311,20080608,Boston,LALakers,190.0,192.0
1312,20080610,LALakers,Boston,195.5,195.5
1313,20080612,LALakers,Boston,192.0,191.5
1314,20080615,LALakers,Boston,190.5,193.0


In [64]:
totals.reset_index(drop=True,inplace=True)
totals.drop(columns=['level_0','index'], inplace=True)
totals

KeyError: "['level_0' 'index'] not found in axis"

In [85]:
with open('lines.pickle', 'wb') as to_write:
    pickle.dump(totals, to_write)

In [18]:
pwd

'/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor/csv'

In [19]:
cd ..

/Users/jseemayer/Documents/Me/Metis/NBA-Over-Under-Predictor


In [22]:
with open('game_df.pickle','rb') as read_file:
    game_df = pickle.load(read_file)
    
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.567,.167,.218,72.7,50.0,38.5,1.1,6.3,100.0,115.5
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.553,.122,.400,85.1,60.2,58.5,8.6,13.0,100.0,91.3
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.542,.297,.419,77.1,57.0,67.6,10.7,7.4,100.0,99.9
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.463,.202,.455,70.2,50.0,41.7,5.5,9.5,100.0,109.5
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.429,.195,.253,76.9,53.3,48.5,4.5,7.8,100.0,115.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.592,.415,.280,70.6,46.2,89.5,7.0,8.9,100.0,104.0
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.674,.463,.256,72.9,49.4,69.8,9.0,18.2,100.0,109.5
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.544,.372,.279,80.0,48.1,61.1,12.7,7.8,100.0,97.0
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.601,.512,.171,70.5,46.3,71.1,5.3,13.2,100.0,111.3


In [29]:
team_abbrev = game_df['home'].unique()
team_abbrev

array(['SAS', 'GSW', 'LAL', 'IND', 'ORL', 'TOR', 'NJN', 'CLE', 'MEM',
       'NOH', 'DEN', 'MIA', 'UTA', 'SEA', 'CHA', 'ATL', 'BOS', 'MIN',
       'CHI', 'LAC', 'PHO', 'PHI', 'WAS', 'DAL', 'HOU', 'MIL', 'NYK',
       'DET', 'SAC', 'POR', 'OKC', 'BRK', 'NOP', 'CHO'], dtype=object)

In [30]:
len(team_abbrev)

34

In [33]:
totals['Home'].unique()

array(['SanAntonio', 'GoldenState', 'LALakers', 'Toronto', 'Indiana',
       'Orlando', 'NewJersey', 'Cleveland', 'Memphis', 'NewOrleans',
       'Denver', 'Miami', 'Utah', 'Seattle', 'Charlotte', 'Atlanta',
       'Boston', 'Minnesota', 'Chicago', 'Phoenix', 'LAClippers',
       'Philadelphia', 'Washington', 'Milwaukee', 'Houston', 'Dallas',
       'NewYork', 'Detroit', 'Sacramento', 'Portland', 'OklahomaCity',
       'Brooklyn'], dtype=object)

In [58]:
team_dict = {
    'SAS': 'SanAntonio',
    'GSW': 'GoldenState',
    'LAL': 'LALakers',
    'TOR': 'Toronto',
    'IND': 'Indiana',
    'ORL': 'Orlando',
    'NJN': 'NewJersey',
    'CLE': 'Cleveland',
    'MEM': 'Memphis',
    'NOH': 'NewOrleans',
    'NOP': 'NewOrleans',
    'DEN': 'Denver',
    'MIA': 'Miami',
    'UTA': 'Utah',
    'SEA': 'Seattle',
    'CHA': 'Charlotte',
    'CHO': 'Charlotte',
    'ATL': 'Atlanta',
    'BOS': 'Boston',
    'MIN': 'Minnesota',
    'CHI': 'Chicago',
    'PHO': 'Phoenix',
    'LAC': 'LAClippers',
    'PHI': 'Philadelphia',
    'WAS': 'Washington',
    'MIL': 'Milwaukee',
    'HOU': 'Houston',
    'DAL': 'Dallas',
    'NYK': 'NewYork',
    'DET': 'Detroit',
    'SAC': 'Sacramento',
    'POR': 'Portland',
    'OKC': 'OklahomaCity',
    'BRK': 'Brooklyn',
}

In [59]:
game_df['home'].map(team_dict)

0         SanAntonio
1        GoldenState
2           LALakers
3            Indiana
4            Orlando
            ...     
15516        Toronto
15517    GoldenState
15518    GoldenState
15519        Toronto
15520    GoldenState
Name: home, Length: 15521, dtype: object

In [60]:
game_df['home_team'] = game_df['home'].map(team_dict)

In [61]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.167,.218,72.7,50.0,38.5,1.1,6.3,100.0,115.5,SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.122,.400,85.1,60.2,58.5,8.6,13.0,100.0,91.3,GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.297,.419,77.1,57.0,67.6,10.7,7.4,100.0,99.9,LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.202,.455,70.2,50.0,41.7,5.5,9.5,100.0,109.5,Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.195,.253,76.9,53.3,48.5,4.5,7.8,100.0,115.7,Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.415,.280,70.6,46.2,89.5,7.0,8.9,100.0,104.0,Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.463,.256,72.9,49.4,69.8,9.0,18.2,100.0,109.5,GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.372,.279,80.0,48.1,61.1,12.7,7.8,100.0,97.0,GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.512,.171,70.5,46.3,71.1,5.3,13.2,100.0,111.3,Toronto


In [71]:
game_df['game_date'] = game_df['game_id'].apply(lambda x: x[:8])

In [72]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,.218,72.7,50.0,38.5,1.1,6.3,100.0,115.5,SanAntonio,20071030
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,.400,85.1,60.2,58.5,8.6,13.0,100.0,91.3,GoldenState,20071030
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,.419,77.1,57.0,67.6,10.7,7.4,100.0,99.9,LALakers,20071030
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,.455,70.2,50.0,41.7,5.5,9.5,100.0,109.5,Indiana,20071031
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,.253,76.9,53.3,48.5,4.5,7.8,100.0,115.7,Orlando,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,.280,70.6,46.2,89.5,7.0,8.9,100.0,104.0,Toronto,20190602
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,.256,72.9,49.4,69.8,9.0,18.2,100.0,109.5,GoldenState,20190605
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,.279,80.0,48.1,61.1,12.7,7.8,100.0,97.0,GoldenState,20190607
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,.171,70.5,46.3,71.1,5.3,13.2,100.0,111.3,Toronto,20190610


In [74]:
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,team_abbrev
0,20071030,SanAntonio,Portland,184.0,189.5,SAS
1,20071030,GoldenState,Utah,214.5,212.0,GSW
2,20071030,LALakers,Houston,191.0,199.0,LAL
3,20071031,Toronto,Philadelphia,190.0,191.0,TOR
4,20071031,Indiana,Washington,200.0,203.5,IND
...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,TOR
15516,20190605,GoldenState,Toronto,214.0,209.5,GSW
15517,20190607,GoldenState,Toronto,216.0,215.0,GSW
15518,20190610,Toronto,GoldenState,212.0,217.0,TOR


In [76]:
game_df['id'] = game_df['game_date'] + game_df['home_team']

In [77]:
game_df

Unnamed: 0,game_id,total,away,home,pace_v,pace_h,eFg_v,eFg_h,tov_v,tov_h,...,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date,id
0,200710300SAS,203,POR,SAS,91.8,91.8,.538,.506,15.8,7.5,...,72.7,50.0,38.5,1.1,6.3,100.0,115.5,SanAntonio,20071030,20071030SanAntonio
1,200710300GSW,213,UTA,GSW,105.1,105.1,.483,.455,15.2,17.6,...,85.1,60.2,58.5,8.6,13.0,100.0,91.3,GoldenState,20071030,20071030GoldenState
2,200710300LAL,188,HOU,LAL,93.0,93.0,.500,.434,17.0,11.1,...,77.1,57.0,67.6,10.7,7.4,100.0,99.9,LALakers,20071030,20071030LALakers
3,200710310IND,229,WAS,IND,98.4,98.4,.394,.484,13.2,13.6,...,70.2,50.0,41.7,5.5,9.5,100.0,109.5,Indiana,20071031,20071031Indiana
4,200710310ORL,185,MIL,ORL,88.1,88.1,.402,.521,12.6,10.2,...,76.9,53.3,48.5,4.5,7.8,100.0,115.7,Orlando,20071031,20071031Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15516,201906020TOR,213,GSW,TOR,100.0,100.0,.543,.431,14.0,12.5,...,70.6,46.2,89.5,7.0,8.9,100.0,104.0,Toronto,20190602,20190602Toronto
15517,201906050GSW,232,TOR,GSW,99.6,99.6,.628,.462,13.3,11.8,...,72.9,49.4,69.8,9.0,18.2,100.0,109.5,GoldenState,20190605,20190605GoldenState
15518,201906070GSW,197,TOR,GSW,94.8,94.8,.477,.500,8.5,16.3,...,80.0,48.1,61.1,12.7,7.8,100.0,97.0,GoldenState,20190607,20190607GoldenState
15519,201906100TOR,211,GSW,TOR,94.3,94.3,.585,.494,14.5,11.8,...,70.5,46.3,71.1,5.3,13.2,100.0,111.3,Toronto,20190610,20190610Toronto


In [78]:
totals['id'] = totals['Date'] + totals['Home']

In [79]:
totals

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,team_abbrev,id
0,20071030,SanAntonio,Portland,184.0,189.5,SAS,20071030SanAntonio
1,20071030,GoldenState,Utah,214.5,212.0,GSW,20071030GoldenState
2,20071030,LALakers,Houston,191.0,199.0,LAL,20071030LALakers
3,20071031,Toronto,Philadelphia,190.0,191.0,TOR,20071031Toronto
4,20071031,Indiana,Washington,200.0,203.5,IND,20071031Indiana
...,...,...,...,...,...,...,...
15515,20190602,Toronto,GoldenState,216.0,213.5,TOR,20190602Toronto
15516,20190605,GoldenState,Toronto,214.0,209.5,GSW,20190605GoldenState
15517,20190607,GoldenState,Toronto,216.0,215.0,GSW,20190607GoldenState
15518,20190610,Toronto,GoldenState,212.0,217.0,TOR,20190610Toronto


In [80]:
totals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15520 entries, 0 to 15519
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         15520 non-null  object 
 1   Home         15520 non-null  object 
 2   Visitor      15520 non-null  object 
 3   O/U_open     15520 non-null  float64
 4   O/U_close    15520 non-null  float64
 5   team_abbrev  15520 non-null  object 
 6   id           15520 non-null  object 
dtypes: float64(2), object(5)
memory usage: 848.9+ KB


In [81]:
game_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15521 entries, 0 to 15520
Data columns (total 71 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   game_id       15521 non-null  object
 1   total         15521 non-null  int64 
 2   away          15521 non-null  object
 3   home          15521 non-null  object
 4   pace_v        15521 non-null  object
 5   pace_h        15521 non-null  object
 6   eFg_v         15521 non-null  object
 7   eFg_h         15521 non-null  object
 8   tov_v         15521 non-null  object
 9   tov_h         15521 non-null  object
 10  orb_v         15521 non-null  object
 11  orb_h         15521 non-null  object
 12  ft_fga_v      15521 non-null  object
 13  ft_fga_h      15521 non-null  object
 14  ortg_v        15521 non-null  object
 15  ortg_h        15521 non-null  object
 16  fg_v          15521 non-null  object
 17  fga_v         15521 non-null  object
 18  fg_per_v      15521 non-null  object
 19  thre

In [82]:
df = pd.merge(totals,game_df,on='id')

In [83]:
df

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close,team_abbrev,id,game_id,total,away,...,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h,home_team,game_date
0,20071030,SanAntonio,Portland,184.0,189.5,SAS,20071030SanAntonio,200710300SAS,203,POR,...,.218,72.7,50.0,38.5,1.1,6.3,100.0,115.5,SanAntonio,20071030
1,20071030,GoldenState,Utah,214.5,212.0,GSW,20071030GoldenState,200710300GSW,213,UTA,...,.400,85.1,60.2,58.5,8.6,13.0,100.0,91.3,GoldenState,20071030
2,20071030,LALakers,Houston,191.0,199.0,LAL,20071030LALakers,200710300LAL,188,HOU,...,.419,77.1,57.0,67.6,10.7,7.4,100.0,99.9,LALakers,20071030
3,20071031,Toronto,Philadelphia,190.0,191.0,TOR,20071031Toronto,200710310TOR,203,PHI,...,.280,75.6,55.4,57.9,3.3,13.0,100.0,116.7,Toronto,20071031
4,20071031,Indiana,Washington,200.0,203.5,IND,20071031Indiana,200710310IND,229,WAS,...,.455,70.2,50.0,41.7,5.5,9.5,100.0,109.5,Indiana,20071031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,Toronto,GoldenState,216.0,213.5,TOR,20190602Toronto,201906020TOR,213,GSW,...,.280,70.6,46.2,89.5,7.0,8.9,100.0,104.0,Toronto,20190602
15011,20190605,GoldenState,Toronto,214.0,209.5,GSW,20190605GoldenState,201906050GSW,232,TOR,...,.256,72.9,49.4,69.8,9.0,18.2,100.0,109.5,GoldenState,20190605
15012,20190607,GoldenState,Toronto,216.0,215.0,GSW,20190607GoldenState,201906070GSW,197,TOR,...,.279,80.0,48.1,61.1,12.7,7.8,100.0,97.0,GoldenState,20190607
15013,20190610,Toronto,GoldenState,212.0,217.0,TOR,20190610Toronto,201906100TOR,211,GSW,...,.171,70.5,46.3,71.1,5.3,13.2,100.0,111.3,Toronto,20190610


In [84]:
df.shape

(15015, 77)

In [86]:
df.drop(columns=['Home','Visitor','game_id','home_team','game_date'],inplace=True)

In [87]:
df

Unnamed: 0,Date,O/U_open,O/U_close,team_abbrev,id,total,away,home,pace_v,pace_h,...,ts_per_h,threes_ar_h,ft_ar_h,drb_per_h,trb_per_h,ast_per_h,stl_per_h,blk_per_h,user_per_h,drtg_h
0,20071030,184.0,189.5,SAS,20071030SanAntonio,203,POR,SAS,91.8,91.8,...,.567,.167,.218,72.7,50.0,38.5,1.1,6.3,100.0,115.5
1,20071030,214.5,212.0,GSW,20071030GoldenState,213,UTA,GSW,105.1,105.1,...,.553,.122,.400,85.1,60.2,58.5,8.6,13.0,100.0,91.3
2,20071030,191.0,199.0,LAL,20071030LALakers,188,HOU,LAL,93.0,93.0,...,.542,.297,.419,77.1,57.0,67.6,10.7,7.4,100.0,99.9
3,20071031,190.0,191.0,TOR,20071031Toronto,203,PHI,TOR,90.9,90.9,...,.526,.207,.280,75.6,55.4,57.9,3.3,13.0,100.0,116.7
4,20071031,200.0,203.5,IND,20071031Indiana,229,WAS,IND,98.4,98.4,...,.463,.202,.455,70.2,50.0,41.7,5.5,9.5,100.0,109.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15010,20190602,216.0,213.5,TOR,20190602Toronto,213,GSW,TOR,100.0,100.0,...,.592,.415,.280,70.6,46.2,89.5,7.0,8.9,100.0,104.0
15011,20190605,214.0,209.5,GSW,20190605GoldenState,232,TOR,GSW,99.6,99.6,...,.674,.463,.256,72.9,49.4,69.8,9.0,18.2,100.0,109.5
15012,20190607,216.0,215.0,GSW,20190607GoldenState,197,TOR,GSW,94.8,94.8,...,.544,.372,.279,80.0,48.1,61.1,12.7,7.8,100.0,97.0
15013,20190610,212.0,217.0,TOR,20190610Toronto,211,GSW,TOR,94.3,94.3,...,.601,.512,.171,70.5,46.3,71.1,5.3,13.2,100.0,111.3


In [88]:
df.shape

(15015, 72)

In [89]:
with open('df.pickle', 'wb') as to_write:
    pickle.dump(df, to_write)

## Scratch Work

In [195]:
df = pd.read_csv(files[2])

In [196]:
df.head()

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19
0,1027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,,,,,,,
1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5.0,-200.0,4.5,,,,,,,
2,1027,503,V,Washington,21,35,21,25,102,205.0,210.0,320.0,107.0,,,,,,,
3,1027,504,H,Dallas,21,29,24,17,91,8.5,8.5,-400.0,8.0,,,,,,,
4,1027,505,V,Houston,23,18,15,31,87,187.5,188.0,400.0,93.5,,,,,,,


In [197]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2624 entries, 0 to 2623
Data columns (total 20 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         2624 non-null   int64  
 1   Rot          2624 non-null   int64  
 2   VH           2624 non-null   object 
 3   Team         2624 non-null   object 
 4   1st          2624 non-null   int64  
 5   2nd          2624 non-null   int64  
 6   3rd          2624 non-null   int64  
 7   4th          2624 non-null   int64  
 8   Final        2624 non-null   int64  
 9   Open         2624 non-null   object 
 10  Close        2624 non-null   object 
 11  ML           2624 non-null   float64
 12  2H           2624 non-null   object 
 13  Unnamed: 13  0 non-null      float64
 14  Unnamed: 14  0 non-null      float64
 15  Unnamed: 15  0 non-null      float64
 16  Unnamed: 16  0 non-null      float64
 17  Unnamed: 17  0 non-null      float64
 18  Unnamed: 18  0 non-null      float64
 19  Unname

In [198]:
df = df.iloc[:,:13]

In [199]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2624 entries, 0 to 2623
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    2624 non-null   int64  
 1   Rot     2624 non-null   int64  
 2   VH      2624 non-null   object 
 3   Team    2624 non-null   object 
 4   1st     2624 non-null   int64  
 5   2nd     2624 non-null   int64  
 6   3rd     2624 non-null   int64  
 7   4th     2624 non-null   int64  
 8   Final   2624 non-null   int64  
 9   Open    2624 non-null   object 
 10  Close   2624 non-null   object 
 11  ML      2624 non-null   float64
 12  2H      2624 non-null   object 
dtypes: float64(1), int64(7), object(5)
memory usage: 266.6+ KB


In [200]:
pd.set_option("display.max_columns", 101)
pd.set_option("display.max_rows", 101)

In [201]:
df.shape[0]

2624

In [202]:
mylist = []
for i in range(1, int(df.shape[0]/2 + 1)):
    mylist.append(i)
    mylist.append(i)
print(mylist[0],mylist[-1])

1 1312


In [203]:
len(range(1,int(df.shape[0]/2 + 1)))

1312

In [204]:
df['game_id'] = mylist

In [205]:
df.dropna(axis=0,inplace=True)

In [206]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2624 entries, 0 to 2623
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Date     2624 non-null   int64  
 1   Rot      2624 non-null   int64  
 2   VH       2624 non-null   object 
 3   Team     2624 non-null   object 
 4   1st      2624 non-null   int64  
 5   2nd      2624 non-null   int64  
 6   3rd      2624 non-null   int64  
 7   4th      2624 non-null   int64  
 8   Final    2624 non-null   int64  
 9   Open     2624 non-null   object 
 10  Close    2624 non-null   object 
 11  ML       2624 non-null   float64
 12  2H       2624 non-null   object 
 13  game_id  2624 non-null   int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 307.5+ KB


In [207]:
df = df.merge(df, on='game_id', suffixes=(None,'_2'))

In [208]:
df

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2
0,1027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5
1,1027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5
2,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5,1,1027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5
3,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5
4,1027,503,V,Washington,21,35,21,25,102,205,210,320.0,107,2,1027,503,V,Washington,21,35,21,25,102,205,210,320.0,107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5243,615,712,H,LALakers,28,23,25,13,89,6.5,6,-250.0,95.5,1311,615,712,H,LALakers,28,23,25,13,89,6.5,6,-250.0,95.5
5244,617,713,V,Boston,23,17,17,22,79,188,186.5,230.0,92.5,1312,617,713,V,Boston,23,17,17,22,79,188,186.5,230.0,92.5
5245,617,713,V,Boston,23,17,17,22,79,188,186.5,230.0,92.5,1312,617,714,H,LALakers,14,20,19,30,83,6.5,6.5,-270.0,7
5246,617,714,H,LALakers,14,20,19,30,83,6.5,6.5,-270.0,7,1312,617,713,V,Boston,23,17,17,22,79,188,186.5,230.0,92.5


In [209]:
df = df[1::4]
df.reset_index(drop=True, inplace=True)

In [210]:
df

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2
0,1027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5
1,1027,503,V,Washington,21,35,21,25,102,205,210,320.0,107,2,1027,504,H,Dallas,21,29,24,17,91,8.5,8.5,-400.0,8
2,1027,505,V,Houston,23,18,15,31,87,187.5,188,400.0,93.5,3,1027,506,H,Portland,23,31,21,21,96,10,9,-500.0,1.5
3,1027,507,V,LAClippers,22,27,26,17,92,208,202,625.0,102,4,1027,508,H,LALakers,32,27,17,23,99,10,12,-850.0,3.5
4,1028,701,V,Philadelphia,27,20,22,37,106,194,194.5,450.0,0.5,5,1028,702,H,Orlando,29,41,30,20,120,8,10,-600.0,96.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,608,705,V,LALakers,26,26,15,24,91,192.5,191,125.0,96,1308,608,706,H,Boston,17,23,21,23,84,2,2.5,-145.0,4
1308,610,707,V,LALakers,16,29,17,27,89,191,190.5,160.0,95.5,1309,610,708,H,Boston,19,23,18,36,96,3,4,-180.0,4
1309,613,709,V,LALakers,20,19,26,21,86,189,189.5,110.0,1.5,1310,613,710,H,Boston,22,23,28,19,92,3.5,2,-130.0,95.5
1310,615,711,V,Boston,18,13,20,16,67,187,190.5,210.0,2.5,1311,615,712,H,LALakers,28,23,25,13,89,6.5,6,-250.0,95.5


In [211]:
#sanity check that every game only appears once
df['game_id']

0          1
1          2
2          3
3          4
4          5
        ... 
1307    1308
1308    1309
1309    1310
1310    1311
1311    1312
Name: game_id, Length: 1312, dtype: int64

In [212]:
df['Date'] = df['Date'].astype('int')
df['Date']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = df['Date'].astype('int')


0       1027
1       1027
2       1027
3       1027
4       1028
        ... 
1307     608
1308     610
1309     613
1310     615
1311     617
Name: Date, Length: 1312, dtype: int64

In [213]:
df['Date'] = df['Date'].astype('str')
df['Date']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = df['Date'].astype('str')


0       1027
1       1027
2       1027
3       1027
4       1028
        ... 
1307     608
1308     610
1309     613
1310     615
1311     617
Name: Date, Length: 1312, dtype: object

In [214]:
df['Date'] = df['Date'].str.strip('.0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = df['Date'].str.strip('.0')


In [215]:
df['Date'].head

<bound method NDFrame.head of 0       1027
1       1027
2       1027
3       1027
4       1028
        ... 
1307     608
1308      61
1309     613
1310     615
1311     617
Name: Date, Length: 1312, dtype: object>

In [216]:
dates = []
for date in df['Date']:
    if int(date) > 1000:
        date = '20'+files[2][:2]+date
        dates.append(date)
    else:
        date = '20'+files[2][3:5]+'0'+date
        dates.append(date)
df['Date'] = dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = dates


In [217]:
df['Date']

0       20091027
1       20091027
2       20091027
3       20091027
4       20091028
          ...   
1307    20100608
1308     2010061
1309    20100613
1310    20100615
1311    20100617
Name: Date, Length: 1312, dtype: object

In [218]:
df

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2
0,20091027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5
1,20091027,503,V,Washington,21,35,21,25,102,205,210,320.0,107,2,1027,504,H,Dallas,21,29,24,17,91,8.5,8.5,-400.0,8
2,20091027,505,V,Houston,23,18,15,31,87,187.5,188,400.0,93.5,3,1027,506,H,Portland,23,31,21,21,96,10,9,-500.0,1.5
3,20091027,507,V,LAClippers,22,27,26,17,92,208,202,625.0,102,4,1027,508,H,LALakers,32,27,17,23,99,10,12,-850.0,3.5
4,20091028,701,V,Philadelphia,27,20,22,37,106,194,194.5,450.0,0.5,5,1028,702,H,Orlando,29,41,30,20,120,8,10,-600.0,96.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,20100608,705,V,LALakers,26,26,15,24,91,192.5,191,125.0,96,1308,608,706,H,Boston,17,23,21,23,84,2,2.5,-145.0,4
1308,2010061,707,V,LALakers,16,29,17,27,89,191,190.5,160.0,95.5,1309,610,708,H,Boston,19,23,18,36,96,3,4,-180.0,4
1309,20100613,709,V,LALakers,20,19,26,21,86,189,189.5,110.0,1.5,1310,613,710,H,Boston,22,23,28,19,92,3.5,2,-130.0,95.5
1310,20100615,711,V,Boston,18,13,20,16,67,187,190.5,210.0,2.5,1311,615,712,H,LALakers,28,23,25,13,89,6.5,6,-250.0,95.5


In [219]:
df['Visitor'] = df['Team']
df['Home'] = df['Team_2']
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Visitor'] = df['Team']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Home'] = df['Team_2']


Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2,Visitor,Home
0,20091027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5,-200.0,4.5,Boston,Cleveland
1,20091027,503,V,Washington,21,35,21,25,102,205,210,320.0,107,2,1027,504,H,Dallas,21,29,24,17,91,8.5,8.5,-400.0,8,Washington,Dallas
2,20091027,505,V,Houston,23,18,15,31,87,187.5,188,400.0,93.5,3,1027,506,H,Portland,23,31,21,21,96,10,9,-500.0,1.5,Houston,Portland
3,20091027,507,V,LAClippers,22,27,26,17,92,208,202,625.0,102,4,1027,508,H,LALakers,32,27,17,23,99,10,12,-850.0,3.5,LAClippers,LALakers
4,20091028,701,V,Philadelphia,27,20,22,37,106,194,194.5,450.0,0.5,5,1028,702,H,Orlando,29,41,30,20,120,8,10,-600.0,96.5,Philadelphia,Orlando
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,20100608,705,V,LALakers,26,26,15,24,91,192.5,191,125.0,96,1308,608,706,H,Boston,17,23,21,23,84,2,2.5,-145.0,4,LALakers,Boston
1308,2010061,707,V,LALakers,16,29,17,27,89,191,190.5,160.0,95.5,1309,610,708,H,Boston,19,23,18,36,96,3,4,-180.0,4,LALakers,Boston
1309,20100613,709,V,LALakers,20,19,26,21,86,189,189.5,110.0,1.5,1310,613,710,H,Boston,22,23,28,19,92,3.5,2,-130.0,95.5,LALakers,Boston
1310,20100615,711,V,Boston,18,13,20,16,67,187,190.5,210.0,2.5,1311,615,712,H,LALakers,28,23,25,13,89,6.5,6,-250.0,95.5,Boston,LALakers


In [220]:
df['Open'].replace('pk','0',inplace=True)
df['Close'].replace('pk','0',inplace=True)
df['Open_2'].replace('pk','0',inplace=True)
df['Close_2'].replace('pk','0',inplace=True)
df.replace('197.5u10','197.5',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [221]:
mask = df['Open'].values == 'pk'
df[mask]

Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2,Visitor,Home


In [222]:
df['Open'] = df['Open'].astype('float')
df['Open_2'] = df['Open_2'].astype('float')
df['Close'] = df['Close'].astype('float')
df['Close_2'] = df['Close_2'].astype('float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Open'] = df['Open'].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Open_2'] = df['Open_2'].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Close'] = df['Close'].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .l

In [223]:
df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['O/U_open'] = np.where(df['Open'] > df['Open_2'],df['Open'],df['Open_2'])


Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2,Visitor,Home,O/U_open
0,20091027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5.0,-200.0,4.5,Boston,Cleveland,187.5
1,20091027,503,V,Washington,21,35,21,25,102,205.0,210.0,320.0,107,2,1027,504,H,Dallas,21,29,24,17,91,8.5,8.5,-400.0,8,Washington,Dallas,205.0
2,20091027,505,V,Houston,23,18,15,31,87,187.5,188.0,400.0,93.5,3,1027,506,H,Portland,23,31,21,21,96,10.0,9.0,-500.0,1.5,Houston,Portland,187.5
3,20091027,507,V,LAClippers,22,27,26,17,92,208.0,202.0,625.0,102,4,1027,508,H,LALakers,32,27,17,23,99,10.0,12.0,-850.0,3.5,LAClippers,LALakers,208.0
4,20091028,701,V,Philadelphia,27,20,22,37,106,194.0,194.5,450.0,0.5,5,1028,702,H,Orlando,29,41,30,20,120,8.0,10.0,-600.0,96.5,Philadelphia,Orlando,194.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,20100608,705,V,LALakers,26,26,15,24,91,192.5,191.0,125.0,96,1308,608,706,H,Boston,17,23,21,23,84,2.0,2.5,-145.0,4,LALakers,Boston,192.5
1308,2010061,707,V,LALakers,16,29,17,27,89,191.0,190.5,160.0,95.5,1309,610,708,H,Boston,19,23,18,36,96,3.0,4.0,-180.0,4,LALakers,Boston,191.0
1309,20100613,709,V,LALakers,20,19,26,21,86,189.0,189.5,110.0,1.5,1310,613,710,H,Boston,22,23,28,19,92,3.5,2.0,-130.0,95.5,LALakers,Boston,189.0
1310,20100615,711,V,Boston,18,13,20,16,67,187.0,190.5,210.0,2.5,1311,615,712,H,LALakers,28,23,25,13,89,6.5,6.0,-250.0,95.5,Boston,LALakers,187.0


In [224]:
df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['O/U_close'] = np.where(df['Close'] > df['Close_2'],df['Close'],df['Close_2'])


Unnamed: 0,Date,Rot,VH,Team,1st,2nd,3rd,4th,Final,Open,Close,ML,2H,game_id,Date_2,Rot_2,VH_2,Team_2,1st_2,2nd_2,3rd_2,4th_2,Final_2,Open_2,Close_2,ML_2,2H_2,Visitor,Home,O/U_open,O/U_close
0,20091027,501,V,Boston,21,30,21,23,95,187.5,183.5,170.0,90.5,1,1027,502,H,Cleveland,28,17,20,24,89,4.5,5.0,-200.0,4.5,Boston,Cleveland,187.5,183.5
1,20091027,503,V,Washington,21,35,21,25,102,205.0,210.0,320.0,107,2,1027,504,H,Dallas,21,29,24,17,91,8.5,8.5,-400.0,8,Washington,Dallas,205.0,210.0
2,20091027,505,V,Houston,23,18,15,31,87,187.5,188.0,400.0,93.5,3,1027,506,H,Portland,23,31,21,21,96,10.0,9.0,-500.0,1.5,Houston,Portland,187.5,188.0
3,20091027,507,V,LAClippers,22,27,26,17,92,208.0,202.0,625.0,102,4,1027,508,H,LALakers,32,27,17,23,99,10.0,12.0,-850.0,3.5,LAClippers,LALakers,208.0,202.0
4,20091028,701,V,Philadelphia,27,20,22,37,106,194.0,194.5,450.0,0.5,5,1028,702,H,Orlando,29,41,30,20,120,8.0,10.0,-600.0,96.5,Philadelphia,Orlando,194.0,194.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1307,20100608,705,V,LALakers,26,26,15,24,91,192.5,191.0,125.0,96,1308,608,706,H,Boston,17,23,21,23,84,2.0,2.5,-145.0,4,LALakers,Boston,192.5,191.0
1308,2010061,707,V,LALakers,16,29,17,27,89,191.0,190.5,160.0,95.5,1309,610,708,H,Boston,19,23,18,36,96,3.0,4.0,-180.0,4,LALakers,Boston,191.0,190.5
1309,20100613,709,V,LALakers,20,19,26,21,86,189.0,189.5,110.0,1.5,1310,613,710,H,Boston,22,23,28,19,92,3.5,2.0,-130.0,95.5,LALakers,Boston,189.0,189.5
1310,20100615,711,V,Boston,18,13,20,16,67,187.0,190.5,210.0,2.5,1311,615,712,H,LALakers,28,23,25,13,89,6.5,6.0,-250.0,95.5,Boston,LALakers,187.0,190.5


Sanity check that opening and closing lines operations worked successfully.
So long as numbers are around 200, we know we are ok.

In [225]:
df['O/U_open'].min()

172.0

In [226]:
df['O/U_close'].min()

171.5

In [227]:
df = df[['Date','Home','Visitor','O/U_open','O/U_close']]
df

Unnamed: 0,Date,Home,Visitor,O/U_open,O/U_close
0,20091027,Cleveland,Boston,187.5,183.5
1,20091027,Dallas,Washington,205.0,210.0
2,20091027,Portland,Houston,187.5,188.0
3,20091027,LALakers,LAClippers,208.0,202.0
4,20091028,Orlando,Philadelphia,194.0,194.5
...,...,...,...,...,...
1307,20100608,Boston,LALakers,192.5,191.0
1308,2010061,Boston,LALakers,191.0,190.5
1309,20100613,Boston,LALakers,189.0,189.5
1310,20100615,LALakers,Boston,187.0,190.5


## Scratch Work Graveyard

In [None]:
#tried self-merging on game_id, but this is only applicable if in different columns
#df1 = df1.merge(
#            right=df1[opp_pull_cols],
#            left_on=["game_id", "team"],
#            right_on=["game_id", "opp"],
#            suffixes=[None, "_opp"],
#        )

In [None]:
#tried to use drop_duplicates method but can only keep first or last
#df.drop_duplicates(subset=['game_id'], keep='second')