In [2]:
import pandas as pd, numpy as np, datetime as dt, re
import matplotlib.pyplot as plt, scipy.stats as stats
import sqlalchemy as sql

from db_info import connection_str

In [9]:
engine = sql.create_engine(connection_str)
conn = engine.connect()

In [37]:
df_odds = pd.read_sql("select * from odds", con=conn, index_col="id")
df_time = pd.read_sql("select * from time", con=conn, index_col="id", parse_dates="timestamp")
df_gamecodes = pd.read_sql("select * from gamecodes", con=conn, index_col="id", parse_dates="date")

In [38]:
df_odds.shape

(86128, 9)

In [39]:
df_time.shape

(754886, 5)

In [40]:
df_gamecodes.shape

(5067, 4)

In [41]:
df_time.head()

Unnamed: 0_level_0,timestamp,book,spread,total,game_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,2021-10-30 08:10:27,BOVADA,-7 -110,+224.5 -110,1459741
2,2021-10-30 08:35:12,BOVADA,-7 -110,+223.5 -110,1459741
3,2021-10-30 15:43:56,BOVADA,-7 -110,+224 -110,1459741
4,2021-10-30 15:50:00,BOVADA,-7 -110,+224.5 -110,1459741
5,2021-10-30 16:06:15,BOVADA,-7 -110,+223.5 -110,1459741


In [18]:
# There Are 3 Missing Gamecodes: Investigate
id_regex = re.compile("\d{6,7}") # -12-2017-{6,7}
game_links = []
# Read in List of URL
with open("wanted_links.txt", "r") as f:
    game_links = f.read().splitlines()

In [19]:
print(len(game_links))
games_set = set()

5070


In [20]:
for game in game_links:
    games_set.add(id_regex.search(game)[0])

In [22]:
df_gamecodes.head()

Unnamed: 0,id,home_abbv,away_abbv,date,game_id
0,1,UTA,SAC,2021-11-02 21:00:00,1459841
1,2,DAL,MIA,2021-11-02 19:30:00,1459836
2,3,CHR,CLE,2021-11-01 19:00:00,1459786
3,4,SAN,DAL,2021-11-03 20:30:00,1459896
4,5,ATL,WAS,2021-11-01 19:30:00,1459806


In [27]:
df_gamecodes["game_id"] = df_gamecodes.game_id.astype("str")
df_odds["game_id"] = df_odds.game_id.astype("str")
df_time["game_id"] = df_time.game_id.astype("str")

In [42]:
df_gamecodes[df_gamecodes.game_id.isin(games_set)]

Unnamed: 0_level_0,home_abbv,away_abbv,date,game_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,UTA,SAC,2021-11-02 21:00:00,1459841
2,DAL,MIA,2021-11-02 19:30:00,1459836
3,CHR,CLE,2021-11-01 19:00:00,1459786
4,SAN,DAL,2021-11-03 20:30:00,1459896
5,ATL,WAS,2021-11-01 19:30:00,1459806
...,...,...,...,...
5063,UTA,DEN,2017-10-18 21:00:00,888675
5064,IND,BKN,2017-10-18 19:00:00,888645
5065,SAC,HOU,2017-10-18 22:00:00,888690
5066,LAL,LAC,2017-10-19 22:30:00,888705


In [43]:
df_odds

Unnamed: 0_level_0,team_abbv,book,moneyline,spread,spread_odds,total,over_odds,under_odds,game_id
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,SAC,Opening,305,10.0,-110,220.0,-110,-110,1459841
2,UTA,Opening,-455,-10.0,-110,220.0,-110,-110,1459841
3,SAC,BOVADA,310,9.0,-110,223.0,-110,-110,1459841
4,UTA,BOVADA,-415,-9.0,-110,223.0,-110,-110,1459841
5,SAC,BetOnline,340,9.0,-106,223.0,-110,-110,1459841
...,...,...,...,...,...,...,...,...,...
86124,PHO,MyBookie,120,2.5,-110,219.0,-110,-110,888685
86125,POR,GTBets,-140,-2.5,-110,219.5,-110,-110,888685
86126,PHO,GTBets,120,2.5,-110,219.5,-110,-110,888685
86127,POR,SkyBook,-999,-999.0,-999,-999.0,-999,-999,888685


In [102]:
def seasoner(date):
    if date <= dt.datetime(2018, 6, 8):
        return "17-18"
    elif date <= dt.datetime(2019, 6, 13):
        return "18-19"
    elif date <= dt.datetime(2020, 10, 11):
        return "19-20"
    elif date <= dt.datetime(2021, 7, 21):
        "20-21"
    else:
        return "21-22"

In [118]:
df_gamecodes["season"] = df_gamecodes.date.apply(seasoner)

In [64]:
df_gamecodes.groupby(["season"]).count()

Unnamed: 0_level_0,home_abbv,away_abbv,date,game_id,game_code
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
17-18,1324,1324,1324,1324,1324
18-19,1307,1307,1307,1307,1307
19-20,1143,1143,1143,1143,1143
21-22,119,119,119,119,119


In [53]:
engine = sql.create_engine("sqlite:///../data/interim/NBA.db")
conn = engine.connect()
df_player_basic= pd.read_sql("SELECT * FROM basic_stats", con=conn)

In [55]:
df_player_basic.head(20)

Unnamed: 0,id,name,minutes_played,fg,fga,fg_pct,fg3,fg3a,fg3_pct,ft,...,stl,blk,tov,pf,pts,bpm,dnp,timetype,team,game_code
0,1,Luka Dončić,35.15,7.0,14.0,0.5,4.0,6.0,0.667,4.0,...,0.0,0.0,3.0,1.0,22.0,0.0,0,TotalBasics,DAL,201810260TOR
1,2,DeAndre Jordan,33.5,5.0,10.0,0.5,0.0,0.0,0.0,8.0,...,0.0,1.0,2.0,1.0,18.0,0.0,0,TotalBasics,DAL,201810260TOR
2,3,Wesley Matthews,32.2,9.0,15.0,0.6,3.0,8.0,0.375,0.0,...,0.0,0.0,2.0,4.0,21.0,-8.0,0,TotalBasics,DAL,201810260TOR
3,4,Jalen Brunson,29.6833,3.0,11.0,0.273,2.0,4.0,0.5,0.0,...,1.0,0.0,2.0,2.0,8.0,-3.0,0,TotalBasics,DAL,201810260TOR
4,5,Harrison Barnes,28.2333,5.0,17.0,0.294,1.0,5.0,0.2,3.0,...,0.0,1.0,1.0,2.0,14.0,-24.0,0,TotalBasics,DAL,201810260TOR
5,6,Maxi Kleber,30.3833,2.0,6.0,0.333,1.0,4.0,0.25,3.0,...,2.0,4.0,0.0,0.0,8.0,-1.0,0,TotalBasics,DAL,201810260TOR
6,7,Dorian Finney-Smith,19.95,2.0,6.0,0.333,1.0,2.0,0.5,0.0,...,1.0,0.0,0.0,4.0,5.0,2.0,0,TotalBasics,DAL,201810260TOR
7,8,J.J. Barea,15.5333,3.0,11.0,0.273,0.0,1.0,0.0,1.0,...,0.0,0.0,2.0,0.0,7.0,-5.0,0,TotalBasics,DAL,201810260TOR
8,9,Dwight Powell,14.5,2.0,2.0,1.0,0.0,0.0,0.0,0.0,...,2.0,0.0,2.0,3.0,4.0,-9.0,0,TotalBasics,DAL,201810260TOR
9,10,Daryl Macon,0.8667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0,TotalBasics,DAL,201810260TOR


In [60]:
# Transform game_id into bball ref code 
df_gamecodes["game_code"] = (df_gamecodes.date.apply(dt.datetime.strftime, format="%Y%m%d0")) + df_gamecodes.home_abbv

In [65]:
check_reference = df_player_basic.query("name == 'Team Totals' & timetype == 'TotalBasics'").loc[:, ["team", "game_code"]]

In [75]:
abbv_regex = re.compile("[A-Z]{3}")

In [76]:
check_reference["home_team"] = check_reference.game_code.apply(lambda x: abbv_regex.search(x)[0])

In [80]:
check_reference["home_team_bin"] = (check_reference["team"] == check_reference["home_team"]).astype("int32")

In [81]:
check_reference

Unnamed: 0,team,game_code,home_team,home_team_bin
13,DAL,201810260TOR,TOR,0
109,TOR,201810260TOR,TOR,1
195,BOS,201810190TOR,TOR,0
293,TOR,201810190TOR,TOR,1
391,CLE,201810170TOR,TOR,0
...,...,...,...,...
960383,SAS,202105160SAS,SAS,1
960483,PHI,202105020SAS,SAS,0
960609,SAS,202105020SAS,SAS,1
960721,PHO,202105150SAS,SAS,0


In [86]:
df_gamecodes.shape

(5067, 6)

In [91]:
df_gamecodes.home_abbv.unique()

array(['UTA', 'DAL', 'CHR', 'SAN', 'ATL', 'LAL', 'GS', 'IND', 'BKN',
       'MIN', 'SAC', 'CHI', 'WAS', 'CLE', 'LAC', 'BOS', 'MIL', 'PHI',
       'DET', 'ORL', 'PHO', 'MEM', 'MIA', 'NY', 'TOR', 'DEN', 'NOP',
       'POR', 'HOU', 'OKC'], dtype=object)

In [93]:
check_reference.team.unique()

array(['DAL', 'TOR', 'BOS', 'CLE', 'CHO', 'MIN', 'DET', 'NOP', 'NYK',
       'PHI', 'WAS', 'LAC', 'SAS', 'POR', 'BRK', 'IND', 'GSW', 'PHO',
       'OKC', 'ORL', 'CHI', 'UTA', 'HOU', 'ATL', 'MIL', 'DEN', 'SAC',
       'MIA', 'LAL', 'MEM'], dtype=object)

In [94]:
shark_teams_flipper = {'CHR': 'CHO', 'SAN': 'SAS', 'GS': 'GSW', 'BKN': 'BRK', 'NY': 'NYK'}
df_gamecodes.home_abbv = df_gamecodes.home_abbv.apply(lambda x: shark_teams_flipper[x] if x in shark_teams_flipper.keys() else x)
df_gamecodes.away_abbv = df_gamecodes.away_abbv.apply(lambda x: shark_teams_flipper[x] if x in shark_teams_flipper.keys() else x) 

In [96]:
df_gamecodes["game_code"] = (df_gamecodes.date.apply(dt.datetime.strftime, format="%Y%m%d0")) + df_gamecodes.home_abbv

In [119]:
merged_df = df_gamecodes.merge(check_reference, "outer", on="game_code")

In [145]:
not_in_NBA = merged_df[merged_df.home_team.isnull()].sort_values(['game_code'])[lambda x: x.date < dt.datetime(2021, 10, 17)].game_code.values

In [146]:
not_in_SQL = merged_df[merged_df.home_abbv.isnull()].sort_values("game_code").game_code.unique()

In [150]:
# feed game_codes into game.py
not_in_NBA

array(['202008150POR', '202012220BRK', '202012220LAL', '202012230BOS',
       '202012230CHI', '202012230CLE', '202012230DEN', '202012230IND',
       '202012230MEM', '202012230MIN', '202012230ORL', '202012230PHI',
       '202012230PHO', '202012230POR', '202105180BOS', '202105180IND',
       '202105190LAL', '202105190MEM', '202105200WAS', '202105210GSW'],
      dtype=object)

In [153]:
# look up in odds shark and feed into shark.py
not_in_SQL

array(['201801290ATL', '201803180LAC', '201803180MIN', '201803180NOP',
       '201803180TOR', '201803220CHO', '201803220DAL', '201803220HOU',
       '201803220NOP', '201803220ORL', '201803220SAC', '201803250BRK',
       '201803250GSW', '201803250HOU', '201803250IND', '201803250MIL',
       '201803250OKC', '201803250SAC', '201803250TOR', '201803250WAS',
       '201803260CHO', '201803260DET', '201803260MIN', '201803260PHI',
       '201803260PHO', '201803310BOS', '201803310MIA', '201803310NYK',
       '201803310SAC', '201803310WAS', '201804050CLE', '201804050DEN',
       '201804050HOU', '201804050IND', '201804050MIL', '201804050UTA',
       '201804060BOS', '201804060DET', '201804060LAL', '201804060MEM',
       '201804060NYK', '201804060ORL', '201804060PHI', '201804060PHO',
       '201804060TOR', '201804060WAS', '201811190ATL', '201903260CHO',
       '201903260CLE', '201903260DAL', '201903260DEN', '201903260LAL',
       '201903260MIA', '201903260MIL', '201903260MIN', '201903260NOP',
      

In [183]:
with open("../data/interim/bball_ref_additional.txt", "w") as file:
    for x in not_in_NBA:
        file.write(x + "\n")
    file.close()