In [10]:
import sys
 
# adding Folder_2 to the system path
sys.path.insert(0, '../utils')

from typing import Dict, List
import argparse

from config import Config
from pgConnect import PgConnection
from dfs_dao import Dfs_dao
from requestLimiter import RequestLimiter
from teamRosterReader import TeamRosterReader, learn_teams_from_summary
from scheduleReader import BoxscoreReader, learn_schedule_from_month
from bs4 import BeautifulSoup


In [11]:
# ======
# 1. Read configs
# ======
# Config and PgConnection come from ../utils. The captured cell output shows
# that this cell connects to the PostgreSQL database when it runs.
config : Config = Config()
pgc : PgConnection = PgConnection(config)

# 'reader' section: base URL of the site being scraped.
read_constants : Dict[str, str] = config.parse_section('reader')
BASE : str = read_constants['base']
# NAME is everything after the first '.' in BASE (for the base URL shown in
# the cell output, 'https://www.basketball-reference.com', that is
# 'basketball-reference.com'). It is only used below to build LOAD_FILE.
NAME : str = BASE[BASE.find('.') + 1:]

# 'requestLimiter' section: where the limiter state file lives and its settings.
rl_constants : Dict[str, str] = config.parse_section('requestLimiter')
load_loc = rl_constants['load_location']
# load_loc is concatenated directly with the file name, so it presumably ends
# with a path separator -- TODO confirm against the config file.
LOAD_FILE : str = f'{load_loc}{NAME}.p'
INTERVAL : int = int(rl_constants['interval'])
LIMIT : int = int(rl_constants['limit'])

# ======
# 2. Run parameters and shared objects
#    (values are hard-coded here; nothing is parsed from the command line)
# ======
# The October schedule fetched for YEAR = 2023 contains an Oct 18, 2022 game
# (see the loop cell's output), so YEAR looks like the season's ending year.
YEAR : int = 2023

# The limiter is created with one request fewer than the configured limit;
# presumably a safety margin -- TODO confirm the intent.
rl : RequestLimiter = RequestLimiter(BASE, 
                    interval = INTERVAL, 
                    limit = LIMIT - 1, 
                    load = LOAD_FILE)
# NOTE(review): the meaning of the two None arguments is not visible in this
# notebook, and trr is not used by any cell shown here.
trr : TeamRosterReader = TeamRosterReader(None, None, YEAR, rl)
br : BoxscoreReader = BoxscoreReader(rl)
dao : Dfs_dao = Dfs_dao(pgc)

# URL template for a monthly schedule page; filled with (YEAR, month name).
schedule_base = BASE + '/leagues/NBA_{}_games-{}.html'

# Month names used in the schedule URLs, listed October through June.
MONTHS : List[str] = ['october', 
            'november', 
            'december', 
            'january',
            'february',
            'march',
            'april',
            'may',
            'june']

Connecting to the PostgreSQL database...
In constructor...
Successfully loaded previous Rate Limiter info for https://www.basketball-reference.com
Initialized with 2 of 19 entries filled

Saving RequestLimiter status to disk...


In [12]:
from bs4utils import get_ith_table, read_ith_table
from bs4.element import Tag
import pandas as pd
from typing import Tuple, Any

In [13]:
def get_team_names(soup : BeautifulSoup) -> Tuple[str, str]:
    """
    Read the two team names from a box score page.

    The names are taken from the 2nd and 3rd <strong> tags of the page
    (indices 1 and 2); index 0 is skipped -- presumably a non-team heading,
    TODO confirm against the page layout.

    Parameters
        soup : parsed box score page

    Returns
        (tm1, tm2) with surrounding whitespace stripped. get_all_info labels
        tm1 as away ('0') and tm2 as home ('1').

    Raises
        IndexError if the page has fewer than three <strong> tags
    """
    # Search the document once instead of once per team name.
    strong_tags = soup.find_all('strong')
    if len(strong_tags) < 3:
        raise IndexError(
            f'expected at least 3 <strong> tags, found {len(strong_tags)}')
    tm1 : str = strong_tags[1].text.strip()
    tm2 : str = strong_tags[2].text.strip()
    return tm1, tm2


def check_players(players : List[Tuple[Any,...]], name_idx : int = 21) -> None:
    """
    Sanity-check merged player tuples.

    Each player tuple is the concatenation of that player's rows from several
    tables (see _get_player_tuples). The first field of the first row (index 0)
    must equal the first field of the second row (index name_idx). A mismatch
    means rows from different players were glued together.

    Parameters
        players  : merged player tuples
        name_idx : position of the second row's first field; the default of 21
                   matches the original hard-coded value (presumably the first
                   table contributes 21 columns -- TODO confirm)

    Raises
        AssertionError if the two fields differ for any player
        IndexError if a tuple is shorter than name_idx + 1
    """
    for p in players:
        # Raise explicitly rather than via `assert`, so the check still runs
        # under `python -O` and the error says which player is misaligned.
        if p[0] != p[name_idx]:
            raise AssertionError(
                f'player rows are misaligned: {p[0]!r} != {p[name_idx]!r}')

def get_all_info(soup : BeautifulSoup,
                 tm1_inds : Tuple[int, ...] = (0, 7),
                 tm2_inds : Tuple[int, ...] = (8, 15)):
    """
    Collect team-level and player-level box score data for both teams.

    Parameters
        soup     : parsed box score page
        tm1_inds : indices of the page tables to read for the first team,
                   passed on to process_team_tables
        tm2_inds : same, for the second team
        The defaults reproduce the previously hard-coded indices. Pages laid
        out with a different number of tables would need other indices --
        NOTE(review): confirm which layouts occur.

    Returns
        Tuple with 4 items
            - tm1_tuple, tm1_player_list (tuples)
            - tm2_tuple, tm2_player_list (tuples)
        Each team tuple starts with the team name and a '0'/'1' flag,
        followed by the team totals read from the tables.

    Raises
        AssertionError (from check_players) if player rows from the different
        tables do not line up
    """
    tm1, tm2 = get_team_names(soup)

    tm1_tup, tm1_players = process_team_tables(soup, list(tm1_inds))
    tm2_tup, tm2_players = process_team_tables(soup, list(tm2_inds))
    check_players(tm1_players)
    check_players(tm2_players)

    # Extra 0/1 represents away vs home
    tm1_tup = (tm1, '0') + tm1_tup
    tm2_tup = (tm2, '1') + tm2_tup

    return tm1_tup, tm1_players, tm2_tup, tm2_players


def process_team_tables(soup : BeautifulSoup, inds : List[int]):
    """
    Read the page tables listed in `inds` and merge them for one team.

    For every index, in order, the table is read with read_ith_table, the
    outer level of its column header is dropped, and then:
        - the team totals from _get_tm_info are appended to one flat tuple
        - the player rows are merged position by position by _get_player_tuples

    Returns
        (team_tuple, player_tuple_list)
    """
    team_parts : List[Tuple[Any, ...]] = []
    player_rows : List[Tuple[Any, ...]] = []
    for table_index in inds:
        table = read_ith_table(soup, table_index)
        # Keep only the inner header level so columns can be addressed by name.
        table.columns = table.columns.droplevel()
        team_parts.append(_get_tm_info(table))
        player_rows = _get_player_tuples(table, player_rows)

    # Concatenate the per-table totals in the order the tables were read.
    team_values : Tuple[Any, ...] = ()
    for part in team_parts:
        team_values = team_values + part
    return team_values, player_rows


def _get_tm_info(df : pd.DataFrame) -> Tuple[Any, ...]:
    last_row = df.iloc[-1, :]
    assert last_row['Starters'] == 'Team Totals'
    tup = tuple(list(last_row)[2:-1])
    return tup


def _get_player_tuples(df : pd.DataFrame(), playerTups : List[Tuple[Any,...]]) -> Tuple[Any, ...]:
    player_df = df[~df['MP'].isin(['Did Not Play', '240', 'MP'])].reset_index()
    for num, row in player_df.iterrows():
        player_tup : Tuple[Any,...] = tuple(list(row)[1:])
        if len(playerTups) <= num:
            playerTups.append(player_tup)
        else:
            playerTups[num] = playerTups[num] + player_tup
    return playerTups

In [14]:
def process_time(tim : str):
    """
    Convert a 12-hour start time such as '7:30p' into 'H:MM:00' on a 24-hour clock.

    Parameters
        tim : '<hour>:<minutes><marker>', where the last character is the
              am/pm marker ('a' or 'p')

    Returns
        '<hour>:<minutes>:00' with the hour in 0-23 and not zero-padded,
        e.g. '7:30p' -> '19:30:00', '10:00a' -> '10:00:00'

    Twelve o'clock is handled explicitly: '12:00p' is noon ('12:00:00') and
    '12:30a' is just after midnight ('0:30:00'). Adding 12 blindly would turn
    noon into the invalid '24:00:00' and leave midnight at hour 12.
    """
    parts = tim.split(':')
    # 12 is hour 0 of its half of the day, for both am and pm.
    hour = int(parts[0]) % 12
    # Drop the trailing am/pm marker from the minutes field.
    minutes = parts[1][:-1]
    if 'p' in tim.lower():
        hour += 12
    return f'{hour}:{minutes}:00'

# Walk the schedule month by month and store one team_box entry pair per game.
# MONTHS[:1] restricts this run to the first month ('october') only.
for month in MONTHS[:1]:
    link = schedule_base.format(YEAR, month)
    df = learn_schedule_from_month(link, rl)
    # print(df.head())
    # ctr counts processed games; together with the break at the bottom it
    # stops the run after a single game (looks like a debugging limit).
    ctr = 0
    for num, row in df.iterrows():
        # NOTE: `link` is reused here, now for the individual game's page.
        link = BASE + row['game_link']
        br.set_link(link)
        soup : BeautifulSoup  = br.get_soup()
        tm1_tuple, tm1_players, tm2_tuple,tm2_players = get_all_info(soup)
        
        # Game-level fields shared by both entries; attendance is stringified.
        game_info = (row['Date'], process_time(row['Start (ET)']), str(row['Attend.']), row['Arena']) 
        # The same game is stored twice, once with each team listed first.
        game_entry1 = game_info + tm1_tuple + tm2_tuple
        game_entry2 = game_info + tm2_tuple + tm1_tuple
        
        # Only the team-level entries are written here; tm1_players and
        # tm2_players are computed but not stored by this cell.
        dao.team_box_to_db((game_entry1, game_entry2))
        # print(tm2_tuple)
        ctr += 1
        if ctr > 0:
            break

Successfully processed append to queue...
Size of current queue... 3
Saving RequestLimiter status to disk...
Successfully processed append to queue...
Size of current queue... 4
Saving RequestLimiter status to disk...
INSERT INTO team_box VALUES ('Tue, Oct 18, 2022','19:30:00','19156','TD Garden','Philadelphia 76ers','0','40','80','.500','13','34','.382','24','28','.857','4','27','31','16','8','3','14','25','117','.634','.581','.425','.350','11.8','81.8','46.3','40.0','8.1','6.4','13.2','100.0','119.2','128.3','Boston Celtics','1','46','82','.561','12','35','.343','22','28','.786','6','30','36','24','8','3','10','24','126','.668','.634','.427','.341','18.2','88.2','53.7','52.2','8.1','6.5','9.6','100.0','128.3','119.2'),('Tue, Oct 18, 2022','19:30:00','19156','TD Garden','Boston Celtics','1','46','82','.561','12','35','.343','22','28','.786','6','30','36','24','8','3','10','24','126','.668','.634','.427','.341','18.2','88.2','53.7','52.2','8.1','6.5','9.6','100.0','128.3','119.2','Phil

In [7]:
# Scratch cell: rebuilds the game entry for a single game and prints it.
# NOTE(review): `soup` and `row` are not defined in this cell; it relies on the
# values left behind by the scraping loop, so it fails on a fresh kernel unless
# that loop has run first.
tm1_tuple, tm1_players, tm2_tuple,tm2_players = get_all_info(soup)
        
# Same construction as inside the scraping loop.
game_info = (row['Date'], process_time(row['Start (ET)']), str(row['Attend.']), row['Arena']) 
game_entry1 = game_info + tm1_tuple + tm2_tuple
game_entry2 = game_info + tm2_tuple + tm1_tuple
print(game_entry1)

('Tue, Oct 18, 2022', '19:30:00', '19156', 'TD Garden', 'Philadelphia 76ers', '0', '40', '80', '.500', '13', '34', '.382', '24', '28', '.857', '4', '27', '31', '16', '8', '3', '14', '25', '117', '.634', '.581', '.425', '.350', '11.8', '81.8', '46.3', '40.0', '8.1', '6.4', '13.2', '100.0', '119.2', '128.3', 'Boston Celtics', '1', '46', '82', '.561', '12', '35', '.343', '22', '28', '.786', '6', '30', '36', '24', '8', '3', '10', '24', '126', '.668', '.634', '.427', '.341', '18.2', '88.2', '53.7', '52.2', '8.1', '6.5', '9.6', '100.0', '128.3', '119.2')


In [8]:
# Insert a single game entry into team_box, skipping it if a row with the same
# (game_date, game_time, tm1, tm2) already exists.
# NOTE(review): relies on `game_entry1` and `dao` from earlier cells.

# A team_box entry has 72 fields: 4 game-level fields plus 34 per team
# (name, away/home flag and the team totals). Check the length up front so a
# malformed entry fails here with a clear message.
TEAM_BOX_FIELD_COUNT = 72
if len(game_entry1) != TEAM_BOX_FIELD_COUNT:
    raise ValueError(
        f'expected {TEAM_BOX_FIELD_COUNT} fields for team_box, '
        f'got {len(game_entry1)}')

# Generate the "(%s,%s,...)" template instead of typing every placeholder by
# hand, where one missing comma silently fuses two values together.
placeholders = "(" + ",".join(["%s"] * len(game_entry1)) + ")"
args = dao.cur.mogrify(placeholders, game_entry1).decode('utf-8')
qry = "INSERT INTO team_box VALUES " + (args) + " ON CONFLICT "\
                "(game_date, game_time, tm1, tm2) DO NOTHING"
dao._try_insertion(qry, 'team_box')

Committed team_box insertion!


In [52]:
# cur.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"\
#                         "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"\
#                         "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"\
#                         "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"\
#                         "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"\
#                         "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"\
#                         "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
#                         i).decode('utf-8') 


"('Tue, Oct 18, 2022','7:30p','19156','TD Garden','Philadelphia 76ers',0,'40','80','.500','13''34','.382','24','28','.857','4','27','31','16','8''3','14','25','117','.634','.581','.425','.350','11.8','81.8''46.3','40.0','8.1','6.4','13.2','100.0','119.2','128.3','Boston Celtics',1'46','82','.561','12','35','.343','22','28','.786','6''30','36','24','8','3','10','24','126','.668','.634''.427','.341','18.2','88.2','53.7','52.2','8.1','6.5','9.6','100.0','128.3','119.2')"

In [104]:
# Scratch cell: show the last row of a table frame `a`; the output displayed
# below it is a 'Team Totals' row.
# NOTE(review): `a` is not defined in any cell visible here, so this cell fails
# on a fresh kernel.
# a.columns = a.columns.droplevel()
a.iloc[-1, :]

Starters    Team Totals
MP                  240
TS%                .668
eFG%               .634
3PAr               .427
FTr                .341
ORB%               18.2
DRB%               88.2
TRB%               53.7
AST%               52.2
STL%                8.1
BLK%                6.5
TOV%                9.6
USG%              100.0
ORtg              128.3
DRtg              119.2
BPM                 NaN
Name: 16, dtype: object

In [15]:
# Scratch cell: try to read the table with id 'div_line_score' from the page.
# When run it printed "No table found on this HTML page!".
# NOTE(review): relies on `soup` left over from the scraping loop.
read_ith_table(soup, 0, id = 'div_line_score')

No table found on this HTML page!


In [13]:
# Scratch cell: look for a <table id="line_score"> directly in the parsed page.
# It returned an empty list, so no such table is present in the parsed markup
# (presumably it is delivered some other way -- TODO confirm).
soup.find_all('table', id = 'line_score')

[]