In [228]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timezone, timedelta
import boto3
import pyarrow
import awswrangler as wr
import logging

logging.basicConfig(filename='example.log', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info('Starting Logging Function')

today = datetime.now().date()
yesterday = today - timedelta(1)
day = (datetime.now() - timedelta(1)).day
month = (datetime.now() - timedelta(1)).month
year = (datetime.now() - timedelta(1)).year

In [235]:
def get_player_stats():
    year = 2021
    url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
    html = urlopen(url)
    soup = BeautifulSoup(html)

    headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
    headers = headers[1:]

    rows = soup.findAll('tr')[1:]
    player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

    stats = pd.DataFrame(player_stats, columns = headers)
    stats['PTS'] = pd.to_numeric(stats['PTS'])
    logging.info(f'General Stats Function Successful, retrieving {len(stats)} updated rows')
    return(stats)


In [236]:
df = get_player_stats()


In [237]:
def get_boxscores(month = month, day = day, year = year):
    url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(month, day, year)
    html = urlopen(url)
    soup = BeautifulSoup(html)

    try: 
        headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
        headers = headers[1:]
        headers[2] = "Location"
        headers[4] = "Outcome"

        rows = soup.findAll('tr')[1:]
        player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

        df = pd.DataFrame(player_stats, columns = headers)
        df[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']] = df[['FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'DRB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'GmSc']].apply(pd.to_numeric)

        df.sort_values('PTS', ascending = False)
        logging.info(f'Box Score Function Successful, retrieving {len(df)} rows for {yesterday}')
        return(df)
    except IndexError:
        logging.info(f"Box Score Function Failed, no data available for {yesterday}")
        return pd.DataFrame()
        # IndexError is what the Actual Error is called when this function failed before.

In [238]:
box_scores = get_boxscores(month = "7", day = "20", year = "2021")
# can be string or numeric parameter inputs

Box Score Function Successful, retrieving 17 rows for 2021-08-14


In [37]:
box_scores3 = get_boxscores()

Box Score Function Failed, no data available for 2021-08-14


In [239]:
def get_injuries():
    url = "https://www.basketball-reference.com/friv/injuries.fcgi"
    df = pd.read_html(url)[0]
    df = df.rename(columns = {"Update": "Date"})
    logging.info(f'Injury Function Successful, retrieving {len(df)} rows')
    return(df)
# not sure how this would fail, but if pandas read html stops working then go back to old script and do it manually

In [240]:
injury_data = get_injuries()

In [54]:
def get_transactions():
    url = "https://www.basketball-reference.com/leagues/NBA_2021_transactions.html"
    html = urlopen(url)
    soup = BeautifulSoup(html)
    trs = soup.findAll('li')[71:] # theres a bunch of garbage in the first 71 rows - no matter what 
    rows = []
    mylist = []
    for tr in trs:
        date = tr.find('span')
        if date is not None: # needed bc span can be null (multi <p> elements per span)
            date = date.text
        data = tr.findAll('p')
        for p in data:
            mylist.append(p.text)
        data3 = [date] + [mylist]
        rows.append(data3)
        mylist = []

    transactions = pd.DataFrame(rows)
    transactions.columns = ['Date', 'Transaction']
    transactions = transactions.explode('Transaction')
    transactions['Date'] = pd.to_datetime(transactions['Date'])
    transactions = transactions.query('Date != "NaN"')
    transactions
    logging.info(f'Transactions Function Successful, retrieving {len(transactions)} rows')
    return(transactions)

In [55]:
transactions = get_transactions()

Transactions Function Successful, retrieving 823 rows


In [57]:
# maybe add a filter or something to only grab future games
schedule_df = pd.DataFrame()
def schedule_scraper(month):
    global schedule_df
    url = "https://www.basketball-reference.com/leagues/NBA_2021_games-{}.html".format(month)
    html = urlopen(url)
    soup = BeautifulSoup(html)

    headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]

    headers[6] = 'boxScoreLink'
    headers[7] = 'isOT'
    headers = headers[1:]

    rows = soup.findAll('tr')[1:]
    date_info = [[th.getText() for th in rows[i].findAll('th')]
            for i in range(len(rows))]

    game_info = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]
    date_info = [i[0] for i in date_info]

    schedule = pd.DataFrame(game_info, columns = headers)
    schedule['Date'] = date_info
    
    # join_df = join_df.append(schedule)
    schedule_df = schedule_df.append(schedule)
    # return(join_df)
    logging.info(f'Schedule Function Completed, retrieving {len(schedule_df)} rows')

In [63]:
month_list = ['december', 'january', 'february', 'march', 'april', 'may', 'june', 'july']
schedule_df = pd.DataFrame()
for month in month_list:
    schedule_scraper(month)

Schedule Function Completed, retrieving 67 rows
Schedule Function Completed, retrieving 289 rows
Schedule Function Completed, retrieving 501 rows
Schedule Function Completed, retrieving 705 rows
Schedule Function Completed, retrieving 945 rows
Schedule Function Completed, retrieving 1118 rows
Schedule Function Completed, retrieving 1163 rows
Schedule Function Completed, retrieving 1171 rows


In [74]:
def get_advanced_stats():
    url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
    df = pd.read_html(url)
    df = pd.DataFrame(df[10])
    df.drop(columns = df.columns[0], 
        axis=1, 
        inplace=True)

    df.columns = ['Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORTG', 'DRTG', 'NRTG', 'Pace', 'FTr', '3PAr', 'TS%', 'bby1', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'bby2', 'eFG%_opp', 'TOV%_opp', 'DRB%_opp', 'FT/FGA_opp', 'bby3', 'Arena', 'Attendance', 'Att/Game']
    df.drop(['bby1', 'bby2', 'bby3'], axis = 1, inplace = True)
    df = df.query('Team != "League Average"')
    logging.info(f'Advanced Stats Function Successful, retrieving updated data for 30 rows')
    return(df)

In [72]:
advanced_stats = get_advanced_stats()

In [224]:
def get_odds():
    url = "https://sportsbook.draftkings.com/leagues/basketball/88673861?category=game-lines&subcategory=game"
    df = pd.read_html(url)
    data1 = df[0]
    data2 = df[1]
    data2 = data2.rename(columns = {"Tomorrow": "Today"})
    data = data1.append(data2)
    data
    data['SPREAD'] = data['SPREAD'].str[:-4]
    data['TOTAL'] = data['TOTAL'].str[:-4]
    data['TOTAL'] = data['TOTAL'].str[2:]
    data.reset_index(drop = True)
    data

    data['Today'] = data['Today'].str.replace("AM|PM", " ")
    data['Today'] = data['Today'].str.split().str[1:2]
    data['Today'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in data['Today']])
    data = data.rename(columns = {"Today": "team", "SPREAD": "spread", "TOTAL": "total_pts", "MONEYLINE": "moneyline"})
    logging.info(f'Odds Function Successful, retrieving {len(data)} rows')
    return(data)

In [225]:
odds = get_odds()

Odds Function Successful, retrieving 16 rows


  data['Today'] = data['Today'].str.replace("AM|PM", " ")


In [226]:
odds

Unnamed: 0,team,spread,total_pts,moneyline
0,QuarterDEN,,,185
1,QuarterDAL,,,-245
2,ATL,-1.0,166.5,-120
3,MIA,1.0,166.5,100
4,PHI,2.5,172.5,110
5,BOS,-2.5,172.5,-130
6,IND,-4.5,164.5,-180
7,OKC,4.5,164.5,155
8,CHA,5.0,170.5,165
9,TOR,-5.0,170.5,-195
