In [1]:
import os
import sys
from datetime import datetime, timezone, timedelta
from urllib.request import urlopen
from urllib.error import HTTPError
import logging
from bs4 import BeautifulSoup
from sqlalchemy import exc, create_engine
import pymysql
import numpy as np
import pandas as pd
import boto3
from botocore.exceptions import ClientError
# import pyarrow
import awswrangler as wr
# from urllib.error import URLError, HTTPError


logging.basicConfig(filename='example.log', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info('Starting Logging Function')

Tomorrow = datetime.now().date()
yesterday = Tomorrow - timedelta(1)
day = (datetime.now() - timedelta(1)).day
month = (datetime.now() - timedelta(1)).month
year = (datetime.now() - timedelta(1)).year
season_type = 'Regular Season'

In [8]:
def get_boxscores(month = month, day = day, year = year):
    url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(month, day, year)
    try: 
        html = urlopen(url)
        soup = BeautifulSoup(html, "html.parser")
        headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
        headers = headers[1:]
        headers[1] = 'Team'
        headers[2] = "Location"
        headers[3] = 'Opponent'
        headers[4] = "Outcome"
        headers[6] = "FGM"
        headers[8] = "FGPercent"
        headers[9] = "threePFGMade"
        headers[10] = "threePAttempted"
        headers[11] = "threePointPercent"
        headers[14] = "FTPercent"
        headers[15] = "OREB"
        headers[16] = "DREB"
        headers[24] = 'PlusMinus'

        rows = soup.findAll('tr')[1:]
        player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

        df = pd.DataFrame(player_stats, columns = headers)
        df[['FGM', 'FGA', 'FGPercent', 'threePFGMade', 'threePAttempted', 'threePointPercent', 'OREB', 'DREB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PlusMinus', 'GmSc']] = df[['FGM', 'FGA', 'FGPercent', 'threePFGMade', 'threePAttempted', 'threePointPercent','OREB', 'DREB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PlusMinus', 'GmSc']].apply(pd.to_numeric)
        df['date'] = str(year) + '-' + str(month) + '-' + str(day)
        df['date'] = pd.to_datetime(df['date'])
        df['Type'] = season_type
        df['Season'] = 2022
        df['Location'] = df['Location'].apply(lambda x: 'A' if x == '@' else 'H')
        df['Team'] = df['Team'].str.replace("PHO", "PHX")
        df['Team'] = df['Team'].str.replace("CHO", "CHA")
        df['Team'] = df['Team'].str.replace("BRK", "BKN")
        df['Opponent'] = df['Opponent'].str.replace("PHO", "PHX")
        df['Opponent'] = df['Opponent'].str.replace("CHO", "CHA")
        df['Opponent'] = df['Opponent'].strs.replace("BRK", "BKN")
        df = df.query('Player == Player').reset_index()
        df['Player'] = df['Player'].str.normalize('NFKD').str.encode('ascii', errors = 'ignore').str.decode('utf-8')
        df.columns = df.columns.str.lower()
        df = df.drop('index', axis =1)
        logging.info(f'Box Score Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}')
        print(f'Box Score Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}')
        return(df)
    except BaseException as error:
        logging.info(f"Box Score Function Failed, {error}, no data available for {year}-{month}-{day}")
        print(f"Box Score Function Failed, {error}, no data available for {year}-{month}-{day}")
        df = []
        return(df)

In [10]:
df= 5

In [11]:
df = get_boxscores()

Box Score Function Failed, 'Series' object has no attribute 'strs', no data available for 2021-10-30


In [11]:
def get_odds():
    """
    Web Scrape function w/ pandas read_html that grabs current day's nba odds

    Args:
        None

    Returns:
        Pandas DataFrame of NBA moneyline + spread odds for upcoming games for that day
    """
    try:
        url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
        df = pd.read_html(url)

        data1 = df[0].copy()
        date_try = str(year) + " " + data1.columns[0]
        data1["date"] = np.where(
            date_try == "2021 Tomorrow",
            datetime.now().date(),  # if the above is true, then return this
            str(year) + " " + data1.columns[0],  # if false then return this
        )
        # )
        date_try = data1["date"].iloc[0]
        data1.columns.values[0] = "Tomorrow"
        data1.reset_index(drop=True)
        data1["Tomorrow"] = data1["Tomorrow"].str.replace(
            "LA Clippers", "LAC Clippers", regex=True
        )

        data1["Tomorrow"] = data1["Tomorrow"].str.replace("AM", "AM ", regex=True)
        data1["Tomorrow"] = data1["Tomorrow"].str.replace("PM", "PM ", regex=True)
        data1["Time"] = data1["Tomorrow"].str.split().str[0]
        data1["datetime1"] = pd.to_datetime(
            date_try.strftime("%Y-%m-%d") + " " + data1["Time"]
        ) - timedelta(hours=6) + timedelta(days = 1)

        data2 = df[1].copy()
        data2.columns.values[0] = "Tomorrow"
        data2.reset_index(drop=True)
        data2["Tomorrow"] = data2["Tomorrow"].str.replace(
            "LA Clippers", "LAC Clippers", regex=True
        )
        data2["Tomorrow"] = data2["Tomorrow"].str.replace("AM", "AM ", regex=True)
        data2["Tomorrow"] = data2["Tomorrow"].str.replace("PM", "PM ", regex=True)
        data2["Time"] = data2["Tomorrow"].str.split().str[0]
        data2["datetime1"] = (
            pd.to_datetime(date_try.strftime("%Y-%m-%d") + " " + data2["Time"])
            - timedelta(hours=6)
            + timedelta(days=1)
        )
        data2["date"] = data2["datetime1"].dt.date

        data = data1.append(data2).reset_index(drop=True)
        data["SPREAD"] = data["SPREAD"].str[:-4]
        data["TOTAL"] = data["TOTAL"].str[:-4]
        data["TOTAL"] = data["TOTAL"].str[2:]
        data["Tomorrow"] = data["Tomorrow"].str.split().str[1:2]
        data["Tomorrow"] = pd.DataFrame(
            [str(line).strip("[").strip("]").replace("'", "") for line in data["Tomorrow"]]
        )
        data["SPREAD"] = data["SPREAD"].str.replace("pk", "-1", regex=True)
        data["SPREAD"] = data["SPREAD"].str.replace("+", "", regex=True)
        data.columns = data.columns.str.lower()
        data = data[["tomorrow", "spread", "total", "moneyline", "date", "datetime1"]]
        data = data.rename(columns={data.columns[0]: "team"})
        data = data.query("date == date.min()")  # only grab games from upcoming day
        print(f"Odds Function Successful, retrieving {len(data)} rows")
        return data
    except BaseException as error:
        print(f"Odds Function Failed, {error}")
        data = []
        return data

In [12]:
df = get_odds()

Odds Function Successful, retrieving 16 rows


In [None]:
df.to_sql(con = conn, name = 'aws_odds_source', if_exists = 'append', index = False)

In [7]:
df

Unnamed: 0,team,spread,total,moneyline,date,datetime1
0,NY,1.0,212.0,-105,2021-11-08,2021-11-07 18:10:00
1,PHI,-1.0,212.0,-115,2021-11-08,2021-11-07 18:10:00
2,BKN,1.5,215.5,100,2021-11-08,2021-11-07 19:10:00
3,CHI,-1.5,215.5,-120,2021-11-08,2021-11-07 19:10:00
4,MIN,6.0,217.0,185,2021-11-08,2021-11-07 19:10:00
5,MEM,-6.0,217.0,-225,2021-11-08,2021-11-07 19:10:00
6,NO,8.5,212.5,280,2021-11-08,2021-11-07 19:40:00
7,DAL,-8.5,212.5,-365,2021-11-08,2021-11-07 19:40:00
8,MIA,-1.5,205.0,-125,2021-11-08,2021-11-07 20:10:00
9,DEN,1.5,205.0,105,2021-11-08,2021-11-07 20:10:00


In [10]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)

data1 = df[0].copy()
date_try = str(year) + " " + data1.columns[0]
data1["date"] = np.where(
    date_try == "2021 Tomorrow",
    datetime.now().date(),  # if the above is true, then return this
    str(year) + " " + data1.columns[0],  # if false then return this
)
# )
date_try = data1["date"].iloc[0]
data1.columns.values[0] = "Tomorrow"
data1.reset_index(drop=True)
data1["Tomorrow"] = data1["Tomorrow"].str.replace(
    "LA Clippers", "LAC Clippers", regex=True
)

data1["Tomorrow"] = data1["Tomorrow"].str.replace("AM", "AM ", regex=True)
data1["Tomorrow"] = data1["Tomorrow"].str.replace("PM", "PM ", regex=True)
data1["Time"] = data1["Tomorrow"].str.split().str[0]
data1["datetime1"] = pd.to_datetime(
    date_try.strftime("%Y-%m-%d") + " " + data1["Time"]
) - timedelta(hours=6) + timedelta(days = 1)

In [6]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)

data1 = df[0].copy()
data2 =df[1].copy()
data3 = df[2].copy()