In [3]:
import os
import sys
from datetime import datetime, timezone, timedelta
from urllib.request import urlopen
import logging
from bs4 import BeautifulSoup
from sqlalchemy import exc, create_engine
import pymysql
import numpy as np
import pandas as pd
import boto3
from botocore.exceptions import ClientError

logging.basicConfig(filename='example.log', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info('Starting Logging Function')

today = datetime.now().date()
yesterday = today - timedelta(1)
day = (datetime.now() - timedelta(1)).day
month = (datetime.now() - timedelta(1)).month
year = (datetime.now() - timedelta(1)).year
season_type = 'Regular Season'


In [None]:
# version: 2.0.9-dev0-dev0-dev0

In [2]:
def sql_connection():
    try:
        connection = create_engine('postgresql+psycopg2://' + os.environ.get('RDS_USER') + ':' + os.environ.get('RDS_PW') + '@' + os.environ.get('IP') + ':' + '5432' + '/' + os.environ.get('RDS_DB'),
                                    connect_args = {'options': '-csearch_path=nba_source'}, # defining schema to connect to
                     echo = False)
        logging.info('SQL Connection Successful')
        print('SQL Connection Successful')
        return(connection)
    except exc.SQLAlchemyError as e:
        logging.info('SQL Connection Failed, Error:', e)
        print('SQL Connection Failed, Error:', e)
        return(e)

In [8]:
"""
Fixture to load pbp data from a csv file for testing.
"""
fname = 'new_test/tests/fixture_csvs/pbp_data.csv'
df = pd.read_csv(fname)
yesterday = datetime.now().date() - timedelta(1)
yesterday_hometeams = (
    df.query('location == "H"')[["team"]].drop_duplicates().dropna()
)
yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
    "PHX", "PHO"
)
yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
    "CHA", "CHO"
)
yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
    "BKN", "BRK"
)

away_teams = (
    df.query('location == "A"')[["team", "opponent"]].drop_duplicates().dropna()
)
away_teams = away_teams.rename(
    columns={
        away_teams.columns[0]: "AwayTeam",
        away_teams.columns[1]: "HomeTeam",
    }
)
newdate = str(
    df["date"].drop_duplicates()[0]
)  # this assumes all games in the boxscores df are 1 date
newdate = pd.to_datetime(newdate).strftime(
    "%Y%m%d"
) 

In [6]:
def get_player_stats():
    try:
        year = 2022
        url = "https://www.basketball-reference.com/leagues/NBA_{}_per_game.html".format(year)
        html = urlopen(url)
        soup = BeautifulSoup(html)

        headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
        headers = headers[1:]

        rows = soup.findAll('tr')[1:]
        player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

        stats = pd.DataFrame(player_stats, columns = headers)
        stats['PTS'] = pd.to_numeric(stats['PTS'])
        logging.info(f'General Stats Function Successful, retrieving {len(stats)} updated rows')
        print(f'General Stats Function Successful, retrieving {len(stats)} updated rows')
        return(stats)
    except IndexError:
        logging.info("General Stats Function Failed for Today's Games")
        print("General Stats Function Failed for Today's Games")
        df = []
        return(df)

In [29]:
subreddit = reddit.subreddit(sub)
posts = []
for post in subreddit.hot(limit=27):
    posts.append(
        [
            post.title,
            post.score,
            post.id,
            post.url,
            post.num_comments,
            post.selftext,
            today,
            todaytime,
        ]
    )
posts = pd.DataFrame(
    posts,
    columns=[
        "title",
        "score",
        "id",
        "url",
        "num_comments",
        "body",
        "scrape_date",
        "scrape_time",
    ],
)
posts.columns = posts.columns.str.lower()

In [30]:
df1.to_csv('pbp_data.csv', index = False)

In [7]:
df = get_player_stats()

General Stats Function Failed for Today's Games


In [3]:
conn = sql_connection()

SQL Connection Successful


In [32]:
def get_boxscores(month=month, day=day, year=year):
    url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(
        month, day, year
    )
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")

    try:
        headers = [th.getText() for th in soup.findAll("tr", limit=2)[0].findAll("th")]
        headers = headers[1:]
        headers[1] = "Team"
        headers[2] = "Location"
        headers[3] = "Opponent"
        headers[4] = "Outcome"
        headers[6] = "FGM"
        headers[8] = "FGPercent"
        headers[9] = "threePFGMade"
        headers[10] = "threePAttempted"
        headers[11] = "threePointPercent"
        headers[14] = "FTPercent"
        headers[15] = "OREB"
        headers[16] = "DREB"
        headers[24] = "PlusMinus"

        rows = soup.findAll("tr")[1:]
        player_stats = [
            [td.getText() for td in rows[i].findAll("td")] for i in range(len(rows))
        ]

        df = pd.DataFrame(player_stats, columns=headers)
        df[
            [
                "FGM",
                "FGA",
                "FGPercent",
                "threePFGMade",
                "threePAttempted",
                "threePointPercent",
                "OREB",
                "DREB",
                "TRB",
                "AST",
                "STL",
                "BLK",
                "TOV",
                "PF",
                "PTS",
                "PlusMinus",
                "GmSc",
            ]
        ] = df[
            [
                "FGM",
                "FGA",
                "FGPercent",
                "threePFGMade",
                "threePAttempted",
                "threePointPercent",
                "OREB",
                "DREB",
                "TRB",
                "AST",
                "STL",
                "BLK",
                "TOV",
                "PF",
                "PTS",
                "PlusMinus",
                "GmSc",
            ]
        ].apply(
            pd.to_numeric
        )
        df["date"] = str(year) + "-" + str(month) + "-" + str(day)
        df["date"] = pd.to_datetime(df["date"])
        df["Type"] = season_type
        df["Season"] = 2022
        df["Location"] = df["Location"].apply(lambda x: "A" if x == "@" else "H")
        df["Team"] = df["Team"].str.replace("PHO", "PHX")
        df["Team"] = df["Team"].str.replace("CHO", "CHA")
        df["Team"] = df["Team"].str.replace("BRK", "BKN")
        df["Opponent"] = df["Opponent"].str.replace("PHO", "PHX")
        df["Opponent"] = df["Opponent"].str.replace("CHO", "CHA")
        df["Opponent"] = df["Opponent"].str.replace("BRK", "BKN")
        df = df.query("Player == Player").reset_index(drop=True)
        df["Player"] = (
            df["Player"]
            .str.normalize("NFKD")
            .str.encode("ascii", errors="ignore")
            .str.decode("utf-8")
        )
        df.columns = df.columns.str.lower()
        logging.info(
            f"Box Score Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}"
        )
        print(
            f"Box Score Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}"
        )
        return df
    except IndexError:
        logging.info(
            f"Box Score Function Failed, no data available for {year}-{month}-{day}"
        )
        print(f"Box Score Function Failed, no data available for {year}-{month}-{day}")
        df = []
        return df

In [34]:
df1 = get_boxscores()

Box Score Function Successful, retrieving 171 rows for 2022-10-23


In [5]:
url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(month, day, year)
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")


headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
headers = headers[1:]
headers[1] = 'Team'
headers[2] = "Location"
headers[3] = 'Opponent'
headers[4] = "Outcome"
headers[6] = "FGM"
headers[8] = "FGPercent"
headers[9] = "threePFGMade"
headers[10] = "threePAttempted"
headers[11] = "threePointPercent"
headers[14] = "FTPercent"
headers[15] = "OREB"
headers[16] = "DREB"
headers[24] = 'PlusMinus'

rows = soup.findAll('tr')[1:]
player_stats = [[td.getText() for td in rows[i].findAll('td')]
    for i in range(len(rows))]

df = pd.DataFrame(player_stats, columns = headers)
df[['FGM', 'FGA', 'FGPercent', 'threePFGMade', 'threePAttempted', 'threePointPercent', 'OREB', 'DREB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PlusMinus', 'GmSc']] = df[['FGM', 'FGA', 'FGPercent', 'threePFGMade', 'threePAttempted', 'threePointPercent','OREB', 'DREB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PlusMinus', 'GmSc']].apply(pd.to_numeric)
df['date'] = str(year) + '-' + str(month) + '-' + str(day)
df['date'] = pd.to_datetime(df['date'])
df['Type'] = season_type
df['Season'] = 2022
df['Location'] = df['Location'].apply(lambda x: 'A' if x == '@' else 'H')
df['Team'] = df['Team'].str.replace("PHO", "PHX")
df['Team'] = df['Team'].str.replace("CHO", "CHA")
df['Team'] = df['Team'].str.replace("BRK", "BKN")
df['Opponent'] = df['Opponent'].str.replace("PHO", "PHX")
df['Opponent'] = df['Opponent'].str.replace("CHO", "CHA")
df['Opponent'] = df['Opponent'].str.replace("BRK", "BKN")
df.columns = df.columns.str.lower()

IndexError: list index out of range

In [10]:
df2 = get_boxscores(month = month, day = day, year = year)

Box Score Function Failed, no data available for 2021-10-5


In [14]:
df1.to_sql(con = conn, name = 'aws_boxscores_source', if_exists = 'replace', index = False)

In [None]:
schedule_df.to_sql(con = conn, name = "aws_schedule_source", if_exists = 'replace')

In [3]:
df = get_boxscores()

Box Score Function Failed, no data available for 2021-09-03


In [222]:
yesterday_hometeams = df.query('Location == "H"')[['Team']].drop_duplicates().dropna()
yesterday_hometeams['Team'] = yesterday_hometeams['Team'].str.replace("PHX", "PHO")
yesterday_hometeams['Team'] = yesterday_hometeams['Team'].str.replace("CHA", "CHO")
yesterday_hometeams['Team'] = yesterday_hometeams['Team'].str.replace("BKN", "BRK")

In [129]:
# work on this tmmw 2020 12 23 has like 12 games so try that
url = "https://www.basketball-reference.com/boxscores/pbp/202012220BRK.html"
df = pd.read_html(url)[0]
df = df.droplevel(0, axis = 'columns')
df = df.rename(columns={df.columns[1]: 'Away', df.columns[2]: 'AwayScore', df.columns[4]: 'HomeScore', df.columns[5]: 'Home'})
df

Unnamed: 0,Time,Away,AwayScore,Score,HomeScore,Home
0,12:00.0,Jump ball: J. Wiseman vs. D. Jordan (J. Harris...,Jump ball: J. Wiseman vs. D. Jordan (J. Harris...,Jump ball: J. Wiseman vs. D. Jordan (J. Harris...,Jump ball: J. Wiseman vs. D. Jordan (J. Harris...,Jump ball: J. Wiseman vs. D. Jordan (J. Harris...
1,11:50.0,,,0-0,,Turnover by D. Jordan (bad pass)
2,11:38.0,Shooting foul by K. Irving (drawn by S. Curry),,0-0,,
3,11:38.0,S. Curry makes free throw 1 of 2,+1,1-0,,
4,11:38.0,S. Curry makes free throw 2 of 2,+1,2-0,,
...,...,...,...,...,...,...
517,0:30.0,,,96-125,,Turnover by B. Brown (bad pass)
518,0:10.0,J. Wiseman misses 2-pt jump shot from 9 ft,,96-125,,
519,0:08.0,Offensive rebound by D. Lee,,96-125,,
520,0:07.0,M. Mulder makes 3-pt jump shot from 26 ft (ass...,+3,99-125,,


In [11]:
newdate = yesterday.strftime("%Y%m%d")
pracdate = '20201223'

In [4]:
def get_pbp_data(df):
    """
    Web Scrape function w/ pandas read_html that uses boxscore team aliases to scrape the pbp data interactively for each game played the previous day

    Args:
        None

    Returns:
        All PBP Data for the games returned in the Box Scores functions

    """
    if len(df) > 0:
        yesterday_hometeams = (
            df.query('location == "H"')[["team"]].drop_duplicates().dropna()
        )
        yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
            "PHX", "PHO"
        )
        yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
            "CHA", "CHO"
        )
        yesterday_hometeams["team"] = yesterday_hometeams["team"].str.replace(
            "BKN", "BRK"
        )

        away_teams = (
            df.query('location == "A"')[["team", "opponent"]].drop_duplicates().dropna()
        )
        away_teams = away_teams.rename(
            columns={
                away_teams.columns[0]: "AwayTeam",
                away_teams.columns[1]: "HomeTeam",
            }
        )
    else:
        yesterday_hometeams = []

    if len(yesterday_hometeams) > 0:
        try:
            newdate = str(df['date'].drop_duplicates()[0].date()) # this assumes all games in the boxscores df are 1 date 
            newdate = pd.to_datetime(newdate).strftime("%Y%m%d") # formatting into url format.
            pbp_list = pd.DataFrame()
            for i in yesterday_hometeams["team"]:
                url = "https://www.basketball-reference.com/boxscores/pbp/{}0{}.html".format(
                    newdate, i
                )
                df = pd.read_html(url)[0]
                df.columns = df.columns.map("".join)
                df = df.rename(
                    columns={
                        df.columns[0]: "Time",
                        df.columns[1]: "descriptionPlayVisitor",
                        df.columns[2]: "AwayScore",
                        df.columns[3]: "Score",
                        df.columns[4]: "HomeScore",
                        df.columns[5]: "descriptionPlayHome",
                    }
                )
                conditions = [
                    (
                        df["HomeScore"].str.contains("Jump ball:", na=False)
                        & df["Time"].str.contains("12:00.0")
                    ),
                    (df["HomeScore"].str.contains("Start of 2nd quarter", na=False)),
                    (df["HomeScore"].str.contains("Start of 3rd quarter", na=False)),
                    (df["HomeScore"].str.contains("Start of 4th quarter", na=False)),
                    (df["HomeScore"].str.contains("Start of 1st overtime", na=False)),
                    (df["HomeScore"].str.contains("Start of 2nd overtime", na=False)),
                    (df["HomeScore"].str.contains("Start of 3rd overtime", na=False)),
                    (df["HomeScore"].str.contains("Start of 4th overtime", na=False)), # if more than 4 ots then rip
                ]
                values = [
                    "1st Quarter",
                    "2nd Quarter",
                    "3rd Quarter",
                    "4th Quarter",
                    "1st OT",
                    "2nd OT",
                    "3rd OT",
                    "4th OT",
                ]
                df["Quarter"] = np.select(conditions, values, default=None)
                df["Quarter"] = df["Quarter"].fillna(method="ffill")
                df = df.query(
                    'Time != "Time" & Time != "2nd Q" & Time != "3rd Q" & Time != "4th Q" & Time != "1st OT" & Time != "2nd OT" & Time != "3rd OT" & Time != "4th OT"'
                ).copy() # use COPY to get rid of the fucking goddamn warning bc we filtered stuf out
                         # anytime you filter out values w/o copying and run code like the lines below it'll throw a warning.
                df["HomeTeam"] = i
                df["HomeTeam"] = df["HomeTeam"].str.replace("PHO", "PHX")
                df["HomeTeam"] = df["HomeTeam"].str.replace("CHO", "CHA")
                df["HomeTeam"] = df["HomeTeam"].str.replace("BRK", "BKN")
                df = df.merge(away_teams)
                df[["scoreAway", "scoreHome"]] = df["Score"].str.split("-", expand=True)
                df["scoreAway"] = pd.to_numeric(df["scoreAway"], errors="coerce")
                df["scoreAway"] = df["scoreAway"].fillna(method="ffill")
                df["scoreAway"] = df["scoreAway"].fillna(0)
                df["scoreHome"] = pd.to_numeric(df["scoreHome"], errors="coerce")
                df["scoreHome"] = df["scoreHome"].fillna(method="ffill")
                df["scoreHome"] = df["scoreHome"].fillna(0)
                df["marginScore"] = df["scoreHome"] - df["scoreAway"]
                df["Date"] = yesterday
                df = df.rename(
                    columns={
                        df.columns[0]: "timeQuarter",
                        df.columns[6]: "numberPeriod",
                    }
                )
                pbp_list = pbp_list.append(df)
                df = pd.DataFrame()
            pbp_list.columns = pbp_list.columns.str.lower()
            pbp_list = pbp_list.query('(awayscore.notnull()) | (homescore.notnull())', engine = 'python')
            # filtering only scoring plays here, keep other all other rows in future for lineups stuff etc.
            return pbp_list
        except ValueError:
            logging.info("PBP Function Failed for Yesterday's Games")
            print("PBP Function Failed for Yesterday's Games")
            df = []
            return df
    else:
        df = []
        logging.info("PBP Function No Data Yesterday")
        print("PBP Function No Data Yesterday")
        return df


In [39]:
# df = get_pbp_data(df1)
newdatetry = str(df1['date'].drop_duplicates()[0].date())
newdatetry = pd.to_datetime(newdatetry).strftime("%Y%m%d")

In [None]:
yesterday.strftime("%Y%m%d")

In [13]:
df1['newdate'] = str(year) + '-' + str(month) + '-' + str(day)
df1['newdate'] = pd.to_datetime(df1['newdate'])

In [65]:
away_teams = df1.query('Location == "A"')[['Team', 'Opponent']].drop_duplicates().dropna()
away_teams = away_teams.rename(columns = {away_teams.columns[0]: 'AwayTeam', away_teams.columns[1]: 'HomeTeam'})

In [469]:
del df

In [8]:
pbp_data = get_pbp_data(df1)
# df['Team'] = df['Team'].str.replace("BRK", "BKN")

In [10]:
pbp_data.to_sql(con = conn, name = "aws_pbp_data_source", index = False, if_exists = 'replace')

In [54]:
pbp3 = pbp_data.query('(awayscore.notnull()) | (homescore.notnull())', engine = 'python')

In [527]:

pbp_data.to_clipboard()

In [263]:
url = "https://www.basketball-reference.com/boxscores/pbp/202012230MEM.html"
df = pd.read_html(url)[0]
df.columns = df.columns.map(''.join)
df = df.rename(columns={df.columns[0]: 'Time', df.columns[1]: 'Away', df.columns[2]: 'AwayScore', df.columns[3]: 'Score', df.columns[4]: 'HomeScore', df.columns[5]: 'Home'})
df = df.query('Time != "Time" & Time != "2nd Q" & Time != "3rd Q" & Time != "4th Q"')
df['Time'] = df['Time'].astype(str).str[:-3]
df.to_clipboard()


In [513]:
pbp_data['marginScore'] = pbp_data['scoreHome'] - pbp_data['scoreAway']

In [511]:
pbp_data

Unnamed: 0,Time,descriptionPlayVisitor,AwayScore,Score,HomeScore,descriptionPlayHome,Quarter,HomeTeam,AwayTeam,scoreAway,scoreHome,Date,marginScore
0,12:00.0,Jump ball: T. Thompson vs. K. Durant (M. Smart...,Jump ball: T. Thompson vs. K. Durant (M. Smart...,Jump ball: T. Thompson vs. K. Durant (M. Smart...,Jump ball: T. Thompson vs. K. Durant (M. Smart...,Jump ball: T. Thompson vs. K. Durant (M. Smart...,1st Quarter,BKN,BOS,0.0,0.0,2021-09-04,0.0
1,11:43.0,M. Smart misses 3-pt jump shot from 26 ft,,0-0,,,1st Quarter,BKN,BOS,0.0,0.0,2021-09-04,0.0
2,11:40.0,,,0-0,,Defensive rebound by K. Irving,1st Quarter,BKN,BOS,0.0,0.0,2021-09-04,0.0
3,11:28.0,,,0-0,,K. Irving misses 3-pt jump shot from 25 ft (bl...,1st Quarter,BKN,BOS,0.0,0.0,2021-09-04,0.0
4,11:24.0,Defensive rebound by T. Thompson,,0-0,,,1st Quarter,BKN,BOS,0.0,0.0,2021-09-04,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
465,0:18.0,,,85-115,,J. Carter misses free throw 2 of 2,4th Quarter,PHX,LAL,85.0,115.0,2021-09-04,-30.0
466,0:15.0,Defensive rebound by T. Horton-Tucker,,85-115,,,4th Quarter,PHX,LAL,85.0,115.0,2021-09-04,-30.0
467,0:05.0,T. Horton-Tucker misses 3-pt jump shot from 34 ft,,85-115,,,4th Quarter,PHX,LAL,85.0,115.0,2021-09-04,-30.0
468,0:03.0,,,85-115,,Defensive rebound by J. Carter,4th Quarter,PHX,LAL,85.0,115.0,2021-09-04,-30.0


In [301]:
pbp_data2.head(15)
pbp_data2['Time'][0]

'12:00'

In [419]:
pbp_data.to_sql(con = conn, name = 'pbp_prac', index = False, if_exists = 'replace')

In [265]:
df.index.nlevels
df.columns.nlevels
df['Time'].describe

<bound method NDFrame.describe of 0      12:0
1      11:4
2      11:4
3      11:4
4      11:4
       ... 
474     0:2
475     0:2
476     0:2
477     0:0
478     0:0
Name: Time, Length: 473, dtype: object>

In [136]:
df2 = df['Team'] # df2 returns a Series with only the team column
df2 = df[['Team']] # df2 returns a dataframe with only the team column
df2['Team'] = df2['Team'].str.replace("PHO", "PHX") # df2 returns the whole df while fixing the team col
# https://www.basketball-reference.com/boxscores/202107200MIL.html
# https://www.basketball-reference.com/boxscores/pbp/202107200MIL.html

In [190]:
def get_injuries():
    try:
        url = "https://www.basketball-reference.com/friv/injuries.fcgi"
        df = pd.read_html(url)[0]
        df = df.rename(columns = {"Update": "Date"})
        df1 = df['Description'].str.split(pat = ' - ', expand = True)
        df2 = df1[0].str.split(pat = '\\(', expand = True)
        df3 = df2[1].str.rstrip(')')
        df_final = df[['Player', 'Team' ,'Date']]
        df_final['Status'] = df2[0]
        df_final['Injury'] = df3[1]
        df_final['Description'] = df1[1]
        logging.info(f'Injury Function Successful, retrieving {len(df_final)} rows')
        print(f'Injury Function Successful, retrieving {len(df_final)} rows')
        return(df_final)
    except ValueError:
        logging.info("Injury Function Failed for Today's Games")
        print("Injury Function Failed for Today's Games")
        df = []
        return(df)

In [198]:
df = get_injuries()

Injury Function Successful, retrieving 17 rows


In [6]:
def get_transactions():
    url = "https://www.basketball-reference.com/leagues/NBA_2021_transactions.html"
    html = urlopen(url)
    soup = BeautifulSoup(html)
    trs = soup.findAll('li')[71:] # theres a bunch of garbage in the first 71 rows - no matter what 
    rows = []
    mylist = []
    for tr in trs:
        date = tr.find('span')
        if date is not None: # needed bc span can be null (multi <p> elements per span)
            date = date.text
        data = tr.findAll('p')
        for p in data:
            mylist.append(p.text)
        data3 = [date] + [mylist]
        rows.append(data3)
        mylist = []

    transactions = pd.DataFrame(rows)
    transactions.columns = ['Date', 'Transaction']
    transactions = transactions.explode('Transaction')
    transactions['Date'] = pd.to_datetime(transactions['Date'])
    transactions = transactions.query('Date != "NaN"')
    transactions
    logging.info(f'Transactions Function Successful, retrieving {len(transactions)} rows')
    print(f'Transactions Function Successful, retrieving {len(transactions)} rows')
    return(transactions)

In [10]:
url = "https://www.basketball-reference.com/leagues/NBA_2022_transactions.html"
html = urlopen(url)
soup = BeautifulSoup(html)
trs = soup.findAll('li')[70:] # theres a bunch of garbage in the first 71 rows - no matter what 
rows = []
mylist = []
for tr in trs:
        date = tr.find('span')
        if date is not None: # needed bc span can be null (multi <p> elements per span)
            date = date.text
        data = tr.findAll('p')
        for p in data:
            mylist.append(p.text)
        data3 = [date] + [mylist]
        rows.append(data3)
        mylist = []
transactions = pd.DataFrame(rows)
transactions.columns = ['Date', 'Transaction']
transactions = transactions.query('Date == Date & Date != ""').reset_index()
transactions = transactions.explode('Transaction')
transactions['Date'] = pd.to_datetime(transactions['Date'])
transactions = transactions.query('Date != "NaN"')
transactions

Unnamed: 0,index,Date,Transaction
0,0,2021-07-29,The Los Angeles Clippers traded cash and a 202...
1,1,2021-07-30,The Memphis Grizzlies waived Jontay Porter.
1,1,2021-07-30,The Oklahoma City Thunder traded cash and a 20...
1,1,2021-07-30,The Indiana Pacers traded Georgios Kalaitzakis...
1,1,2021-07-30,The Charlotte Hornets traded a 2022 1st round ...
...,...,...,...
70,70,2021-10-14,The Memphis Grizzlies signed Ahmad Caver.
70,70,2021-10-14,The Memphis Grizzlies signed Matthew Hurt.
70,70,2021-10-14,The Memphis Grizzlies waived Romeo Weems.
70,70,2021-10-14,The Memphis Grizzlies waived Sean McDermott.


In [21]:
def schedule_scraper(month):
    try:
        global schedule_df
        url = "https://www.basketball-reference.com/leagues/NBA_2022_games-{}.html".format(month)
        html = urlopen(url)
        soup = BeautifulSoup(html)

        headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
        headers[6] = 'boxScoreLink'
        headers[7] = 'isOT'
        headers = headers[1:]

        rows = soup.findAll('tr')[1:]
        date_info = [[th.getText() for th in rows[i].findAll('th')]
                for i in range(len(rows))]

        game_info = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
        date_info = [i[0] for i in date_info]

        schedule = pd.DataFrame(game_info, columns = headers)
        schedule['Date'] = date_info

        schedule_df = schedule_df.append(schedule)
        logging.info(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
        print(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
    except ValueError:
        logging.info("Schedule Scraper Function Failed")
        print("Schedule Scraper Function Failed")
        df = []
        return(df)

month_list = ['october', 'november', 'december', 'january', 'february', 'march', 'april']
schedule_df = pd.DataFrame()
for month in month_list:
    schedule_scraper(month)

Schedule Function Completed for october, retrieving 93 rows
Schedule Function Completed for november, retrieving 318 rows
Schedule Function Completed for december, retrieving 538 rows
Schedule Function Completed for january, retrieving 765 rows
Schedule Function Completed for february, retrieving 925 rows
Schedule Function Completed for march, retrieving 1150 rows
Schedule Function Completed for april, retrieving 1230 rows


In [11]:
schedule_df = schedule_df[['Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Date']]
schedule_df['proper_date'] = pd.to_datetime(schedule_df['Date']).dt.date
schedule_df.columns = schedule_df.columns.str.lower()
schedule_df = schedule_df.rename(columns = {"start (et)": "start_time", "visitor/neutral": "away_team", "home/neutral": "home_team"})

In [3]:
conn = sql_connection()
# schedule_df.to_sql(con = conn, name = 'aws_schedule_table', index = False, if_exists = 'append')

SQL Connection Successful


In [32]:
def get_advanced_stats():
    """
    Web Scrape function w/ pandas read_html that grabs all team advanced stats
    Args:
        None
    Returns:
        Pandas DataFrame of all current Team Advanced Stats
    """
    try:
        url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
        df = pd.read_html(url)
        df = pd.DataFrame(df[10])
        df = df.drop(columns=df.columns[0], axis=1)

        df.columns = [
            "Team",
            "Age",
            "W",
            "L",
            "PW",
            "PL",
            "MOV",
            "SOS",
            "SRS",
            "ORTG",
            "DRTG",
            "NRTG",
            "Pace",
            "FTr",
            "3PAr",
            "TS%",
            "bby1",  # the bby columns are because of hierarchical html formatting - they're just blank columns
            "eFG%",
            "TOV%",
            "ORB%",
            "FT/FGA",
            "bby2",
            "eFG%_opp",
            "TOV%_opp",
            "DRB%_opp",
            "FT/FGA_opp",
            "bby3",
            "Arena",
            "Attendance",
            "Att/Game",
        ]
        df = df.drop(["bby1", "bby2", "bby3"], axis=1)
        df = df.query('Team != "League Average"').reset_index()
        # Playoff teams get a * next to them ??  fkn stupid, filter it out.
        df["Team"] = df["Team"].str.replace("*", "", regex=True)
        df.columns = df.columns.str.lower()
        logging.info(
            f"Advanced Stats Function Successful, retrieving updated data for 30 Teams"
        )
        print(
            f"Advanced Stats Function Successful, retrieving updated data for 30 Teams"
        )
        return df
    except ValueError:
        logging.info("Advanced Stats Function Failed for Today's Games")
        print("Advanced Stats Function Failed for Today's Games")
        df = []
        return df

In [33]:
df = get_advanced_stats()

Advanced Stats Function Successful, retrieving updated data for 30 Teams


In [15]:
url = "https://www.basketball-reference.com/leagues/NBA_2021.html"
df = pd.read_html(url)
df = pd.DataFrame(df[10])
df.drop(columns=df.columns[0], axis=1, inplace=True)

df.columns = [
    "Team",
    "Age",
    "W",
    "L",
    "PW",
    "PL",
    "MOV",
    "SOS",
    "SRS",
    "ORTG",
    "DRTG",
    "NRTG",
    "Pace",
    "FTr",
    "3PAr",
    "TS%",
    "bby1",  # the bby columns are because of hierarchical html formatting - they're just blank columns
    "eFG%",
    "TOV%",
    "ORB%",
    "FT/FGA",
    "bby2",
    "eFG%_opp",
    "TOV%_opp",
    "DRB%_opp",
    "FT/FGA_opp",
    "bby3",
    "Arena",
    "Attendance",
    "Att/Game",
]
df.drop(["bby1", "bby2", "bby3"], axis=1, inplace=True)
df = df.query('Team != "League Average"')
df["Team"] = df["Team"].str.replace("*", "", regex=True)
df.columns = df.columns.str.lower()
df

Unnamed: 0,team,age,w,l,pw,pl,mov,sos,srs,ortg,...,tov%,orb%,ft/fga,efg%_opp,tov%_opp,drb%_opp,ft/fga_opp,arena,attendance,att/game
0,Utah Jazz,28.5,52.0,20.0,55,17,9.25,-0.29,8.97,117.6,...,12.7,24.5,0.195,0.507,10.3,79.3,0.159,Vivint Smart Home Arena,151300.0,4203.0
1,Los Angeles Clippers,28.8,47.0,25.0,49,23,6.18,-0.16,6.02,117.6,...,12.2,22.7,0.186,0.531,11.9,79.1,0.186,STAPLES Center,13901.0,386.0
2,Phoenix Suns,26.6,51.0,21.0,49,23,5.82,-0.15,5.67,117.2,...,11.5,20.8,0.177,0.534,12.4,78.5,0.194,Phoenix Suns Arena,104027.0,2890.0
3,Milwaukee Bucks,28.1,46.0,26.0,48,24,5.89,-0.32,5.57,117.2,...,12.0,23.3,0.177,0.536,11.5,79.7,0.157,Fiserv Forum,64780.0,1799.0
4,Philadelphia 76ers,27.1,49.0,23.0,48,24,5.58,-0.31,5.28,113.2,...,12.8,23.2,0.225,0.521,13.8,78.2,0.2,Wells Fargo Center,68583.0,1905.0
5,Denver Nuggets,26.1,47.0,25.0,47,25,4.93,-0.11,4.82,117.1,...,12.1,24.7,0.176,0.545,12.9,78.9,0.2,Ball Arena,54563.0,1516.0
6,Brooklyn Nets,28.2,48.0,24.0,46,26,4.5,-0.27,4.24,118.3,...,12.2,21.4,0.208,0.531,11.1,77.3,0.187,Barclays Center,30491.0,847.0
7,Los Angeles Lakers,28.2,42.0,30.0,42,30,2.79,-0.03,2.77,109.9,...,13.6,22.5,0.2,0.526,13.7,79.7,0.184,STAPLES Center,23313.0,648.0
8,Dallas Mavericks,26.3,42.0,30.0,41,31,2.26,-0.01,2.26,115.4,...,11.1,21.1,0.189,0.534,11.5,77.8,0.197,American Airlines Center,94849.0,2635.0
9,New York Knicks,25.6,41.0,31.0,41,31,2.31,-0.18,2.13,110.6,...,11.9,21.9,0.19,0.509,11.7,78.6,0.196,Madison Square Garden (IV),42131.0,1170.0


In [8]:
df.columns = ['Team', 'Age', 'W', 'L', 'PW', 'PL', 'MOV', 'SOS', 'SRS', 'ORTG', 'DRTG', 'NRTG', 'Pace', 'FTr', '3PAr', 'TS%', 'bby1', 'eFG%', 'TOV%', 'ORB%', 'FT/FGA', 'bby2', 'eFG%_opp', 'TOV%_opp', 'DRB%_opp', 'FT/FGA_opp', 'bby3', 'Arena', 'Attendance', 'Att/Game']
df.drop(['bby1', 'bby2', 'bby3'], axis = 1, inplace = True)
df = df.query('Team != "League Average"')
df['Team'] = df['Team'].str.replace("*", "", regex = True)
df.columns = df.columns.str.lower()

In [6]:
df.to_sql(con = conn, name = "aws_adv_stats_table", index = False, if_exists = 'replace')

In [8]:
def get_odds():
    try:
        url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
        df = pd.read_html(url)
        data1 = df[0]
        date_try = str(year) + ' ' + data1.columns[0]
        date_try = pd.to_datetime(date_try, errors = 'coerce', format = '%Y %a %b %dth')
        data1['date'] = date_try
        data1.columns.values[0] = "Today"
        data1.reset_index(drop = True) 
        data1['Today'] = data1['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)
        data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
        data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
        data1['Time'] = data1['Today'].str.split().str[0] 
        data1['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data1['Time']) - timedelta(hours = 5)

        # could maybe filter out only that days games for the odds
        data2 = df[1]
        data2.columns.values[0] = "Today"
        data2.reset_index(drop = True)
        data2['Today'] = data2['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)
        data2['Today'] = data2['Today'].str.replace("AM", "AM ", regex = True)
        data2['Today'] = data2['Today'].str.replace("PM", "PM ", regex = True)
        data2['Time'] = data2['Today'].str.split().str[0]
        data2['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data2['Time']) - timedelta(hours = 5) + timedelta(days = 1)
        data2['date'] = data2['datetime1'].dt.date

        data = data1.append(data2).reset_index(drop = True)
        data['SPREAD'] = data['SPREAD'].str[:-4]
        data['TOTAL'] = data['TOTAL'].str[:-4]
        data['TOTAL'] = data['TOTAL'].str[2:]
        data['Today'] = data['Today'].str.split().str[1:2]
        data['Today'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in data['Today']])
        data['SPREAD'] = data['SPREAD'].str.replace("pk", "-1", regex = True)
        data['SPREAD'] = data['SPREAD'].str.replace("+", "", regex = True)
        data.columns = data.columns.str.lower()
        data = data[['today', 'spread', 'total', 'moneyline', 'date', 'datetime1']]
        data = data.rename(columns={data.columns[0]: 'team'})
        data = data.query('date == date.min()') # only grab games from upcoming day
        logging.info(f'Odds Function Successful, retrieving {len(data)} rows')
        print(f'Odds Function Successful, retrieving {len(data)} rows')
        return(data)
    except ValueError:
        logging.info("Odds Function Failed for Today's Games")
        print("Odds Function Failed for Today's Games")
        data = []
        return(data)

In [9]:
odds = get_odds()

Odds Function Failed for Today's Games


In [10]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)
data1 = df[0]
date_try = str(year) + ' ' + data1.columns[0]
date_try = pd.to_datetime(date_try, errors = 'coerce', format = '%Y %a %b %dth')
data1['date'] = date_try
data1.columns.values[0] = "Today"
data1.reset_index(drop = True) 
data1['Today'] = data1['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)
data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
data1['Time'] = data1['Today'].str.split().str[0] 
data1['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data1['Time']) - timedelta(hours = 5)

# could maybe filter out only that days games for the odds
data2 = df[1]
data2.columns.values[0] = "Today"
data2.reset_index(drop = True)
data2['Today'] = data2['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)
data2['Today'] = data2['Today'].str.replace("AM", "AM ", regex = True)
data2['Today'] = data2['Today'].str.replace("PM", "PM ", regex = True)
data2['Time'] = data2['Today'].str.split().str[0]
data2['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data2['Time']) - timedelta(hours = 5) + timedelta(days = 1)
data2['date'] = data2['datetime1'].dt.date


ValueError: NaTType does not support strftime

In [19]:
odds.to_sql(con = conn, name = 'aws_odds_source', if_exists = 'replace', index = False)

In [12]:
# url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
# df = pd.read_html(url)
# data1 = df[0]
# date_try = str(year) + ' ' + data1.columns[0]
# date_try = pd.to_datetime(date_try, errors = 'coerce', format = '%Y %a %b %dth')
# data1['date'] = date_try
# data1.columns.values[0] = "Today"
# data1.reset_index(drop = True) 
# data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
# data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
# data1['Time'] = data1['Today'].str.split().str[0] 
data1['datetime1'] = str(date_try) + ' ' + data1['Time']
# data1['datetime1'] = pd.to_datetime(data1['datetime1'], format = "%Y-%m-%d %I:%M%p") - timedelta(hours = 5)

NameError: name 'date_try' is not defined

In [2]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)
# data1 = df[0]
# date_try = str(year) + ' ' + data1.columns[0]
# date_try = pd.to_datetime(date_try, errors = 'coerce', format = '%Y %a %b %dth')
# data1['date'] = date_try
# data1.columns.values[0] = "Today"
# data1.reset_index(drop = True) 
# data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
# data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
# data1['Time'] = data1['Today'].str.split().str[0] 
# data1['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data1['Time']) - timedelta(hours = 5)

In [1]:
data2 = df[1]
data2.columns.values[0] = "Today"
data2.reset_index(drop = True)
data2['Today'] = data2['Today']
# data2['Today'] = data2['Today'].str.replace("PM", "PM ", regex = True)
# data2['Time'] = data2['Today'].str.split().str[0]
# data2['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data2['Time']) - timedelta(hours = 5) + timedelta(days = 1)
# data2['date'] = data2['datetime1'].dt.date
data2

NameError: name 'df' is not defined

In [15]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)
data1 = df[0]
date_try = str(year) + ' ' + data1.columns[0]
date_try = pd.to_datetime(date_try, errors = 'coerce', format = '%Y %a %b %dth')
data1['date'] = date_try
data1.columns.values[0] = "Today"
data1.reset_index(drop = True) 
data1['Today'] = data1['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)
data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
data1['Time'] = data1['Today'].str.split().str[0] 
data1['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data1['Time']) - timedelta(hours = 5)

# could maybe filter out only that days games for the odds
data2 = df[1]
data2.columns.values[0] = "Today"
data2.reset_index(drop = True)
data2['Today'] = data2['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)
data2['Today'] = data2['Today'].str.replace("AM", "AM ", regex = True)
data2['Today'] = data2['Today'].str.replace("PM", "PM ", regex = True)
data2['Time'] = data2['Today'].str.split().str[0]
data2['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' ' + data2['Time']) - timedelta(hours = 5) + timedelta(days = 1)
data2['date'] = data2['datetime1'].dt.date
data2

data = data1.append(data2).reset_index(drop = True)
data['SPREAD'] = data['SPREAD'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[2:]
data['Today'] = data['Today'].str.split().str[1:2]
data['Today'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in data['Today']])
data['SPREAD'] = data['SPREAD'].str.replace("pk", "-1", regex = True)
data['SPREAD'] = data['SPREAD'].str.replace("+", "", regex = True)
data.columns = data.columns.str.lower()
data = data[['today', 'spread', 'total', 'moneyline', 'date', 'datetime1']]
data = data.rename(columns={data.columns[0]: 'team'})
data = data.query('date == date.min()')

In [7]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)
data1 = df[1]
data1.columns.values[0] = "Today"
data1.reset_index(drop = True)
data1['Today'] = data1['Today'].str.replace("LA Clippers", "LAC Clippers", regex = True)

In [14]:
data['date'].min()

Timestamp('2021-10-19 00:00:00')

In [10]:
time1 = "2021-10-17" + " 11:40PM"
time1 = pd.to_datetime(time1) - timedelta(hours = 5)

In [48]:
data1['datetime1'] = pd.to_datetime(date_try.strftime('%Y-%m-%d') + ' 11:40PM') - timedelta(hours = 5)

In [34]:
date_try = data1['try'].drop_duplicates()[0]
odds['blah'] = date_try

In [3]:
url = "https://sportsbook.draftkings.com/leagues/basketball/88670846?category=game-lines&subcategory=game"
df = pd.read_html(url)
data1 = df[0]
date_try = str(year) + ' ' + data1.columns[0]
date_try = pd.to_datetime(date_try, errors = 'coerce', format = '%Y %a %b %dth')
data1['new_date_2'] = date_try
data1.columns.values[0] = "Today"
data1.reset_index(drop = True) 
data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
data1['Time'] = data1['Today'].str.split().str[0] 
data1['date'] = str(datetime.now().date())
data1['datetime1'] = data1['date'] + ' ' + data1['Time']
data1['datetime1'] = pd.to_datetime(data1['datetime1'], format = "%Y-%m-%d %I:%M%p") - timedelta(hours = 5)
data1['new_date'] = data1['datetime1'].dt.date
data1

KeyError: 'Today'

In [4]:
data1

Unnamed: 0,Today,SPREAD,TOTAL,MONEYLINE,new_date_2
0,11:40PMBKN Nets,pk-110,O 239.5-110,-110,2021-10-19
1,11:40PMMIL Bucks,pk-110,U 239.5-110,-110,2021-10-19


In [56]:
data1 = df[0]
data2 = df[1]
data3 = df[2]

In [15]:
# data 1 is today's date'
# data2 give tmmws date.
# THEN -5 hrs from there
data = df[0]
data.columns.values[0] = "Today"
data.reset_index(drop = True)
data['Today'] = data['Today'].str.replace("AM", "AM ", regex = True)
data['Today'] = data['Today'].str.replace("PM", "PM ", regex = True)
data['Time'] = data['Today'].str.split().str[0] 
data['date'] = str(datetime.now().date())
data['datetime1'] = data['date'] + ' ' + data['Time']
data['datetime1'] = pd.to_datetime(data['datetime1'], format = "%Y-%m-%d %I:%M%p") - timedelta(hours = 5)
data['new_date'] = data['datetime1'].dt.date
data

Unnamed: 0,Today,SPREAD,TOTAL,MONEYLINE,Time,date,datetime1,new_date
0,11:40PM BKN Nets,pk-110,O 239.5-110,-110,11:40PM,2021-10-02,2021-10-02 18:40:00,2021-10-02
1,11:40PM MIL Bucks,pk-110,U 239.5-110,-110,11:40PM,2021-10-02,2021-10-02 18:40:00,2021-10-02


In [57]:
data = df[1]
data.columns.values[0] = "Today"
data.reset_index(drop = True)
data['Today'] = data['Today'].str.replace("AM", "AM ", regex = True)
data['Today'] = data['Today'].str.replace("PM", "PM ", regex = True)
data['Time'] = data['Today'].str.split().str[0]
data['date'] = str(datetime.now().date() + timedelta(days = 1)) 
data['datetime1'] = data['date'] + ' ' + data['Time']
data['datetime1'] = pd.to_datetime(data['datetime1'], format = "%Y-%m-%d %I:%M%p") - timedelta(hours = 5)
data['new_date'] = data['datetime1'].dt.date
data

Unnamed: 0,Today,SPREAD,TOTAL,MONEYLINE,Time,date,datetime1,new_date
0,2:10AM GS Warriors,+4.5-110,O 229.5-110,170,2:10AM,2021-10-03,2021-10-02 21:10:00,2021-10-02
1,2:10AM LA Lakers,-4.5-110,U 229.5-110,-200,2:10AM,2021-10-03,2021-10-02 21:10:00,2021-10-02
2,11:10PM CHI Bulls,-2.5-110,O 221.5-110,-140,11:10PM,2021-10-03,2021-10-03 18:10:00,2021-10-03
3,11:10PM DET Pistons,+2.5-110,U 221.5-110,120,11:10PM,2021-10-03,2021-10-03 18:10:00,2021-10-03
4,11:10PM IND Pacers,+2-110,O 227.5-110,110,11:10PM,2021-10-03,2021-10-03 18:10:00,2021-10-03
5,11:10PM CHA Hornets,-2-110,U 227.5-110,-130,11:10PM,2021-10-03,2021-10-03 18:10:00,2021-10-03
6,11:40PM WAS Wizards,+3-110,O 218.5-110,135,11:40PM,2021-10-03,2021-10-03 18:40:00,2021-10-03
7,11:40PM TOR Raptors,-3-110,U 218.5-110,-155,11:40PM,2021-10-03,2021-10-03 18:40:00,2021-10-03
8,11:40PM BOS Celtics,+1-110,O 218.5-110,100,11:40PM,2021-10-03,2021-10-03 18:40:00,2021-10-03
9,11:40PM NY Knicks,-1-110,U 218.5-110,-120,11:40PM,2021-10-03,2021-10-03 18:40:00,2021-10-03


In [7]:
data1 = df[0]
data1.columns.values[0] = "Today"
data1.reset_index(drop = True)
data1['Today'] = data1['Today'].str.replace("AM", "AM ", regex = True)
data1['Today'] = data1['Today'].str.replace("PM", "PM ", regex = True)
data1['Time'] = data1['Today'].str.split().str[0] 
data1['date'] = str(datetime.now().date())
data1['datetime1'] = data1['date'] + ' ' + data1['Time']
data1['datetime1'] = pd.to_datetime(data1['datetime1'], format = "%Y-%m-%d %I:%M%p") - timedelta(hours = 5)
data1['new_date'] = data1['datetime1'].dt.date
data1

data2 = df[1]
data2.columns.values[0] = "Today"
data2.reset_index(drop = True)
data2['Today'] = data2['Today'].str.replace("AM", "AM ", regex = True)
data2['Today'] = data2['Today'].str.replace("PM", "PM ", regex = True)
data2['Time'] = data2['Today'].str.split().str[0]
data2['date'] = str(datetime.now().date() + timedelta(days = 1)) 
data2['datetime1'] = data2['date'] + ' ' + data2['Time']
data2['datetime1'] = pd.to_datetime(data2['datetime1'], format = "%Y-%m-%d %I:%M%p") - timedelta(hours = 5)
data2['new_date'] = data2['datetime1'].dt.date
data2

data = data1.append(data2).reset_index(drop = True)
data['SPREAD'] = data['SPREAD'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[2:]
data['Today'] = data['Today'].str.split().str[1:2]
data['Today'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in data['Today']])
data['SPREAD'] = data['SPREAD'].str.replace("pk", "-1", regex = True)
data['SPREAD'] = data['SPREAD'].str.replace("+", "", regex = True)
data.columns = data.columns.str.lower()
data = data[['today', 'spread', 'total', 'moneyline', 'datetime1', 'new_date']]
data = data.rename(columns={data.columns[0]: 'team', data.columns[4]: 'time', data.columns[5]: 'date'})

In [22]:
data1 = df[0]
data2 = df[1]
data1.columns.values[0] = "Today"
data2.columns.values[0] = "Today"
data = data1.append(data2).reset_index(drop = True)
data
data['SPREAD'] = data['SPREAD'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[2:]
data.reset_index(drop = True)


data['Today'] = data['Today'].str.replace("AM", "AM ", regex = True)
data['Today'] = data['Today'].str.replace("PM", "PM ", regex = True)
data['Time'] = data['Today'].str.split().str[0]
data['newTime'] = pd.to_datetime(data['Time']) - timedelta(hours = 5)
data['finaltime'] = data['newTime'].dt.time
data['Today'] = data['Today'].str.split().str[1:2]
data['Today'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in data['Today']])
data['SPREAD'] = data['SPREAD'].str.replace("pk", "-1", regex = True)
data['SPREAD'] = data['SPREAD'].str.replace("+", "", regex = True)
data.columns = data.columns.str.lower()
data = data[['today', 'spread', 'total', 'moneyline', 'finaltime']]
data = data.rename(columns={data.columns[0]: 'team', data.columns[4]: 'time'})
data.head(12)

KeyError: 'Today'

In [41]:
data1 = df[0]
data2 = df[1]
data1.columns.values[0] = "Today"
data2.columns.values[0] = 'Today'
# data = data1.append(data2)
# data
data['SPREAD'] = data['SPREAD'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[:-4]
data['TOTAL'] = data['TOTAL'].str[2:]
# data.reset_index(drop = True)
# data

# data['Today'] = data['Today'].str.replace("AM|PM", " ")
# data['Today'] = data['Today'].str.split().str[1:2]
# data['Today'] = pd.DataFrame([str(line).strip('[').strip(']').replace("'","") for line in data['Today']])
# data = data.rename(columns = {"Today": "team", "SPREAD": "spread", "TOTAL": "total_pts", "MONEYLINE": "moneyline"})

In [42]:
data

Unnamed: 0,Today,SPREAD,TOTAL,MONEYLINE
0,11:40PMBKN Nets,,,-110
1,11:40PMMIL Bucks,,,-110
0,2:10AMGS Warriors,,,170
1,2:10AMLA Lakers,,,-200
2,11:10PMCHI Bulls,,,-140
3,11:10PMDET Pistons,,,120
4,11:10PMIND Pacers,,,110
5,11:10PMCHA Hornets,,,-130
6,11:40PMWAS Wizards,,,135
7,11:40PMTOR Raptors,,,-155


In [10]:
conn = sql_connection()
stats = get_player_stats()
boxscores = get_boxscores(month = month, day = yesterday, year = year)
injury_data = get_injuries()
transactions = get_transactions()
adv_stats = get_advanced_stats()
odds = get_odds()

SQL Connection Successful
General Stats Function Successful, retrieving 731 updated rows
Box Score Function Successful, retrieving 17 rows for 2021-08-25
Injury Function Successful, retrieving 17 rows
Transactions Function Successful, retrieving 823 rows
Advanced Stats Function Successful, retrieving updated data for 30 Teams
Odds Function Failed for Today's Games


In [312]:
def write_to_sql(data, table_type):
    data_name = [ k for k,v in globals().items() if v is data][0]
    if len(data) == 0:
        print(data_name + " Failed, not writing to SQL")
        logging.info(data_name + " Failed, not writing to SQL")
    else:
        # ^ this disgusting monstrosity is to get the name of the -fucking- dataframe lmfao
        data.to_sql(con = conn, name = ("aws_" + data_name + "_table"), index = False, if_exists = table_type)
        print("Writing aws_" + data_name + "_table to SQL")
        logging.info("Writing " + data_name + " table to SQL")

In [314]:
conn = sql_connection()
write_to_sql(pbp_list, "append")

SQL Connection Successful
Writing pbp_list table to SQL


In [2]:
logs = pd.read_csv('example.log', sep=r'\\t', engine='python', header = None)
logs = logs.rename(columns = {0 : "errors"})
logs = logs.query("errors.str.contains('Failed')", engine = "python")

In [62]:
len(logs)

4

In [63]:
def send_email_function():
    try:
        if len(logs) > 0:
            print(logs)
            # Send email
        elif len(logs) == 0:
            print('No Errors!')
            ## DONT SEND EMAIL
    except ValueError:
        print('oof')

In [98]:
send_email_function()

                                               errors
24  08/25/2021 09:05:26 PM Odds Function Failed fo...


In [None]:
write_to_sql(stats, "replace")
write_to_sql(boxscores, "append")
write_to_sql(injury_data, "append")
write_to_sql(transactions, "replace")
write_to_sql(adv_stats, "replace")
write_to_sql(odds, "append")

In [12]:
stats = get_player_stats()

General Stats Function Successful, retrieving 731 updated rows


In [64]:
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

def sendEmail():
    email = os.environ.get("USER_EMAIL") # the email where you sent the email
    password = os.environ.get("USER_PW")
    send_to_email = os.environ.get("USER_EMAIL") # for whom
    message = '''\
<h3>sup hoe here are the errors.</h3>
                   {}'''.format(logs.to_html())

    msg = MIMEMultipart()
    msg["From"] = email
    msg["To"] = send_to_email
    msg["Subject"] = str(len(logs)) +" Alert Fails for " + str(today) + ' Python NBA Web Scrape'
    msg.attach(MIMEText(message, 'html'))

    server = smtplib.SMTP("smtp.gmail.com", 587)
    server.starttls()
    server.login(email, password)
    text = msg.as_string()
    server.sendmail(email, send_to_email, text)
    server.quit()

In [65]:
sendEmail()

In [5]:
def send_aws_email():
    sender = os.environ.get("USER_EMAIL")
    recipient = os.environ.get("USER_EMAIL")
    aws_region = 'us-east-1'
    subject = str(len(logs)) +" Alert Fails for " + str(today) + ' Python NBA Web Scrape'
    body_html = message = '''\
<h3>sup hoe here are the errors.</h3>
                   {}'''.format(logs.to_html())

    charset = "UTF-8"
    client = boto3.client('ses',region_name=aws_region)
    try:
    #Provide the contents of the email.
        response = client.send_email(
            Destination={
                'ToAddresses': [
                    recipient,
                ],
            },
            Message={
                'Body': {
                    'Html': {
                        'Charset': charset,
                        'Data': body_html,
                    },
                    'Text': {
                        'Charset': charset,
                        'Data': body_html,
                    },
                },
                'Subject': {
                    'Charset': charset,
                    'Data': subject,
                },
            },
            Source = sender
            # If you are not using a configuration set, comment or delete the
            # following line
            # ConfigurationSetName=CONFIGURATION_SET,
    )
# Display an error if something goes wrong.	
    except ClientError as e:
        print(e.response['Error']['Message'])
    else:
        print("Email sent! Message ID:"),
        print(response['MessageId'])

In [6]:
send_aws_email()

Email sent! Message ID:
0100017bb2099fb1-62c47da2-d260-4268-8a6e-b150c0e6b787-000000


In [51]:
def send_email_function():
    try:
        if len(logs) > 0:
            print('Sending Email')
            sendEmail()
        elif len(logs) == 0:
            print('No Errors!')
            ## DONT SEND EMAIL
    except ValueError:
        print('oof')

In [57]:

send_email_function()

No Errors!


In [None]:
### Testing


In [18]:
df = pd.read_html('https://www.basketball-reference.com/contracts/players.html', header = 1)[0]

In [19]:

df = df.rename(columns={df.columns[2]: 'team', df.columns[3]: 'season_salary'})
df = df[['Player', 'team', 'season_salary']]
df.columns = df.columns.str.lower()
df = df.drop_duplicates()
df = df.query('season_salary != "Salary" & season_salary != "2021-22"')
df['season_salary'] = df['season_salary'].str.replace(',', "", regex = True)
df['season_salary'] = df['season_salary'].str.replace('$', "", regex = True)
df['team'] = df['team'].str.replace("PHO", "PHX")
df['team'] = df['team'].str.replace("CHO", "CHA")
df['team'] = df['team'].str.replace("BRK", "BKN")
df['season_salary'] = pd.to_numeric(df['season_salary'])
df['player'] = df['player'].str.normalize('NFKD').str.encode('ascii', errors = 'ignore').str.decode('utf-8')
df

Unnamed: 0,player,team,season_salary
0,Stephen Curry,GSW,45780966.0
1,John Wall,HOU,44310840.0
2,Russell Westbrook,LAL,44211146.0
3,James Harden,BKN,43848000.0
4,Damian Lillard,POR,43750000.0
...,...,...,...
611,Patrick Patterson,OKC,3378757.0
612,Shaun Livingston,GSW,1333332.0
616,Admiral Schofield,OKC,1789065.0
617,Troy Williams,HOU,122741.0


In [20]:
df.to_sql(con = conn, name = 'aws_contracts_source', if_exists = 'replace', index = False)

In [35]:
df = df.rename(columns={df.columns[0]: 'Time',

In [28]:
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Salary,Salary,Salary,Salary,Salary,Salary,Unnamed: 9_level_0,Unnamed: 10_level_0
Unnamed: 0_level_1,Player,Tm,2021-22,2022-23,2023-24,2024-25,2025-26,2026-27,Signed Using,Guaranteed
0,Stephen Curry,GSW,"$45,780,966","$48,070,014","$51,915,615","$55,761,216","$59,606,817",,Bird Rights,"$261,134,628"
1,John Wall,HOU,"$44,310,840","$47,366,760",,,,,Bird Rights,"$44,310,840"
2,Russell Westbrook,LAL,"$44,211,146","$47,063,478",,,,,Bird Rights,"$44,211,146"
3,James Harden,BRK,"$43,848,000","$46,872,000",,,,,Bird Rights,"$43,848,000"
4,Damian Lillard,POR,"$43,750,000","$47,250,000","$50,750,000","$54,250,000",,,1st Round Pick,"$196,000,000"
...,...,...,...,...,...,...,...,...,...,...
620,Shaun Livingston,GSW,"$1,333,332",,,,,,,"$1,333,332"
621,Shaun Livingston,GSW,"$1,333,332",,,,,,,"$1,333,332"
622,Admiral Schofield,OKC,"$1,789,065",,,,,,,"$300,000"
623,Troy Williams,HOU,"$122,741","$122,741",,,,,,"$245,482"
