In [1]:
import os
import sys
from datetime import datetime, timezone, timedelta
from urllib.request import urlopen
from urllib.error import HTTPError
import logging
from bs4 import BeautifulSoup
from sqlalchemy import exc, create_engine
import pymysql
import numpy as np
import pandas as pd
import boto3
from botocore.exceptions import ClientError

logging.basicConfig(filename='example.log', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info('Starting Logging Function')

today = datetime.now().date()
yesterday = today - timedelta(1)
day = (datetime.now() - timedelta(1)).day
month = (datetime.now() - timedelta(1)).month
year = (datetime.now() - timedelta(1)).year
season_type = 'Regular Season'

In [2]:
def sql_connection():
    try:
        connection = create_engine('postgresql+psycopg2://' + os.environ.get('RDS_USER') + ':' + os.environ.get('RDS_PW') + '@' + os.environ.get('IP') + ':' + '5432' + '/' + os.environ.get('RDS_DB'),
                                    connect_args = {'options': '-csearch_path=nba_source'}, # defining schema to connect to
                     echo = False)
        logging.info('SQL Connection Successful')
        print('SQL Connection Successful')
        return(connection)
    except exc.SQLAlchemyError as e:
        logging.info('SQL Connection Failed, Error:', e)
        print('SQL Connection Failed, Error:', e)
        return(e)
conn = sql_connection()

SQL Connection Successful


In [None]:
def schedule_scraper(month):
    try:
        global schedule_df
        url = "https://www.basketball-reference.com/leagues/NBA_2022_games-{}.html".format(month)
        html = urlopen(url)
        soup = BeautifulSoup(html)

        headers = [th.getText() for th in soup.findAll('tr')[0].findAll('th')]
        headers[6] = 'boxScoreLink'
        headers[7] = 'isOT'
        headers = headers[1:]

        rows = soup.findAll('tr')[1:]
        date_info = [[th.getText() for th in rows[i].findAll('th')]
                for i in range(len(rows))]

        game_info = [[td.getText() for td in rows[i].findAll('td')]
                for i in range(len(rows))]
        date_info = [i[0] for i in date_info]

        schedule = pd.DataFrame(game_info, columns = headers)
        schedule['Date'] = date_info

        schedule_df = schedule_df.append(schedule)
        logging.info(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
        print(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
    except ValueError:
        logging.info("Schedule Scraper Function Failed")
        print("Schedule Scraper Function Failed")
        df = []
        return(df)

month_list = ['october', 'november', 'december', 'january', 'february', 'march', 'april']
schedule_df = pd.DataFrame()
for month in month_list:
    schedule_scraper(month)

In [None]:
schedule_df = schedule_df[['Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Date']]
schedule_df['proper_date'] = pd.to_datetime(schedule_df['Date']).dt.date
schedule_df.columns = schedule_df.columns.str.lower()
schedule_df = schedule_df.rename(columns = {"start (et)": "start_time", "visitor/neutral": "away_team", "home/neutral": "home_team"})

In [None]:
schedule_df.to_sql(con = conn, name = "aws_schedule_source", if_exists = 'replace', index = False)

In [7]:
def get_contracts():
    df = pd.read_html('https://www.basketball-reference.com/contracts/players.html', header = 1)[0]
    df = df.rename(columns={df.columns[2]: 'team', df.columns[3]: 'season_salary'})
    df = df[['Player', 'team', 'season_salary']]
    df.columns = df.columns.str.lower()
    df = df.drop_duplicates()
    df = df.query('season_salary != "Salary" & season_salary != "2021-22"').reset_index()
    df['season_salary'] = df['season_salary'].str.replace(',', "", regex = True)
    df['season_salary'] = df['season_salary'].str.replace('$', "", regex = True)
    df['team'] = df['team'].str.replace("PHO", "PHX")
    df['team'] = df['team'].str.replace("CHO", "CHA")
    df['team'] = df['team'].str.replace("BRK", "BKN")
    df['season_salary'] = pd.to_numeric(df['season_salary'])
    df['player'] = df['player'].str.normalize('NFKD').str.encode('ascii', errors = 'ignore').str.decode('utf-8')
    df = df.reset_index(drop = True)
    return(df)

In [8]:
contracts = get_contracts()

In [9]:
contracts.to_sql(con = conn, name = "aws_contracts_source", if_exists = 'replace', index = False)

In [5]:
url = 'https://www.basketball-reference.com/leagues/NBA_2022_preseason_odds.html'
df = pd.read_html(url)[0]
df

Unnamed: 0,Team,Odds,Unnamed: 2,W-L O/U,Result
0,Brooklyn Nets,230,,56.5,0-1 (under)
1,Los Angeles Lakers,425,,52.5,0-1 (under)
2,Milwaukee Bucks,850,,54.5,1-0 (under)
3,Golden State Warriors,1100,,48.5,1-0 (under)
4,Utah Jazz,1400,,52.5,1-0 (under)
5,Phoenix Suns,1400,,51.5,0-1 (under)
6,Philadelphia 76ers,1600,,50.5,1-0 (under)
7,Los Angeles Clippers,1600,,45.5,0-0 (under)
8,Denver Nuggets,2200,,47.5,1-0 (under)
9,Dallas Mavericks,2800,,48.5,0-0 (under)


In [8]:
df = df.rename(columns = {df.columns[3]: 'predicted'})
df.columns = df.columns.str.lower()
df = df[['team', 'odds', 'predicted']]

In [10]:
df.to_sql(con = conn, name = 'aws_preseason_odds_source', if_exists = 'replace', index = False)

In [5]:
url = 'https://www.basketball-reference.com/leagues/NBA_2022.html'
df = pd.read_html(url)[5]

In [22]:
def get_opp_stats():
    try:
        url = 'https://www.basketball-reference.com/leagues/NBA_2022.html'
        df = pd.read_html(url)[5]
        df = df[['Team', 'FG%', '3P%', '3P', 'PTS']]
        df = df.rename(columns = {df.columns[0]: 'team', df.columns[1]: 'fg_percent_opp', df.columns[2]: 'threep_percent_opp',
                        df.columns[3]: 'threep_made_opp', df.columns[4]: 'ppg_opp'})
        df = df.query('team != "League Average"')
        df = df.reset_index(drop = True)
        return(df)
    except IndexError:
        logging.info("Opp Stats Function Failed for Today's Games")
        print("Opp Stats Function Failed for Today's Games")
        df = []
        return df


In [23]:
df = get_opp_stats()

In [32]:
def get_boxscores(month = month, day = day, year = year):
    url = "https://www.basketball-reference.com/friv/dailyleaders.fcgi?month={}&day={}&year={}&type=all".format(month, day, year)
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")

    try: 
        headers = [th.getText() for th in soup.findAll('tr', limit=2)[0].findAll('th')]
        headers = headers[1:]
        headers[1] = 'Team'
        headers[2] = "Location"
        headers[3] = 'Opponent'
        headers[4] = "Outcome"
        headers[6] = "FGM"
        headers[8] = "FGPercent"
        headers[9] = "threePFGMade"
        headers[10] = "threePAttempted"
        headers[11] = "threePointPercent"
        headers[14] = "FTPercent"
        headers[15] = "OREB"
        headers[16] = "DREB"
        headers[24] = 'PlusMinus'

        rows = soup.findAll('tr')[1:]
        player_stats = [[td.getText() for td in rows[i].findAll('td')]
            for i in range(len(rows))]

        df = pd.DataFrame(player_stats, columns = headers)
        df[['FGM', 'FGA', 'FGPercent', 'threePFGMade', 'threePAttempted', 'threePointPercent', 'OREB', 'DREB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PlusMinus', 'GmSc']] = df[['FGM', 'FGA', 'FGPercent', 'threePFGMade', 'threePAttempted', 'threePointPercent','OREB', 'DREB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS', 'PlusMinus', 'GmSc']].apply(pd.to_numeric)
        df['date'] = str(year) + '-' + str(month) + '-' + str(day)
        df['date'] = pd.to_datetime(df['date'])
        df['Type'] = season_type
        df['Season'] = 2022
        df['Location'] = df['Location'].apply(lambda x: 'A' if x == '@' else 'H')
        df['Team'] = df['Team'].str.replace("PHO", "PHX")
        df['Team'] = df['Team'].str.replace("CHO", "CHA")
        df['Team'] = df['Team'].str.replace("BRK", "BKN")
        df['Opponent'] = df['Opponent'].str.replace("PHO", "PHX")
        df['Opponent'] = df['Opponent'].str.replace("CHO", "CHA")
        df['Opponent'] = df['Opponent'].str.replace("BRK", "BKN")
        df = df.query('Player == Player').reset_index()
        df['Player'] = df['Player'].str.normalize('NFKD').str.encode('ascii', errors = 'ignore').str.decode('utf-8')
        df.columns = df.columns.str.lower()
        logging.info(f'Box Score Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}')
        print(f'Box Score Function Successful, retrieving {len(df)} rows for {year}-{month}-{day}')
        return(df)
    except (ValueError, IndexError, HTTPError) as error:
        logging.info(f"Box Score Function Failed, {error}, no data available for {year}-{month}-{day}")
        print(f"Box Score Function Failed, {error}, no data available for {year}-{month}-{day}")
        df = []
        return(df)

In [33]:
df = get_boxscores()

HTTPError: HTTP Error 404: Not Found