In [2]:
import os
import sys
from datetime import datetime, timezone, timedelta
from urllib.request import urlopen
import logging
from bs4 import BeautifulSoup
from sqlalchemy import exc, create_engine
import pymysql
import numpy as np
import pandas as pd
import boto3
from botocore.exceptions import ClientError

# Send every record (DEBUG and above) to example.log, stamped as MM/DD/YYYY hh:mm:ss AM/PM.
logging.basicConfig(filename='example.log', level=logging.DEBUG, format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p')
logging.info('Starting Logging Function')

# Read the clock exactly once and derive everything else from that reading.
# Calling datetime.now() separately for each value lets the pieces disagree
# when the cell happens to run across midnight.
today = datetime.now().date()
yesterday = today - timedelta(1)
# Calendar parts of *yesterday* (not today).
day = yesterday.day
month = yesterday.month
year = yesterday.year
season_type = 'Regular Season'

In [None]:
def sql_connection():
    """Create the SQLAlchemy engine for the PostgreSQL database named in the environment.

    Reads RDS_USER, RDS_PW, IP and RDS_DB from the environment, targets port
    5432 and sets the session ``search_path`` to the ``nba_source`` schema.

    Returns:
        The object returned by ``create_engine`` on success. If ``create_engine``
        raises ``exc.SQLAlchemyError``, the exception object itself is returned
        (not raised), so callers that must be sure should check the result type.

    Raises:
        TypeError: if any required environment variable is unset. The type
            matches what string concatenation with ``None`` raised before; the
            message now names the missing variables.
    """
    required = ('RDS_USER', 'RDS_PW', 'IP', 'RDS_DB')
    missing = [name for name in required if os.environ.get(name) is None]
    if missing:
        message = 'SQL Connection Failed, missing environment variables: ' + ', '.join(missing)
        logging.error(message)
        raise TypeError(message)
    user, password, host, database = (os.environ[name] for name in required)

    try:
        connection = create_engine(
            'postgresql+psycopg2://' + user + ':' + password + '@' + host + ':' + '5432' + '/' + database,
            connect_args = {'options': '-csearch_path=nba_source'}, # defining schema to connect to
            echo = False)
        # NOTE(review): this only shows that create_engine() returned; whether a
        # connection has actually been opened at this point should be confirmed.
        logging.info('SQL Connection Successful')
        print('SQL Connection Successful')
        return(connection)
    except exc.SQLAlchemyError as e:
        # logging formats its message with %-style arguments, so the exception
        # needs a %s placeholder; without one the record cannot be formatted
        # and the error text never reaches the log.
        logging.error('SQL Connection Failed, Error: %s', e)
        print('SQL Connection Failed, Error:', e)
        return(e)
# Build the shared connection object once; the to_sql cells pass it as `con`.
# sql_connection() returns the caught exception object (it does not raise) when
# create_engine fails, so `conn` is not guaranteed to be a usable engine.
conn = sql_connection()

In [None]:
def schedule_scraper(month, season = 2022):
    """Scrape one month of the NBA schedule and add its rows to ``schedule_df``.

    Args:
        month: month name as it appears in the page URL (e.g. 'october').
        season: season identifier used in the page URL
            (``.../leagues/NBA_<season>_games-<month>.html``). Defaults to 2022,
            the value that used to be hard-coded.

    Side effects:
        Rebinds the module-level ``schedule_df`` to its previous contents plus
        the scraped rows. ``schedule_df`` must already exist as a DataFrame.

    Returns:
        None on success; an empty list if a ValueError is raised while
        scraping. Any other exception (for example one raised by ``urlopen``)
        propagates to the caller.
    """
    global schedule_df
    try:
        url = "https://www.basketball-reference.com/leagues/NBA_{}_games-{}.html".format(season, month)
        # Close the HTTP response as soon as it has been parsed.
        # NOTE(review): no parser is named here, so bs4 chooses one itself;
        # pinning a parser would make results independent of what is installed.
        with urlopen(url) as html:
            soup = BeautifulSoup(html)

        table_rows = soup.findAll('tr')

        # The first <tr> carries the column titles. Positions 6 and 7 are
        # renamed, then the leading title is dropped because the leading <th>
        # of each data row is collected separately below as 'Date'.
        headers = [th.getText() for th in table_rows[0].findAll('th')]
        headers[6] = 'boxScoreLink'
        headers[7] = 'isOT'
        headers = headers[1:]

        rows = table_rows[1:]
        # Each data row: the first <th> is the date, the <td> cells are the game fields.
        date_info = [row.findAll('th')[0].getText() for row in rows]
        game_info = [[td.getText() for td in row.findAll('td')] for row in rows]

        schedule = pd.DataFrame(game_info, columns = headers)
        schedule['Date'] = date_info

        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        schedule_df = pd.concat([schedule_df, schedule])
        # The reported count is the running total in schedule_df, not just this month.
        logging.info(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
        print(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
    except ValueError:
        logging.info("Schedule Scraper Function Failed")
        print("Schedule Scraper Function Failed")
        df = []
        return(df)

# Month pages that are scraped to assemble the schedule.
month_list = ['october', 'november', 'december', 'january', 'february', 'march', 'april']
# Start from an empty frame so re-running this cell does not stack duplicate rows.
schedule_df = pd.DataFrame()
# The loop variable is deliberately not named `month`: that module-level name
# already holds yesterday's month number and must not be overwritten with a
# month-name string.
for schedule_month in month_list:
    schedule_scraper(schedule_month)

In [None]:
# Reduce the scraped schedule to the four columns that are loaded, add a parsed
# date column, lower-case every column name and give the columns their final names.
# Done as one chain producing a new frame: assigning a column onto the
# column-subset frame directly triggers pandas' SettingWithCopyWarning.
# Re-running this cell without re-running the scraper raises KeyError, because
# the original column names no longer exist after the rename.
schedule_df = (
    schedule_df[['Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Date']]
    .assign(proper_date = lambda frame: pd.to_datetime(frame['Date']).dt.date)
    .rename(columns = str.lower)
    .rename(columns = {"start (et)": "start_time", "visitor/neutral": "away_team", "home/neutral": "home_team"})
)

In [None]:
# Load the cleaned schedule into the aws_schedule_source table through `conn`.
# if_exists='replace' replaces any existing table of that name rather than
# appending to it; index=False keeps the DataFrame index out of the table.
schedule_df.to_sql(con = conn, name = "aws_schedule_source", if_exists = 'replace', index = False)

In [8]:
def get_contracts():
    """Download the player contracts table from basketball-reference and clean it.

    Reads the first table on the contracts page (second header row as column
    names), keeps the player, team and salary columns, drops duplicate rows and
    rows whose salary cell is "Salary" or "2021-22", strips "$" and "," from the
    salary and converts it to a number, remaps three team codes (PHO->PHX,
    CHO->CHA, BRK->BKN) and reduces player names to ASCII.

    Returns:
        DataFrame with columns ``index``, ``player``, ``team``, ``season_salary``.
        ``index`` is the row label each record had before filtering.
    """
    df = pd.read_html('https://www.basketball-reference.com/contracts/players.html', header = 1)[0]
    # Columns are addressed by position: the third becomes the team code and the
    # fourth the salary column.
    df = df.rename(columns={df.columns[2]: 'team', df.columns[3]: 'season_salary'})
    df = df[['Player', 'team', 'season_salary']]
    df.columns = df.columns.str.lower()
    df = df.drop_duplicates()
    # Presumably these two literals identify header rows repeated inside the
    # table body; "2021-22" is tied to one season - TODO confirm and update
    # when the label changes.
    # NOTE(review): reset_index() without drop=True keeps the old row labels as
    # an extra 'index' column in the result; left as is so the output shape
    # does not change.
    df = df.query('season_salary != "Salary" & season_salary != "2021-22"').reset_index()
    # Remove the symbols as literal text. With regex=True, '$' is the
    # end-of-string anchor on pandas >= 2.0, so the dollar sign was never
    # removed and pd.to_numeric below failed.
    for symbol in (',', '$'):
        df['season_salary'] = df['season_salary'].str.replace(symbol, "", regex = False)
    # Team-code substitutions are plain-text replacements as well.
    for old_code, new_code in (("PHO", "PHX"), ("CHO", "CHA"), ("BRK", "BKN")):
        df['team'] = df['team'].str.replace(old_code, new_code, regex = False)
    df['season_salary'] = pd.to_numeric(df['season_salary'])
    # Decompose accented characters, then drop everything that is not ASCII.
    df['player'] = df['player'].str.normalize('NFKD').str.encode('ascii', errors = 'ignore').str.decode('utf-8')
    return(df)

In [9]:
# Fetch and clean the contracts table; this performs a live HTTP request via pd.read_html.
contracts = get_contracts()

In [None]:
# Load the contracts into the aws_contracts_source table through `conn`.
# if_exists='replace' replaces any existing table of that name rather than
# appending to it; index=False keeps the DataFrame index out of the table.
contracts.to_sql(con = conn, name = "aws_contracts_source", if_exists = 'replace', index = False)