In [2]:
import os
import logging
import requests
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import praw
from bs4 import BeautifulSoup
from sqlalchemy import exc, create_engine
import boto3
from botocore.exceptions import ClientError
import twint
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Read the clock once so that every derived value below refers to the same
# instant; separate datetime.now() calls could straddle midnight and disagree.
todaytime = datetime.now()
today = todaytime.date()
yesterday = today - timedelta(1)
# Calendar components of *yesterday* (not today).
day = yesterday.day
month = yesterday.month
year = yesterday.year

# Boundaries used to label the current part of the season.
PLAY_IN_START = datetime(2022, 4, 11).date()
PLAYOFFS_START = datetime(2022, 4, 16).date()

if today < PLAY_IN_START:
    season_type = "Regular Season"
elif today < PLAYOFFS_START:
    # Reaching this branch already implies today >= PLAY_IN_START.
    season_type = "Play-In"
else:
    season_type = "Playoffs"

In [None]:
def write_to_sql(con, data, table_type, data_name=None):
    """
    Write a pandas DataFrame to the SQL table named aws_{data_name}_source.

    Args:
        con: Connection/engine object handed straight to DataFrame.to_sql
        data: The Pandas DataFrame to store in SQL
        table_type: Passed to to_sql as `if_exists`, i.e. whether to replace
            or append to an existing SQL Table under that name
        data_name: Optional name used to build the table name. When omitted,
            the name of the global variable bound to `data` is used.
    Returns:
        None when the frame was written or skipped because it is empty.
        If anything fails, the error is logged and the exception object is
        returned rather than raised.
    """
    try:
        if data_name is None:
            # Recover the variable name by identity lookup in the module
            # globals. Underscore-prefixed names are skipped so that IPython
            # output-history aliases (`_`, `_12`, ...) pointing at the same
            # frame cannot be picked up as the table name.
            candidates = [
                name
                for name, value in globals().items()
                if value is data and not name.startswith("_")
            ]
            if not candidates:
                raise LookupError(
                    "data is not bound to a global variable, "
                    "so no table name can be derived; pass data_name explicitly"
                )
            data_name = candidates[0]

        if len(data) == 0:
            logging.info(f"{data_name} is empty, not writing to SQL")
        else:
            data.to_sql(
                con=con,
                name=f"aws_{data_name}_source",
                index=False,
                if_exists=table_type,
            )
            logging.info(f"Writing aws_{data_name}_source to SQL")
    except Exception as error:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        logging.error(f"SQL Write Script Failed, {error}")
        return error

def sql_connection(rds_schema: str):
    """
    Build a SQLAlchemy engine for the PostgreSQL database described by the
    RDS_USER, RDS_PW, IP and RDS_DB environment variables (port 5432).

    Args:
        rds_schema: Schema name placed on the connection's search_path
    Returns:
        The object returned by create_engine. If create_engine raises a
        SQLAlchemyError, the error is logged and the exception object is
        returned instead of being raised.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote_plus

    env_names = ("RDS_USER", "RDS_PW", "IP", "RDS_DB")
    settings = {name: os.environ.get(name) for name in env_names}
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        logging.warning(f"Missing environment variables: {', '.join(missing)}")

    # Credentials are URL-escaped so characters such as '@', ':' or '/' in the
    # user name or password cannot be mistaken for URL delimiters.
    RDS_USER = quote_plus(str(settings["RDS_USER"]))
    RDS_PW = quote_plus(str(settings["RDS_PW"]))
    RDS_IP = settings["IP"]
    RDS_DB = settings["RDS_DB"]
    try:
        connection = create_engine(
            f"postgresql+psycopg2://{RDS_USER}:{RDS_PW}@{RDS_IP}:5432/{RDS_DB}",
            connect_args={"options": f"-csearch_path={rds_schema}"},
            # defining schema to connect to
            echo=False,
        )
        logging.info(f"SQL Connection to schema: {rds_schema} Successful")
        return connection
    except exc.SQLAlchemyError as e:
        logging.error(f"SQL Connection to schema: {rds_schema} Failed, Error: {e}")
        return e

# Shared connection object for this notebook; the schema name is read from the
# RDS_SCHEMA environment variable. Note that sql_connection returns the
# exception object (instead of raising) if engine creation fails.
conn = sql_connection(os.environ.get("RDS_SCHEMA"))

In [2]:
def get_contracts():
    """
    Scrape the player contracts table from basketball-reference and tidy it.

    Returns:
        DataFrame with the columns `player`, `team` and `season_salary`:
        - player: ASCII-folded name with a trailing generational suffix
          (Jr., Sr., II, III, IV) removed
        - team: team abbreviation, with PHO/CHO/BRK rewritten to PHX/CHA/BKN
        - season_salary: numeric salary taken from the table's fourth column
    """
    raw = pd.read_html(
        'https://www.basketball-reference.com/contracts/players.html', header = 1
    )[0]

    # Remember the label of the salary column before renaming it. Rows whose
    # salary cell equals that label (or "Salary") are dropped below —
    # presumably header rows repeated inside the table body. "2021-22" is kept
    # in the list so the previous hard-coded filter still applies.
    season_label = str(raw.columns[3])
    header_values = ["Salary", "2021-22", season_label]

    df = raw.rename(columns={raw.columns[2]: 'team', raw.columns[3]: 'season_salary'})
    df = df[['Player', 'team', 'season_salary']]
    df.columns = df.columns.str.lower()
    df = df.drop_duplicates()
    df = df[~df['season_salary'].isin(header_values)].copy()

    # Literal (non-regex) replacement: as a regex, '$' is an end-of-string
    # anchor and would leave the dollar sign in place.
    df['season_salary'] = pd.to_numeric(
        df['season_salary']
        .str.replace(',', "", regex = False)
        .str.replace('$', "", regex = False)
    )

    team_renames = {"PHO": "PHX", "CHO": "CHA", "BRK": "BKN"}
    for old_code, new_code in team_renames.items():
        df['team'] = df['team'].str.replace(old_code, new_code, regex = False)

    # Fold accented characters to ASCII, then strip one trailing suffix.
    # Anchoring at the end of the name keeps " II" from eating the first two
    # letters of " III", and the escaped dot only matches a literal period.
    df['player'] = (
        df['player']
        .str.normalize('NFKD')
        .str.encode('ascii', errors = 'ignore')
        .str.decode('utf-8')
        .str.replace(r"\s+(?:Jr\.?|Sr\.?|III|II|IV)$", "", regex = True)
    )

    # drop=True so the old row labels do not end up as an `index` column.
    return df.reset_index(drop = True)

# Run the contracts scrape once and keep the tidy frame under the name
# `contracts`; it is written to SQL in a later cell.
contracts = get_contracts()

In [None]:
# Write the contracts frame to the aws_contracts_source table through `conn`,
# replacing any existing table of that name; the DataFrame index is not written.
contracts.to_sql(con = conn, name = "aws_contracts_source", if_exists = 'replace', index = False)

In [3]:
def schedule_scraper(month):
    """
    Scrape one month of the 2022 NBA schedule from basketball-reference and
    add its rows to the module-level `schedule_df` DataFrame.

    NOTE: a later cell in this notebook defines another `schedule_scraper`
    taking (year, month_list); once that cell runs it replaces this function.

    Args:
        month: Month name used in the page URL (e.g. 'october')
    Returns:
        None on success (the result is stored in the global `schedule_df`).
        On any failure the error is logged and printed and an empty list is
        returned; `schedule_df` is left untouched.
    """
    global schedule_df
    try:
        url = f"https://www.basketball-reference.com/leagues/NBA_2022_games-{month}.html"
        # Bound the request so a stalled connection cannot hang the notebook.
        html = requests.get(url, timeout=30).content
        # Name the parser explicitly so results do not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(html, "html.parser")

        table_rows = soup.find_all('tr')
        headers = [th.getText() for th in table_rows[0].find_all('th')]
        # Give explicit names to header positions 6 and 7
        # (presumably unlabeled on the page — TODO confirm).
        headers[6] = 'boxScoreLink'
        headers[7] = 'isOT'
        # The first header is dropped here; the th text of each row is added
        # back below as the 'Date' column.
        headers = headers[1:]

        body_rows = table_rows[1:]
        date_info = [[th.getText() for th in row.find_all('th')][0] for row in body_rows]
        game_info = [[td.getText() for td in row.find_all('td')] for row in body_rows]

        schedule = pd.DataFrame(game_info, columns = headers)
        schedule['Date'] = date_info

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        schedule_df = pd.concat([schedule_df, schedule])
        logging.info(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
        print(f'Schedule Function Completed for {month}, retrieving {len(schedule_df)} rows')
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        logging.error(f"Schedule Scraper Function Failed, {e}")
        print(f"Schedule Scraper Function Failed, {e}")
        return []

month_list = ['october', 'november', 'december', 'january', 'february', 'march', 'april']
# schedule_scraper(month) accumulates its rows into this module-level frame.
schedule_df = pd.DataFrame()
# The loop variable is deliberately not called `month`: that name already
# holds yesterday's month number from the first cell and must not be clobbered.
for month_name in month_list:
    schedule_scraper(month_name)

# Keep only the columns used downstream; .copy() makes the column assignment
# below operate on an independent frame instead of a slice.
schedule_df = schedule_df[['Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Date']].copy()
schedule_df['proper_date'] = pd.to_datetime(schedule_df['Date']).dt.date
schedule_df.columns = schedule_df.columns.str.lower()
schedule_df = schedule_df.rename(columns = {"start (et)": "start_time", "visitor/neutral": "away_team", "home/neutral": "home_team"})

Schedule Function Completed for october, retrieving 93 rows
Schedule Function Completed for november, retrieving 318 rows
Schedule Function Completed for december, retrieving 527 rows
Schedule Function Completed for january, retrieving 758 rows
Schedule Function Completed for february, retrieving 921 rows
Schedule Function Completed for march, retrieving 1150 rows
Schedule Function Completed for april, retrieving 1230 rows


In [25]:
from typing import List
# month_list_df = ['april', 'may', 'june']
def _parse_schedule_page(soup) -> pd.DataFrame:
    """Build the raw schedule frame for one parsed monthly page.

    Raises IndexError when the page has no table rows (or a body row has no
    th cell); the caller treats that as "no data for this month".
    """
    table_rows = soup.find_all('tr')
    headers = [th.getText() for th in table_rows[0].find_all('th')]
    # Give explicit names to header positions 6 and 7
    # (presumably unlabeled on the page — TODO confirm).
    headers[6] = 'boxScoreLink'
    headers[7] = 'isOT'
    # The first header is dropped; each row's th text becomes 'Date' below.
    headers = headers[1:]

    body_rows = table_rows[1:]
    dates = [[th.getText() for th in row.find_all('th')][0] for row in body_rows]
    games = [[td.getText() for td in row.find_all('td')] for row in body_rows]

    schedule = pd.DataFrame(games, columns = headers)
    schedule['Date'] = dates
    return schedule


def _format_schedule(monthly_frames) -> pd.DataFrame:
    """Combine per-month raw frames into the final column layout."""
    if not monthly_frames:
        # Nothing scraped: return an empty frame that still has the expected columns.
        return pd.DataFrame(columns = ["start_time", "away_team", "home_team", "date", "proper_date"])

    kept_columns = ['Start (ET)', 'Visitor/Neutral', 'Home/Neutral', 'Date']
    # Select the kept columns per month before concatenating so unrelated
    # (possibly duplicated) column labels never take part in the concat.
    schedule_df = pd.concat([frame[kept_columns] for frame in monthly_frames])
    schedule_df['proper_date'] = pd.to_datetime(schedule_df['Date']).dt.date
    schedule_df.columns = schedule_df.columns.str.lower()
    return schedule_df.rename(columns = {"start (et)": "start_time", "visitor/neutral": "away_team", "home/neutral": "home_team"})


def schedule_scraper(year: str, month_list: List[str]) -> pd.DataFrame:
    """
    Web Scrape Function to scrape Schedule data by iterating through a list of months

    Args:
        year (str) - The year to scrape

        month_list (list) - List of full-month names to scrape

    Returns:
        DataFrame with the columns start_time, away_team, home_team, date and
        proper_date. Scraping stops at the first month whose page yields no
        table data, and the months completed so far are returned (an empty
        frame with those columns if there were none). On any other failure
        the error is logged and printed and an empty DataFrame is returned.

    """
    completed_months = []
    monthly_frames = []
    try:
        for month_name in month_list:
            url = f"https://www.basketball-reference.com/leagues/NBA_{year}_games-{month_name}.html"
            # Bound the request so a stalled connection cannot hang the notebook.
            html = requests.get(url, timeout=30).content
            # Name the parser explicitly so results do not depend on which
            # optional parser libraries happen to be installed.
            soup = BeautifulSoup(html, "html.parser")

            try:
                schedule = _parse_schedule_page(soup)
            except IndexError:
                logging.info(f"{month_name} currently has no data in basketball-reference, stopping the function and returning data for {' '.join(completed_months)}")
                print(f"{month_name} currently has no data in basketball-reference, stopping the function and returning data for {' '.join(completed_months)}")
                return _format_schedule(monthly_frames)

            logging.info(f'Schedule Function Completed for {month_name}, retrieving {len(schedule)} rows')
            print(f'Schedule Function Completed for {month_name}, retrieving {len(schedule)} rows')
            completed_months.append(month_name)
            monthly_frames.append(schedule)

        schedule_df = _format_schedule(monthly_frames)
        logging.info(f"Schedule Function Completed for {' '.join(completed_months)}, retrieving {len(schedule_df)} total rows")
        print(f"Schedule Function Completed for {' '.join(completed_months)}, retrieving {len(schedule_df)} total rows")
        return schedule_df
    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        logging.error(f"Schedule Scraper Function Failed, {e}")
        print(f"Schedule Scraper Function Failed, {e}")
        # Empty DataFrame keeps the annotated return type; len() is still 0.
        return pd.DataFrame()

# Scrape the April-June 2022 schedule pages with the (year, month_list)
# version of schedule_scraper defined in this cell.
schedule_data = schedule_scraper('2022', ['april', 'may', 'june'])

Schedule Function Completed for april, retrieving 80 rows
may currently has no data in basketball-reference, stopping the function and returning data for april
