In [2]:
from datetime import datetime, timedelta
import os
import uuid
from typing import List

import awswrangler as wr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import psycopg2
from sqlalchemy import exc, create_engine

# Timestamp exactly one day before the moment this cell is executed.
yesterday = datetime.now() - timedelta(days=1)

# Step the working directory up one level.
# NOTE(review): the move is relative to the current directory, so re-running
# this cell climbs another level each time — confirm the intended root.
os.chdir(os.pardir)

# Last expression of the cell: shows the working directory now in effect.
os.getcwd()

'/home/jacob/Documents/python_aws'

In [3]:
def write_to_sql(con, table_name: str, df: pd.DataFrame, table_type: str) -> None:
    """
    Write a pandas DataFrame to the SQL table named ``aws_{table_name}_source``.

    An empty DataFrame is skipped (a message is printed and nothing is written).
    Ordinary errors raised while writing are reported with ``print`` and NOT
    re-raised, so one failed table does not abort the caller. Interpreter-level
    signals (``KeyboardInterrupt``, ``SystemExit``) are deliberately not caught,
    so a long-running write can still be interrupted.

    Args:
        con (SQL Connection): The connection (or engine) to the SQL DB, passed
            straight through to ``DataFrame.to_sql``.
        table_name (str): Base name of the table; the table written is
            ``aws_{table_name}_source``.
        df (DataFrame): The Pandas DataFrame to store in SQL.
        table_type (str): Passed to ``DataFrame.to_sql`` as ``if_exists`` -
            whether the table should replace or append to an existing SQL Table
            under that name.
    Returns:
        None. The DataFrame is written to the database/schema that ``con``
        is connected to.
    """
    try:
        if len(df) == 0:
            print(f"{table_name} is empty, not writing to SQL")
        else:
            df.to_sql(
                con=con,
                name=f"aws_{table_name}_source",
                index=False,
                if_exists=table_type,
            )
            # Only reported after to_sql returned without raising.
            print(
                f"Writing {len(df)} {table_name} rows to aws_{table_name}_source to SQL"
            )
    except Exception as error:
        # Exception (not BaseException): Ctrl-C / kernel interrupt must propagate.
        print(f"SQL Write Script Failed, {error}")

def sql_connection(rds_schema: str):
    """
    Build a SQLAlchemy engine for the PostgreSQL DB, with the session search_path
    set to the given schema (e.g. nba_source, where initial data in ELT lands).

    Connection settings are read from the environment variables ``RDS_USER``,
    ``RDS_PW``, ``IP`` and ``RDS_DB``; the port is fixed at 5432.

    Args:
        rds_schema (str): The Schema in the DB to connect to.
    Returns:
        SQLAlchemy engine bound to the specified schema in the PostgreSQL DB.
    Raises:
        ValueError: If any of the required environment variables is not set.
        sqlalchemy.exc.SQLAlchemyError: If the engine cannot be created.
    """
    # Local import so this block is self-contained (stdlib only).
    from urllib.parse import quote_plus

    settings = {
        name: os.environ.get(name) for name in ("RDS_USER", "RDS_PW", "IP", "RDS_DB")
    }
    missing = [name for name, value in settings.items() if value is None]
    if missing:
        # Fail early instead of building a URL containing the literal text "None".
        raise ValueError(
            f"Missing required environment variable(s): {', '.join(missing)}"
        )

    # Escape the credentials so characters such as '@', '/' or ':' in the
    # username/password cannot corrupt the URL structure.
    RDS_USER = quote_plus(settings["RDS_USER"])
    RDS_PW = quote_plus(settings["RDS_PW"])
    RDS_IP = settings["IP"]
    RDS_DB = settings["RDS_DB"]
    try:
        connection = create_engine(
            f"postgresql+psycopg2://{RDS_USER}:{RDS_PW}@{RDS_IP}:5432/{RDS_DB}",
            connect_args={"options": f"-csearch_path={rds_schema}"},
            # defining schema to connect to
            echo=False,
        )
    except exc.SQLAlchemyError as e:
        # Report and re-raise: handing the exception object back as if it were
        # a connection only defers the failure to the first write.
        print(f"SQL Connection to schema: {rds_schema} Failed, {e}")
        raise
    print(f"SQL Connection to schema: {rds_schema} Successful")
    return connection

# Shared engine used by every write cell below; targets the nba_source schema.
conn = sql_connection('nba_source')

SQL Connection to schema: nba_source Successful


In [3]:
# Load the boxscores parquet export, then append its rows to aws_boxscores_source.
boxscores = pd.read_parquet('sql_transfer/boxscores.parquet')
write_to_sql(
    con=conn,
    table_name='boxscores',
    df=boxscores,
    table_type='append',
)

Writing 30219 boxscores rows to aws_boxscores_source to SQL


In [4]:
# Load the adv_stats parquet export, then append its rows to aws_adv_stats_source.
adv_stats = pd.read_parquet('sql_transfer/adv_stats.parquet')
write_to_sql(
    con=conn,
    table_name='adv_stats',
    df=adv_stats,
    table_type='append',
)

Writing 5640 adv_stats rows to aws_adv_stats_source to SQL


In [5]:
# Load the contracts parquet export, then append its rows to aws_contracts_source.
contracts = pd.read_parquet('sql_transfer/contracts.parquet')
write_to_sql(
    con=conn,
    table_name='contracts',
    df=contracts,
    table_type='append',
)

Writing 515 contracts rows to aws_contracts_source to SQL


In [6]:
# Load the injury_data parquet export, then append its rows to aws_injury_data_source.
injury_data = pd.read_parquet('sql_transfer/injury_data.parquet')
write_to_sql(
    con=conn,
    table_name='injury_data',
    df=injury_data,
    table_type='append',
)

Writing 59 injury_data rows to aws_injury_data_source to SQL


In [7]:
# Load the odds parquet export, then append its rows to aws_odds_source.
odds = pd.read_parquet('sql_transfer/odds.parquet')
write_to_sql(
    con=conn,
    table_name='odds',
    df=odds,
    table_type='append',
)

Writing 2787 odds rows to aws_odds_source to SQL


In [19]:
# Alias so the .query() expression below can construct a Timestamp via @ts(...).
ts = pd.Timestamp
# Keep only the rows whose scrape_date equals the hard-coded 2022-04-11 (UTC) timestamp.
# NOTE(review): the captured output of this cell shows a pandas comparison warning,
# presumably because scrape_date's dtype does not match a tz-aware Timestamp —
# confirm the filter still selects the intended rows on the pandas version in use.
opp_stats = pd.read_parquet('sql_transfer/opp_stats.parquet').query('scrape_date == @ts("2022-04-11T00:00:00.000Z")')

# 'replace' is forwarded as if_exists, so aws_opp_stats_source is recreated rather than appended to.
write_to_sql(conn, 'opp_stats', opp_stats, 'replace')

  result = libops.scalar_compare(x.ravel(), y, op)


Writing 30 opp_stats rows to aws_opp_stats_source to SQL


In [14]:
# Load the pbp_data parquet export, then append its rows to aws_pbp_data_source.
pbp_data = pd.read_parquet('sql_transfer/pbp_data.parquet')
write_to_sql(
    con=conn,
    table_name='pbp_data',
    df=pbp_data,
    table_type='append',
)

Writing 175996 pbp_data rows to aws_pbp_data_source to SQL


In [15]:
# Load the player_attributes parquet export, then append its rows to
# aws_player_attributes_source.
player_attributes = pd.read_parquet('sql_transfer/player_attributes.parquet')
write_to_sql(
    con=conn,
    table_name='player_attributes',
    df=player_attributes,
    table_type='append',
)

Writing 595 player_attributes rows to aws_player_attributes_source to SQL


In [16]:
# Load the preseason_odds parquet export, then append its rows to
# aws_preseason_odds_source.
preseason_odds = pd.read_parquet('sql_transfer/preseason_odds.parquet')
write_to_sql(
    con=conn,
    table_name='preseason_odds',
    df=preseason_odds,
    table_type='append',
)

Writing 30 preseason_odds rows to aws_preseason_odds_source to SQL


In [4]:
# Load the reddit_comments parquet export and write it to
# aws_reddit_comment_data_source, replacing any existing table of that name.
reddit_comment_data = pd.read_parquet('sql_transfer/reddit_comments.parquet')
write_to_sql(
    con=conn,
    table_name='reddit_comment_data',
    df=reddit_comment_data,
    table_type='replace',
)

Writing 1548999 reddit_comment_data rows to aws_reddit_comment_data_source to SQL


In [3]:
# Load the reddit_data parquet export, then append its rows to aws_reddit_data_source.
reddit_data = pd.read_parquet('sql_transfer/reddit_data.parquet')
write_to_sql(
    con=conn,
    table_name='reddit_data',
    df=reddit_data,
    table_type='append',
)

Writing 6183 reddit_data rows to aws_reddit_data_source to SQL


In [6]:
# Load the schedule parquet export, drop exact duplicate rows, and write the
# result to aws_schedule_source, replacing any existing table of that name.
schedule = (
    pd.read_parquet('sql_transfer/schedule.parquet')
    .drop_duplicates()
)
write_to_sql(
    con=conn,
    table_name='schedule',
    df=schedule,
    table_type='replace',
)

Writing 1397 schedule rows to aws_schedule_source to SQL


In [9]:
# Load the shooting_stats parquet export, keep only the rows from the most
# recent scrape_date, and write them to aws_shooting_stats_source, replacing
# any existing table of that name.
shooting_stats = (
    pd.read_parquet('sql_transfer/shooting_stats.parquet')
    .query('scrape_date == scrape_date.max()')
)
write_to_sql(
    con=conn,
    table_name='shooting_stats',
    df=shooting_stats,
    table_type='replace',
)

Writing 605 shooting_stats rows to aws_shooting_stats_source to SQL


In [12]:
# Load the stats parquet export, keep only the rows from the most recent
# scrape_date, and append them to aws_stats_source.
stats = (
    pd.read_parquet('sql_transfer/stats.parquet')
    .query('scrape_date == scrape_date.max()')
)
write_to_sql(
    con=conn,
    table_name='stats',
    df=stats,
    table_type='append',
)

Writing 812 stats rows to aws_stats_source to SQL


In [13]:
# Load the team_attributes parquet export, then append its rows to
# aws_team_attributes_source.
team_attributes = pd.read_parquet('sql_transfer/team_attributes.parquet')
write_to_sql(
    con=conn,
    table_name='team_attributes',
    df=team_attributes,
    table_type='append',
)

Writing 30 team_attributes rows to aws_team_attributes_source to SQL


In [14]:
# Load the transactions parquet export, then append its rows to
# aws_transactions_source.
transactions = pd.read_parquet('sql_transfer/transactions.parquet')
write_to_sql(
    con=conn,
    table_name='transactions',
    df=transactions,
    table_type='append',
)

Writing 1935 transactions rows to aws_transactions_source to SQL


In [5]:
# Load the twitter_tweepy parquet export and write it to
# aws_twitter_tweepy_data_source, replacing any existing table of that name.
twitter_tweepy_data = pd.read_parquet('sql_transfer/twitter_tweepy.parquet')

write_to_sql(
    con=conn,
    table_name='twitter_tweepy_data',
    df=twitter_tweepy_data,
    table_type='replace',
)

Writing 155980 twitter_tweepy_data rows to aws_twitter_tweepy_data_source to SQL


In [6]:
# Load the twitter_tweets parquet export, then append its rows to
# aws_twitter_data_source.
twitter_data = pd.read_parquet('sql_transfer/twitter_tweets.parquet')

write_to_sql(
    con=conn,
    table_name='twitter_data',
    df=twitter_data,
    table_type='append',
)

Writing 261127 twitter_data rows to aws_twitter_data_source to SQL
