In [1]:
from datetime import datetime, timedelta
import os
import uuid
from typing import List

import awswrangler as wr
import requests
from bs4 import BeautifulSoup
import pandas as pd
import psycopg2
from sqlalchemy import exc, create_engine

# Naive local timestamp taken 24 hours before this cell executes.
yesterday = datetime.now() - timedelta(hours=24)

In [2]:
def write_to_sql(con, table_name: str, df: pd.DataFrame, table_type: str) -> None:
    """
    Write a pandas DataFrame to the SQL table named ``aws_{table_name}_source``.

    The write is best-effort: an empty frame is skipped, and any ordinary
    error raised while writing is printed rather than propagated, so the
    caller always gets ``None`` back. Interpreter-level signals
    (``KeyboardInterrupt``, ``SystemExit``) are NOT swallowed, so a long
    write can still be interrupted.

    Args:
        con (SQL Connection): Connection or engine handed to ``DataFrame.to_sql``.
        table_name (str): Logical table name; it is wrapped as
            ``aws_{table_name}_source`` to form the real table name.
        df (DataFrame): The Pandas DataFrame to store in SQL.
        table_type (str): Forwarded to ``to_sql`` as ``if_exists``; controls
            whether an existing table of that name is replaced or appended to.
    Returns:
        None. The DataFrame is written to the database (and schema) that
        ``con`` is connected to; progress and failures are reported via print.
    """
    try:
        if len(df) == 0:
            print(f"{table_name} is empty, not writing to SQL")
        else:
            df.to_sql(
                con=con,
                name=f"aws_{table_name}_source",
                index=False,
                if_exists=table_type,
            )
            print(
                f"Writing {len(df)} {table_name} rows to aws_{table_name}_source to SQL"
            )
    # Exception (not BaseException) keeps Ctrl-C / sys.exit working while
    # still treating database and pandas errors as non-fatal.
    except Exception as error:
        print(f"SQL Write Script Failed, {error}")

def sql_connection(rds_schema: str):
    """
    Build a SQLAlchemy engine for the PostgreSQL database described by the
    RDS_USER / RDS_PW / IP / RDS_DB environment variables.

    The engine's connections get ``search_path`` set to ``rds_schema`` so
    unqualified table names resolve inside that schema.

    Args:
        rds_schema (str): The Schema in the DB to connect to.
    Returns:
        The SQLAlchemy engine created by ``create_engine``.
    Raises:
        sqlalchemy.exc.SQLAlchemyError: If the engine cannot be constructed.
            The error is printed and re-raised so callers never receive an
            exception object in place of a connection.
    """
    RDS_USER = os.environ.get("RDS_USER")
    RDS_PW = os.environ.get("RDS_PW")
    RDS_IP = os.environ.get("IP")
    RDS_DB = os.environ.get("RDS_DB")
    # NOTE(review): the credentials are interpolated into the URL verbatim, so
    # a user name or password containing characters such as "@" or "/" must
    # already be URL-encoded in the environment variable. An unset variable is
    # rendered as the literal text "None".
    try:
        connection = create_engine(
            f"postgresql+psycopg2://{RDS_USER}:{RDS_PW}@{RDS_IP}:5432/{RDS_DB}",
            # defining schema to connect to
            connect_args={"options": f"-csearch_path={rds_schema}"},
            echo=False,
        )
        # NOTE(review): create_engine is documented as lazy, so this message
        # means the engine was built, not that the database was reached.
        print(f"SQL Connection to schema: {rds_schema} Successful")
        return connection
    except exc.SQLAlchemyError as e:
        print(f"SQL Connection to schema: {rds_schema} Failed, {e}")
        raise

# Shared engine for the nba_source schema; the export code below reads through it.
conn = sql_connection("nba_source")

SQL Connection to schema: nba_source Successful


In [4]:
# Folder (relative to the notebook's working directory) that receives the parquet files.
PARQUET_DIR = 'sql_transfer'

# Source table -> parquet file stem, listed in the order the tables are exported.
# The boxscores table is exported separately below so its frame keeps its own name.
TABLE_EXPORTS = {
    'aws_adv_stats_source': 'adv_stats',
    'aws_injury_data_source': 'injury_data',
    'aws_odds_source': 'odds',
    'aws_pbp_data_source': 'pbp_data',
    'aws_preseason_odds_source': 'preseason_odds',
    'aws_reddit_comment_data_source': 'reddit_comments',
    'aws_reddit_data_source': 'reddit_data',
    'aws_schedule_source': 'schedule',
    'aws_shooting_stats_source': 'shooting_stats',
    'aws_stats_source': 'stats',
    'aws_transactions_source': 'transactions',
    'aws_twitter_data_source': 'twitter_tweets',
    'aws_twitter_tweepy_data_source': 'twitter_tweepy',
    'staging_seed_player_attributes': 'player_attributes',
    'staging_seed_team_attributes': 'team_attributes',
}


def export_table_to_parquet(con, table: str, file_stem: str, out_dir: str = PARQUET_DIR) -> pd.DataFrame:
    """
    Read every row of one SQL table and save it as a parquet file.

    Args:
        con: Connection or engine passed to ``pd.read_sql``.
        table (str): Name of the table to read with ``select *``. It is placed
            directly into the SQL text, so only pass trusted, hard-coded names.
        file_stem (str): File name without extension; the file is written to
            ``{out_dir}/{file_stem}.parquet``.
        out_dir (str): Destination folder; it must already exist.
    Returns:
        The DataFrame that was read and written.
    """
    frame = pd.read_sql(f'select * from {table}', con)
    frame.to_parquet(f'{out_dir}/{file_stem}.parquet')
    return frame


# Create the destination once so a fresh checkout survives Restart & Run All.
os.makedirs(PARQUET_DIR, exist_ok=True)

boxscores = export_table_to_parquet(conn, 'aws_boxscores_source', 'boxscores')

# `df` is rebound on every iteration and ends up holding the last table exported,
# exactly as it did when each table had its own copy-pasted cell.
for table, file_stem in TABLE_EXPORTS.items():
    df = export_table_to_parquet(conn, table, file_stem)