In [1]:
import pandas as pd
import boto3
import awswrangler as wr
from datetime import datetime
from sqlalchemy import exc, create_engine
from sqlalchemy.engine.base import Engine
import os

# Candidate data-set names; 'boxscores' and 'reddit_data' are the ones used
# as S3 prefixes further down in this notebook.
possible_buckets = [
    'adv_stats',
    'boxscores',
    'injury_data',
    'odds',
    'opp_stats',
    'pbp_data',
    'reddit_comment_data',
    'reddit_data',
    'schedule',
    'shooting_stats',
    'stats',
    'transactions',
    'twitter_data',
    'twitter_tweepy_data',
]
# s3_resource = boto3.resource('s3')
# extension = ("parquet", "csv")
# bucket='jacobsbucket97'
# prefix = "reddit_data/"

In [3]:
def sql_connection(rds_schema: str) -> Engine:
    """
    Build a SQLAlchemy Engine for the PostgreSQL DB, scoped to one schema.

    Connection settings are read from the RDS_USER, RDS_PW, IP and RDS_DB
    environment variables. This doesn't actually open a connection; use
    conn.connect() in a context manager to create 1 re-usable connection.

    Args:
        rds_schema (str): The Schema in the DB to connect to (applied as the
            session search_path).
    Returns:
        The Engine on success. On failure (including an unset environment
        variable) the SQLAlchemyError is printed and returned rather than
        raised, so callers that need to detect failure must check the type
        of the returned object.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote_plus

    settings = {
        name: os.environ.get(name)
        for name in ("RDS_USER", "RDS_PW", "IP", "RDS_DB")
    }
    try:
        # An unset variable would otherwise be formatted as the literal text
        # "None" inside the URL and only fail later, at first use.
        missing = [name for name, value in settings.items() if value is None]
        if missing:
            raise exc.ArgumentError(
                f"Missing environment variable(s): {', '.join(missing)}"
            )

        # Percent-encode the credentials so characters such as "@", "/" or ":"
        # in the user name or password cannot corrupt the URL.
        user = quote_plus(settings["RDS_USER"])
        password = quote_plus(settings["RDS_PW"])

        connection = create_engine(
            f"postgresql+psycopg2://{user}:{password}@{settings['IP']}:5432/{settings['RDS_DB']}",
            # defining schema to connect to
            connect_args={"options": f"-csearch_path={rds_schema}"},
            echo=False,
        )
        print(f"SQL Connection to schema: {rds_schema} Successful")
        return connection
    except exc.SQLAlchemyError as e:
        print(f"SQL Connection to schema: {rds_schema} Failed, Error: {e}")
        return e

def write_to_sql(con, table_name: str, df: pd.DataFrame, table_type: str) -> None:
    """
    Write a pandas DataFrame to the SQL table aws_{table_name}_source.

    Empty frames are skipped. Ordinary write failures are printed rather than
    raised so that a batch of writes can continue; KeyboardInterrupt and
    SystemExit are NOT swallowed, so a running load can still be stopped.

    Args:
        con (SQL Connection): The connection to the SQL DB.
        table_name (str): Base name of the table; it is stored as
            aws_{table_name}_source.
        df (DataFrame): The Pandas DataFrame to store in SQL.
        table_type (str): Passed to DataFrame.to_sql as if_exists, i.e. whether
            the table should replace or append to an existing SQL Table under
            that name.
    Returns:
        None. The DataFrame is written to the database/schema that `con`
        points to.
    """
    try:
        if len(df) == 0:
            print(f"{table_name} is empty, not writing to SQL")
        else:
            df.to_sql(
                con=con,
                name=f"aws_{table_name}_source",
                index=False,
                if_exists=table_type,
            )
            print(
                f"Writing {len(df)} {table_name} rows to aws_{table_name}_source to SQL"
            )
    except Exception as error:
        # Exception (not BaseException) so Ctrl-C / kernel interrupt propagates.
        print(f"SQL Write Script Failed, {error}")

In [12]:
class PrefixException(Exception):
    """Raised when an S3 prefix argument is passed with a trailing '/'."""

In [19]:
def reprocess_bucket(bucket: str, prefix: str, conn: Engine) -> None:
    """
    Re-load every parquet file under s3://{bucket}/{prefix}/ into SQL.

    Each file is read into a DataFrame and appended to the table
    aws_{prefix}_source via write_to_sql. Errors are printed, not raised; the
    first listing or read error stops the remaining files from being processed.

    Args:
        bucket (str): Name of the S3 bucket.
        prefix (str): Top-level folder in the bucket, WITHOUT a trailing "/".
            It is also used as the table name passed to write_to_sql.
        conn (Engine): SQL connection handed to write_to_sql.
    Returns:
        None.
    """
    try:
        if prefix.endswith('/'):
            raise PrefixException('Please Remove the trailing / on the prefix parameter')
        s3_resource = boto3.resource('s3')
        # The "/" is appended here so the listing is limited to that folder.
        # Only *.parquet keys are kept: the listing can also return folder
        # placeholder objects or other files, which read_parquet cannot load.
        s3_keys = [
            f"{bucket}/{obj.key}"
            for obj in s3_resource.Bucket(bucket).objects.filter(Prefix=f"{prefix}/")
            if obj.key.endswith(".parquet")
        ]
        if not s3_keys:
            print(f"No parquet files found under {bucket}/{prefix}/")
        for s3_key in s3_keys:
            print(f"Reading in {s3_key}, storing to SQL ...")
            df = wr.s3.read_parquet(f"s3://{s3_key}")
            write_to_sql(conn, prefix, df, 'append')

    except Exception as e:
        # Exception (not BaseException) so Ctrl-C / kernel interrupt propagates.
        print(f"Error Occurred, {e}")

In [20]:
# Build the engine in this cell so it runs on a fresh kernel; otherwise `conn`
# is only defined by a cell further down the notebook.
conn = sql_connection('nba_source_dev')

reprocess_bucket('jacobsbucket97', 'boxscores', conn)

Reading in jacobsbucket97/boxscores/validated/02/boxscores-2022-02-26.parquet ...
Storing jacobsbucket97/boxscores/validated/02/boxscores-2022-02-26.parquet to SQL ...
Writing 189 boxscores rows to aws_boxscores_source to SQL
Reading in jacobsbucket97/boxscores/validated/02/boxscores-2022-02-27.parquet ...
Storing jacobsbucket97/boxscores/validated/02/boxscores-2022-02-27.parquet to SQL ...
Writing 144 boxscores rows to aws_boxscores_source to SQL
Reading in jacobsbucket97/boxscores/validated/02/boxscores-2022-02-28.parquet ...
Storing jacobsbucket97/boxscores/validated/02/boxscores-2022-02-28.parquet to SQL ...
Writing 168 boxscores rows to aws_boxscores_source to SQL


In [2]:
# Listing inputs are defined here so the cell runs on a fresh kernel; they are
# otherwise only present as commented-out lines in the first cell.
s3_resource = boto3.resource('s3')
bucket = 'jacobsbucket97'
prefix = "reddit_data/"  # make sure it ends with /

# NOTE: despite the variable name, with this prefix the list holds reddit_data keys.
boxscores_raw = list(
    f"{bucket}/{i.key}" for i in s3_resource.Bucket(bucket).objects.filter(Prefix = prefix))

# boxscores_raw = list(filter(lambda x: x.endswith(extension), boxscores_raw))
boxscores_raw

['jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-26.parquet',
 'jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-27.parquet',
 'jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-28.parquet',
 'jacobsbucket97/reddit_data/validated/02/reddit_data-2022-03-01.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-02.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-03.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-04.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-05.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-06.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-07.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-08.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-09.parquet',
 'jacobsbucket97/reddit_data/validated/03/reddit_data-2022-03-10.parquet',
 'jacobsbucket97/reddit_d

In [12]:
# Keep only the first three listed keys; the load loop further down iterates
# over this subset instead of the full listing.
boxscores_raw2 = boxscores_raw[:3]
boxscores_raw2

['jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-26.parquet',
 'jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-27.parquet',
 'jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-28.parquet']

In [5]:
# Read a single parquet file from S3 into `df`.
df = wr.s3.read_parquet('s3://jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-26.parquet')

In [13]:
# Build the engine for the nba_source_dev schema. Per sql_connection's
# docstring this only defines the engine; no connection is opened here.
conn = sql_connection('nba_source_dev')

SQL Connection to schema: nba_source_dev Successful


In [15]:
# Manual, step-by-step version of the load that reprocess_bucket() performs:
# read each key in boxscores_raw2 from S3 and append its rows to
# aws_reddit_data_source via write_to_sql.
conn = sql_connection('nba_source_dev')
for i in boxscores_raw2:
    print(f"Reading in {i} ...")
    # Keys are stored as "<bucket>/<key>", so only the s3:// scheme is added here.
    df = wr.s3.read_parquet(f"s3://{i}")
    print(f"Storing {i} to SQL ...")
    write_to_sql(conn, 'reddit_data', df, 'append')

Reading in jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-26.parquet ...
Storing jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-26.parquet to SQL ...
Writing 27 reddit_data rows to aws_reddit_data_source to SQL
Reading in jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-27.parquet ...
Storing jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-27.parquet to SQL ...
Writing 27 reddit_data rows to aws_reddit_data_source to SQL
Reading in jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-28.parquet ...
Storing jacobsbucket97/reddit_data/validated/02/reddit_data-2022-02-28.parquet to SQL ...
Writing 27 reddit_data rows to aws_reddit_data_source to SQL
