In [3]:
import time
from pathlib import Path

import boto3
import pandas as pd
import pyspark

from helper_functions import create_session

In [4]:
# Build the AWS Glue client from the session returned by the project helper
session = create_session()
glue = session.client("glue")

In [6]:
# Collect the names of every table registered in the project's Glue database.
# `get_tables` is scoped to `database_name` and consumed through a paginator,
# so the listing is limited to this database and is not cut off after the
# first page of results.
database_name = 'aws-covid-project'
bucket_name = 'kc-covid-project'
table_tuple = tuple(
    table['Name']
    for page in glue.get_paginator('get_tables').paginate(DatabaseName=database_name)
    for table in page['TableList']
)

In [15]:
# Query the column schemas of the Glue tables with Athena; Athena writes the
# result set to the S3 output location configured below.
athena = session.client('athena')
try:
    if not table_tuple:
        raise ValueError('No Glue tables found; nothing to query.')

    # Build the IN (...) list explicitly: interpolating the tuple's repr yields
    # invalid SQL for a single table (trailing comma) and does not escape
    # single quotes inside names.
    table_list = ', '.join("'" + name.replace("'", "''") + "'" for name in table_tuple)
    query = f'SELECT * FROM INFORMATION_SCHEMA.columns WHERE table_name IN ({table_list})'

    res = athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database_name},
        ResultConfiguration={
            'OutputLocation': f's3://{bucket_name}/athena_outputs/',
            'EncryptionConfiguration': {'EncryptionOption': 'SSE_S3'},
        },
    )
    print(f'Status Code: {res["ResponseMetadata"]["HTTPStatusCode"]}')

    # start_query_execution only submits the query; poll (bounded) until Athena
    # reports a terminal state so later cells do not read a missing result.
    query_execution_id = res['QueryExecutionId']
    for _ in range(120):
        status = athena.get_query_execution(
            QueryExecutionId=query_execution_id
        )['QueryExecution']['Status']
        if status['State'] not in ('QUEUED', 'RUNNING'):
            break
        time.sleep(1)
    else:
        raise TimeoutError(f'Athena query {query_execution_id} did not finish in time')

    if status['State'] != 'SUCCEEDED':
        raise RuntimeError(
            f'Athena query {query_execution_id} ended in state {status["State"]}: '
            f'{status.get("StateChangeReason", "no reason given")}'
        )
except Exception as e:
    # Best-effort cell: report the problem without stopping the notebook
    print(e)

Status Code: 200


In [59]:
# Create the local output directory and an empty CSV to receive the schema file.
# Done in Python rather than with `type nul` (a Windows cmd-only idiom) so the
# cell runs on any OS and also works when the directory does not exist yet.
schema_csv_path = Path('athena_outputs/table_schemas.csv')
schema_csv_path.parent.mkdir(parents=True, exist_ok=True)
# Opening in write mode creates the file or truncates an existing one
open(schema_csv_path, 'w').close()

In [None]:
# Download the newest Athena result CSV from the S3 bucket to a local file,
# ready to be loaded into Pandas.
s3 = session.client('s3')
try:
    # Keep only `.csv` keys: Athena also stores a `.csv.metadata` sidecar next to
    # each result, and results of earlier queries remain under the same prefix.
    result_objects = [
        obj
        for page in s3.get_paginator('list_objects_v2').paginate(
            Bucket=bucket_name,
            Prefix='athena_outputs'
        )
        for obj in page.get('Contents', [])  # 'Contents' is absent when nothing matches
        if obj['Key'].endswith('.csv')
    ]
    if not result_objects:
        raise FileNotFoundError(
            f'No .csv objects found under s3://{bucket_name}/athena_outputs'
        )

    # NOTE(review): assumes the most recently modified CSV is the schema query
    # result -- confirm no other CSVs are written under this prefix.
    schema_csv = max(result_objects, key=lambda obj: obj['LastModified'])['Key']

    # download_file returns None and raises on failure, so there is no response
    # metadata to inspect; reaching the print means the download succeeded.
    s3.download_file(
        Bucket=bucket_name,
        Key=schema_csv,
        Filename='athena_outputs/table_schemas.csv'
    )
    print(f'Downloaded s3://{bucket_name}/{schema_csv}')
except Exception as e:
    # Best-effort cell: report the problem without stopping the notebook
    print(e)

In [14]:
# Read the downloaded schema CSV into a DataFrame; the table displayed below is
# the reference for designing the data model.
# (Another option: generate DDL from the tables in the Glue database.)
table_schemas = pd.read_csv('athena_outputs/table_schemas.csv')
table_schemas

Unnamed: 0,table_catalog,table_schema,table_name,column_name,ordinal_position,column_default,is_nullable,data_type,comment,extra_info
0,awsdatacatalog,aws-covid-project,enigma_nyt_usa_states,date,1,,YES,varchar,,
1,awsdatacatalog,aws-covid-project,enigma_nyt_usa_states,state,2,,YES,varchar,,
2,awsdatacatalog,aws-covid-project,enigma_nyt_usa_states,fips,3,,YES,bigint,,
3,awsdatacatalog,aws-covid-project,enigma_nyt_usa_states,cases,4,,YES,bigint,,
4,awsdatacatalog,aws-covid-project,enigma_nyt_usa_states,deaths,5,,YES,bigint,,
...,...,...,...,...,...,...,...,...,...,...
153,awsdatacatalog,aws-covid-project,rearc_usa_latest_total,hospitalized,14,,YES,bigint,,
154,awsdatacatalog,aws-covid-project,rearc_usa_latest_total,total,15,,YES,bigint,,
155,awsdatacatalog,aws-covid-project,rearc_usa_latest_total,totaltestresults,16,,YES,bigint,,
156,awsdatacatalog,aws-covid-project,rearc_usa_latest_total,posneg,17,,YES,bigint,,


In [42]:
# List the object keys stored under the Athena output prefix of the project bucket
s3_resource = session.resource('s3')
project_bucket = s3_resource.Bucket(bucket_name)
objs = project_bucket.objects.filter(Prefix='athena_outputs')
# Materialize the keys so the listing is displayed instead of being discarded
athena_output_keys = [obj.key for obj in objs]
athena_output_keys

In [15]:
# Fetch the 'static-state-codes' object from the project bucket and save it
# locally as athena_outputs/state_codes.csv
s3.download_file(
    bucket_name,
    'static-state-codes',
    'athena_outputs/state_codes.csv',
)