In [10]:
import boto3
import pandas as pd
from io import StringIO
import time

In [11]:
from dotenv import load_dotenv
import os
load_dotenv()
AWS_ACCESS_KEY = os.getenv('AWS_ACCESS_KEY')
AWS_SECRET_KEY = os.getenv('AWS_SECRET_KEY')
AWS_REGION = os.getenv('AWS_REGION')
SCHEMA_NAME = os.getenv('SCHEMA_NAME')
S3_STAGING_DIR = os.getenv('S3_STAGING_DIR')
S3_BUCKET_NAME = os.getenv('S3_BUCKET_NAME')
S3_OUTPUT_DIRETORY = os.getenv('S3_OUTPUT_DIRETORY')
S3_DIMENSION_CSV= os.getenv('S3_DIMENSION_CSV')

In [13]:
SCHEMA_NAME

'covid19_database'

In [12]:
# creates an Athena client object with AWS credentials and region information.
athena_client = boto3.client(
    'athena',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=AWS_REGION
)

In [14]:
dict = {} 
def download_and_load_query_results (
    client: boto3.client, query_response: dict, table_name
) -> pd.DataFrame:
    """
    Download query results from Athena and load into a Pandas DataFrame.
    
    Args:
        client: boto3 client.
        query_response (dict): query response
        table_name: name of table in database
    Returns:
        pd.DataFrame: Pandas DataFrame containing the query results.
    """
    while True:
        try:
            # This func only loads the first 1000 rows
            client.get_query_results (
                QueryExecutionId=query_response["QueryExecutionId"]
            )
            break
        except Exception as err:
            if "not yet finished" in str(err):
                time.sleep(0.001)
            else:
                raise err
    temp_file_location: str = f"./query_result/{table_name}_results.csv"
    s3_client = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY,
        aws_secret_access_key=AWS_SECRET_KEY,
        region_name=AWS_REGION,
    )
    s3_client.download_file(
        S3_BUCKET_NAME,
        f"{S3_OUTPUT_DIRETORY}/{query_response['QueryExecutionId']}.csv",
        temp_file_location,
    )
    return pd.read_csv(temp_file_location)

In [18]:
response = athena_client.start_query_execution(
    QueryString="SELECT * FROM static_dataset_state_abv",
    QueryExecutionContext={"Database": SCHEMA_NAME},
    ResultConfiguration={
        "OutputLocation": S3_STAGING_DIR,
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
)

In [19]:
response

{'QueryExecutionId': 'd5c5822a-9b78-41b1-8ddb-6294cb1ff943',
 'ResponseMetadata': {'RequestId': '30f664ec-61b2-4203-a56c-5fccc2f90d34',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Tue, 25 Apr 2023 03:28:44 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '59',
   'connection': 'keep-alive',
   'x-amzn-requestid': '30f664ec-61b2-4203-a56c-5fccc2f90d34'},
  'RetryAttempts': 0}}

In [20]:
static_dataset_state_abv = download_and_load_query_results(athena_client, response, 'static_dataset_state_abv')

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [8]:
df_data = download_and_load_query_results(athena_client, response, 'enigma_jhu')

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [14]:
def get_table_names_in_database():
    """
    Retrieves the names of all tables in the Athena database specified by the SCHEMA_NAME environment variable.

    Returns:

    table_names (list): A list of table names in the Athena database.
    """
    table_response = athena_client.start_query_execution(
        QueryString=f"SHOW TABLES IN {SCHEMA_NAME}",
        QueryExecutionContext={"Database": SCHEMA_NAME},
        ResultConfiguration={
            "OutputLocation": S3_STAGING_DIR,
            "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
        },
    )
    # Get the query execution ID
    query_execution_id = table_response['QueryExecutionId']

    # Wait for the query to complete
    while True:
        status = athena_client.get_query_execution(QueryExecutionId=query_execution_id)['QueryExecution']['Status']['State']
        if status in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
            break

    # Get the query results
    results = athena_client.get_query_results(QueryExecutionId=query_execution_id)

    # Parse the table names
    table_names = []
    for row in results['ResultSet']['Rows'][0:]:
        table_names.append(row['Data'][0]['VarCharValue'])

    # Print the table names
    return table_names

In [15]:
def response_result_athena_query(table_name):
    """
    Executes a query to retrieve all rows from a specified table in the Athena database specified by the SCHEMA_NAME environment variable.

    Args:
        table_name (str): Name of the table to query.
    Returns:
        csv file: A csv file will be stored in query_result folder.
    """
    response = athena_client.start_query_execution(
        QueryString=f"SELECT * FROM {table_name}",
        QueryExecutionContext={"Database": SCHEMA_NAME},
        ResultConfiguration={
            "OutputLocation": S3_STAGING_DIR,
            "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
        },
    )
    return download_and_load_query_results(athena_client, response, table_name)


In [16]:
query_result_dict = {}
table_names = get_table_names_in_database()
table_names

['enigma_jhu',
 'nytimes_data_in_usa_us_county',
 'nytimes_data_in_usa_us_states',
 'rearc_covid19_testing_states_daily',
 'rearc_covid19_testing_us_daily',
 'rearc_covid19_testing_us_total_latest',
 'rearc_usa_hospital_beds',
 'static_dataset_countrycode',
 'static_dataset_countypopulation',
 'static_dataset_state_abv']

In [17]:
for table_name in table_names:
    query_result_dict[f"{table_name}"] = response_result_athena_query(table_name)

ClientError: An error occurred (404) when calling the HeadObject operation: Not Found

In [None]:
query_result_dict['static_dataset_state_abv'].head()

Unnamed: 0,col0,col1
0,State,Abbreviation
1,Alabama,AL
2,Alaska,AK
3,Arizona,AZ
4,Arkansas,AR


In [None]:
# grab the first row for the new header
new_header = query_result_dict['static_dataset_state_abv'].iloc[0]
new_header

col0           State
col1    Abbreviation
Name: 0, dtype: object

In [14]:
# take the data less the header row
query_result_dict['static_dataset_state_abv'] = query_result_dict['static_dataset_state_abv'][1:]
query_result_dict['static_dataset_state_abv'].head()

Unnamed: 0,col0,col1
1,Alabama,AL
2,Alaska,AK
3,Arizona,AZ
4,Arkansas,AR
5,California,CA


In [15]:
# set the header row as the df header
query_result_dict['static_dataset_state_abv'].columns = new_header
query_result_dict['static_dataset_state_abv'].head()

Unnamed: 0,State,Abbreviation
1,Alabama,AL
2,Alaska,AK
3,Arizona,AZ
4,Arkansas,AR
5,California,CA


In [16]:
factCovid_1 = query_result_dict['enigma_jhu'][['fips', 'province_state', 'country_region', 'confirmed', 'deaths', 'recovered', 'active']]
factCovid_2 = query_result_dict['rearc_covid19_testing_states_daily'][['fips', 'date', 'positive', 'negative', 'hospitalizedcurrently', 'hospitalized', 'hospitalizeddischarged']]
factCovid = pd.merge(factCovid_1, factCovid_2, on='fips', how='inner')

In [17]:
factCovid.head()

Unnamed: 0,fips,province_state,country_region,confirmed,deaths,recovered,active,date,positive,negative,hospitalizedcurrently,hospitalized,hospitalizeddischarged
0,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210307,101327.0,305972.0,147.0,,
1,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210306,101327.0,305972.0,147.0,,
2,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210305,101066.0,305972.0,136.0,,
3,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210304,100867.0,305972.0,171.0,,
4,72.0,Puerto Rico,US,3.0,0.0,0.0,,20210303,100765.0,305972.0,169.0,,


In [18]:
factCovid.shape

(26418, 13)

In [19]:
dimRegion_1 = query_result_dict['enigma_jhu'][['fips', 'province_state', 'country_region', 'latitude', 'longitude']]
dimRegion_2 = query_result_dict['nytimes_data_in_usa_us_county'][['fips', 'county', 'state']]
dimRegion = pd.merge(dimRegion_1, dimRegion_2, on='fips', how='inner')

In [20]:
dimRegion.head()

Unnamed: 0,fips,province_state,country_region,latitude,longitude,county,state
0,,Anhui,China,31.826,117.226,New York City,New York
1,,Anhui,China,31.826,117.226,Unknown,Rhode Island
2,,Anhui,China,31.826,117.226,New York City,New York
3,,Anhui,China,31.826,117.226,Unknown,Rhode Island
4,,Anhui,China,31.826,117.226,New York City,New York


In [21]:
dimHospital = query_result_dict['rearc_usa_hospital_beds'][['fips', 'state_name', 'latitude', 'longtitude', 'hq_address', 'hospital_type', 'hospital_name', 'hq_city', 'hq_state']]

In [22]:
dimDate = query_result_dict['rearc_covid19_testing_states_daily'][['fips', 'date']]

In [23]:
dimDate.head()

Unnamed: 0,fips,date
0,2,20210307
1,1,20210307
2,5,20210307
3,60,20210307
4,4,20210307


In [24]:
# date values are in int type, change to date format
dimDate['date'] = pd.to_datetime(dimDate['date'], format='%Y%m%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dimDate['date'] = pd.to_datetime(dimDate['date'], format='%Y%m%d')


In [25]:
dimDate.head()

Unnamed: 0,fips,date
0,2,2021-03-07
1,1,2021-03-07
2,5,2021-03-07
3,60,2021-03-07
4,4,2021-03-07


In [26]:
dimDate['year'] = dimDate['date'].dt.year
dimDate['month'] = dimDate['date'].dt.month
dimDate['day_of_weak'] = dimDate['date'].dt.dayofweek

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dimDate['year'] = dimDate['date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dimDate['month'] = dimDate['date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dimDate['day_of_weak'] = dimDate['date'].dt.dayofweek


In [27]:
dimDate.head()

Unnamed: 0,fips,date,year,month,day_of_weak
0,2,2021-03-07,2021,3,6
1,1,2021-03-07,2021,3,6
2,5,2021-03-07,2021,3,6
3,60,2021-03-07,2021,3,6
4,4,2021-03-07,2021,3,6


In [28]:
bucket = S3_DIMENSION_CSV # already created on S3

In [30]:
csv_buffer = StringIO()
csv_buffer

<_io.StringIO at 0x28ec731ba30>

In [31]:
factCovid.to_csv(csv_buffer)

In [33]:
s3_resource = boto3.resource('s3', aws_access_key_id=AWS_ACCESS_KEY, aws_secret_access_key=AWS_SECRET_KEY)
s3_resource.Object(bucket, 'output/factCovid.csv').put(Body=csv_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'XQHYMKHW7S54SVR8',
  'HostId': '8+6hofIjIbEg6pjuGjxm48TducFosjEOoLqAaYNbGDuNblrSauBk0A7PxvuOmlJg9m1TVwGp/Nw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': '8+6hofIjIbEg6pjuGjxm48TducFosjEOoLqAaYNbGDuNblrSauBk0A7PxvuOmlJg9m1TVwGp/Nw=',
   'x-amz-request-id': 'XQHYMKHW7S54SVR8',
   'date': 'Thu, 20 Apr 2023 17:43:05 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"6372f86ce96858dcb69570e9500bc3c7"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"6372f86ce96858dcb69570e9500bc3c7"',
 'ServerSideEncryption': 'AES256'}

In [34]:
dimRegion_buffer = StringIO()
dimRegion.to_csv(dimRegion_buffer)

In [35]:

s3_resource.Object(bucket, 'output/dimRegion.csv').put(Body=dimRegion_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'WM1YZ12B03EMTA8J',
  'HostId': 'A4eYJRfuzSYSyzZb1J/dWkaH+P9TmqAq4+wQ8D1ru6CJlfmjtwgCRxdxHlkv8KfDv86Kn9ta4uE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'A4eYJRfuzSYSyzZb1J/dWkaH+P9TmqAq4+wQ8D1ru6CJlfmjtwgCRxdxHlkv8KfDv86Kn9ta4uE=',
   'x-amz-request-id': 'WM1YZ12B03EMTA8J',
   'date': 'Thu, 20 Apr 2023 17:48:10 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"245ccb76e9ea4c1763ece831bc3922a4"',
   'server': 'AmazonS3',
   'content-length': '0',
   'connection': 'close'},
  'RetryAttempts': 0},
 'ETag': '"245ccb76e9ea4c1763ece831bc3922a4"',
 'ServerSideEncryption': 'AES256'}

In [36]:
dimHospital_buffer = StringIO()
dimHospital.to_csv(dimHospital_buffer)
s3_resource.Object(bucket, 'output/dimHospital.csv').put(Body=dimHospital_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'N5M2GYS7G90ZGD0B',
  'HostId': 'fZOWt7IwFzATV9fWk7TyioKcfG9bsV9WxvI8/XJw7W/cy3bdVQ0RsWJfRWi+Om0EbTu/O12KPv4=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'fZOWt7IwFzATV9fWk7TyioKcfG9bsV9WxvI8/XJw7W/cy3bdVQ0RsWJfRWi+Om0EbTu/O12KPv4=',
   'x-amz-request-id': 'N5M2GYS7G90ZGD0B',
   'date': 'Thu, 20 Apr 2023 18:29:51 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"3e09cec2d0c1cc0b01a8182608cfa78d"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"3e09cec2d0c1cc0b01a8182608cfa78d"',
 'ServerSideEncryption': 'AES256'}

In [37]:
dimDate_buffer = StringIO()
dimDate.to_csv(dimDate_buffer)
s3_resource.Object(bucket, 'output/dimDate.csv').put(Body=dimDate_buffer.getvalue())

{'ResponseMetadata': {'RequestId': 'QVCBFZKQ1DNCCQKW',
  'HostId': 'VNMX+BKOUJ9bl3W136cvea+WLhwkVD5Mz/28c7ST9hgmrMhZbQZ8gL4QeOm1CJ8/yw27RXo6fqE=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'VNMX+BKOUJ9bl3W136cvea+WLhwkVD5Mz/28c7ST9hgmrMhZbQZ8gL4QeOm1CJ8/yw27RXo6fqE=',
   'x-amz-request-id': 'QVCBFZKQ1DNCCQKW',
   'date': 'Thu, 20 Apr 2023 18:29:52 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"9bd4ef836f7121128313ada507cdd660"',
   'server': 'AmazonS3',
   'content-length': '0'},
  'RetryAttempts': 0},
 'ETag': '"9bd4ef836f7121128313ada507cdd660"',
 'ServerSideEncryption': 'AES256'}

In [39]:
dimDateSql = pd.io.sql.get_schema(dimDate.reset_index(), 'dimDate')
print(''.join(dimDateSql))

CREATE TABLE "dimDate" (
"index" INTEGER,
  "fips" INTEGER,
  "date" TIMESTAMP,
  "year" INTEGER,
  "month" INTEGER,
  "day_of_weak" INTEGER
)


In [38]:
dimRegionSql = pd.io.sql.get_schema(dimRegion.reset_index(), 'dimRegion')
print(''.join(dimRegionSql))

CREATE TABLE "dimRegion" (
"index" INTEGER,
  "fips" REAL,
  "province_state" TEXT,
  "country_region" TEXT,
  "latitude" REAL,
  "longitude" REAL,
  "county" TEXT,
  "state" TEXT
)


In [40]:
dimHospitalSql = pd.io.sql.get_schema(dimHospital.reset_index(), 'dimHospital')
print(''.join(dimHospitalSql))

CREATE TABLE "dimHospital" (
"index" INTEGER,
  "fips" REAL,
  "state_name" TEXT,
  "latitude" REAL,
  "longtitude" REAL,
  "hq_address" TEXT,
  "hospital_type" TEXT,
  "hospital_name" TEXT,
  "hq_city" TEXT,
  "hq_state" TEXT
)


In [41]:
factCovidSql = pd.io.sql.get_schema(factCovid.reset_index(), 'factCovid')
print(''.join(factCovidSql))

CREATE TABLE "factCovid" (
"index" INTEGER,
  "fips" REAL,
  "province_state" TEXT,
  "country_region" TEXT,
  "confirmed" REAL,
  "deaths" REAL,
  "recovered" REAL,
  "active" REAL,
  "date" INTEGER,
  "positive" REAL,
  "negative" REAL,
  "hospitalizedcurrently" REAL,
  "hospitalized" REAL,
  "hospitalizeddischarged" REAL
)
