### Querying Athena and Glue through AWS Wrangler

In [40]:
#import below libraries
import awswrangler as wr
import pandas as pd
import boto3
import warnings 
warnings.filterwarnings('ignore')
import configparser

In [41]:
# Create an empty config parser. The AWS credentials are kept in a separate
# config file (loaded in the next cell) instead of being typed into the notebook.
credents = configparser.ConfigParser()

In [42]:
# Load the credentials file into the parser. The context manager closes the
# file handle as soon as parsing is done; a bare open() would leave it open
# for the lifetime of the kernel.
with open('credentials.config') as config_file:
    credents.read_file(config_file)

In [43]:
# Copy the AWS settings out of the parsed config into Python variables, so that
# no one can see the key or secret in the notebook itself.
aws_key, aws_secret, region = (
    credents["AWS"][option] for option in ("KEY", "SECRET", "REGION")
)

In [44]:
# Build the boto3 session that is passed explicitly to every awswrangler call below.
your_session = boto3.Session(
    region_name=region,
    aws_secret_access_key=aws_secret,
    aws_access_key_id=aws_key,
)

In [45]:
# S3 location that the later cells write to, list and finally clean up.
destination_bucket = "s3://destination-folder"

In [46]:
# List what is currently stored under the destination path.
wr.s3.list_objects(
    path=destination_bucket,
    boto3_session=your_session,
)

[]

### Writing pandas Dataframe to S3

In [47]:
# Load the top-earners CSV from the local source folder and display it.
earners_csv = pd.read_csv("source_folder/top_earners_list.csv")
earners_csv

Unnamed: 0,id,name,E-mail,Salary,occupation
0,1,Joel,Joel@Athena.com,187069,Mathematician
1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,3,Beatles,beatles@lambda.com,975682,Algorist
3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


In [48]:
# Load the jets CSV from the local source folder and display it.
earners_jets = pd.read_csv("source_folder/earners_jets.csv")
earners_jets

Unnamed: 0,id,personal_jet,distance_covered
0,1,Jet 777x,589865
1,2,Global 7500,7987856
2,3,Falcon 8X,5125768
3,4,Phenom 300,312687


## Create new database

In [49]:
# List the databases that currently exist in the Glue catalog.
wr.catalog.databases(boto3_session=your_session)

Unnamed: 0,Database,Description
0,default,Default Hive database
1,localhivedb,
2,youtube_data,


In [50]:
# Create the Glue database used by the rest of the notebook.
# exist_ok=True makes the cell safe to re-run: without it the call raises
# when 'learning_db' already exists from a previous run.
wr.catalog.create_database(
    name='learning_db',
    exist_ok=True,
    boto3_session=your_session,
)

In [51]:
# List the tables of the newly created database; it has none yet, so the
# result is an empty frame.
wr.catalog.tables(
    boto3_session=your_session,
    database='learning_db',
)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions


#### Writing the above 2 dataframes into the Glue Catalog

In [52]:
# Write the earners frame to S3 as a CSV dataset and register it in the Glue
# catalog as learning_db.top_earners.
# mode='overwrite' replaces the files under the prefix on every run; the
# default ('append') adds another copy of the rows each time the cell is
# re-run, which silently duplicates rows in later Athena queries.
wr.s3.to_csv(
    df=earners_csv,
    path=destination_bucket + '/earners',
    dataset=True,
    mode='overwrite',
    database='learning_db',
    table='top_earners',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/earners/82cfca54cb284ceda3db98cdc902630c.csv'],
 'partitions_values': {}}

In [53]:
# Write the jets frame to S3 as a CSV dataset and register it in the Glue
# catalog as learning_db.their_jets.
# mode='overwrite' replaces the files under the prefix on every run; the
# default ('append') adds another copy of the rows each time the cell is
# re-run, which silently duplicates rows in later Athena queries.
wr.s3.to_csv(
    df=earners_jets,
    path=destination_bucket + '/jets',
    dataset=True,
    mode='overwrite',
    database='learning_db',
    table='their_jets',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/jets/cdc4756cc9a345d8b80c8ad867fcb7b8.csv'],
 'partitions_values': {}}

In [24]:
# Check that the files were written by listing the destination path.
wr.s3.list_objects(path=destination_bucket, boto3_session=your_session)

['s3://destination-folder/csv/3eeab4db07e14a9c8b3ffe013a1db840.csv',
 's3://destination-folder/top_earners/1e3c702e9a434ccb894691c16c9425ea.csv']

### Bringing in Athena 

## Very important: these Athena queries are billed by AWS.

1) Describing the above tables

In [25]:
# Show the column names and types registered for the their_jets table.
wr.athena.describe_table(
    table='their_jets',
    database='learning_db',
    boto3_session=your_session,
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,__index_level_0__,bigint,False,
1,id,bigint,False,
2,_personal_jet,string,False,
3,_distance_covered,bigint,False,


In [26]:
# Show the column names and types registered for the top_earners table.
wr.athena.describe_table(
    table='top_earners',
    database='learning_db',
    boto3_session=your_session,
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,__index_level_0__,bigint,False,
1,id,bigint,False,
2,name,string,False,
3,e_mail,string,False,
4,salary,bigint,False,
5,occupation,string,False,


In [26]:
# Run a SELECT against the top_earners table through Athena.
wr.athena.read_sql_query(
    sql='SELECT * FROM top_earners',
    database='learning_db',
    boto3_session=your_session,
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,__index_level_0__,bigint,False,
1,id,bigint,False,
2,name,string,False,
3,e_mail,string,False,
4,salary,bigint,False,
5,occupation,string,False,


In [27]:
# Collect the list of Athena query execution IDs for this session.
list_execution = wr.athena.list_query_executions(boto3_session=your_session)

In [28]:
# Number of query executions returned by Athena.
len(list_execution)

96

In [38]:
# Take the last query execution ID in the returned list.
list_execution[-1]

'b32bf2d0-f573-4168-8316-8a1ac836ae88'

In [None]:
# Fetch the results of the query execution picked from list_execution.
# Reading the ID from the list instead of pasting it in keeps the cell valid
# on re-runs, when a previously copied execution ID may no longer exist.
wr.athena.get_query_results(
    query_execution_id=list_execution[-1],
    boto3_session=your_session,
)

In [23]:
# Print the CREATE TABLE statement behind the their_jets table.
wr.athena.show_create_table(
    database='learning_db',
    table='their_jets',
    boto3_session=your_session,
)

"CREATE EXTERNAL TABLE `their_jets`( `__index_level_0__` bigint, `id` bigint, `_personal_jet` string, `_distance_covered` bigint) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' WITH SERDEPROPERTIES ( 'escape.delim'='\\\\') STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 's3://destination-folder/csv' TBLPROPERTIES ( 'areColumnsQuoted'='false', 'classification'='csv', 'columnsOrdered'='true', 'compressionType'='none', 'delimiter'=',', 'projection.enabled'='false', 'typeOfData'='file')"

In [33]:
# Join the two catalog tables on id in Athena and keep the result in joined_df.
# The query output is staged under the joined_table prefix of the destination.
joined_df = wr.athena.read_sql_query(
    database='learning_db',
    boto3_session=your_session,
    s3_output=destination_bucket+'/joined_table',
    sql="""SELECT t.id, t.e_mail,
                            salary,occupation,j._distance_covered, 
                            j._personal_jet
                            FROM their_jets as j
                            JOIN top_earners as t 
                            ON j.id = t.id""",
)

In [11]:
# Display the result of the join query.
joined_df

Unnamed: 0,id,e_mail,salary,occupation,_distance_covered,_personal_jet
0,1,Joel@Athena.com,187069,Mathematician,589865,Jet 777x
1,2,Afro@Glue.aws.in,752689,Physicist,7987856,Global 7500
2,3,beatles@lambda.com,975682,Algorist,5125768,Falcon 8X
3,4,snoopy@apigateway.com,752689,Artificial Rapper,312687,Phenom 300


In [12]:
# Empty Athena's default query-results bucket for this account and region.
# The bucket name is derived from the session instead of being pasted in, so
# the notebook does not expose an AWS account ID and works from any account.
# NOTE: this deletes every object in that bucket.
athena_results_bucket = (
    "s3://aws-athena-query-results-"
    f"{wr.sts.get_account_id(boto3_session=your_session)}-"
    f"{your_session.region_name}/"
)
wr.s3.delete_objects(path=athena_results_bucket,
                     boto3_session=your_session)

In [14]:
# List the destination path again to see what the queries above left behind.
wr.s3.list_objects(
    boto3_session=your_session,
    path=destination_bucket,
)

['s3://destination-folder/csv/29c8b74af64b431bbd322b3e07b1ae51.csv',
 's3://destination-folder/csv/54579cdbd6ff4760b51509e1000e24bf.csv',
 's3://destination-folder/earners/e62bc8ed300843cf9aaf3d3b9c25477e.csv',
 's3://destination-folder/jets/2272b2b4632b45ecb7273e08c580d1c0.csv',
 's3://destination-folder/joined_table/tables/2635b3b8-edfd-4cff-8440-65e1ddb1302e-manifest.csv',
 's3://destination-folder/joined_table/tables/2635b3b8-edfd-4cff-8440-65e1ddb1302e.metadata',
 's3://destination-folder/joined_table/tables/5ee36faf-8c29-4131-b323-5dd2bd65d9ae-manifest.csv',
 's3://destination-folder/joined_table/tables/5ee36faf-8c29-4131-b323-5dd2bd65d9ae.metadata',
 's3://destination-folder/joined_table/temp_table_2a473b62c37f425b8407129e5036ebf7/20230220_074610_00083_sumcn_25c9f988-b564-4df2-915f-003eeb384fb6',
 's3://destination-folder/joined_table/temp_table_9f9565dd91f7460698469c3e4c96fe4b/20230220_064823_00114_x2mwu_d87f3145-031c-4ce5-ad65-a376c1649aff']

In [None]:
### This query will not work: no table named jets_earners was ever registered
### in learning_db (only top_earners and their_jets were written to the catalog).
### NOTE(review): data_source looks like it expects a catalog name rather than
### an S3 path - confirm against the awswrangler documentation.
#wr.athena.read_sql_query(sql='SELECT * FROM jets_earners',
 #                       boto3_session=your_session,
  #                       database='learning_db',
   #                      data_source=destination_bucket+"/joined_table")

In [15]:
# Run the same join through Athena UNLOAD, which writes the result files under
# the unload_location prefix of the destination.
wr.athena.unload(
    database='learning_db',
    boto3_session=your_session,
    path=destination_bucket+'/unload_location',
    sql="""SELECT t.id, t.e_mail,
                            salary,occupation,j._distance_covered, 
                            j._personal_jet
                            FROM their_jets as j
                            JOIN top_earners as t 
                            ON j.id = t.id""",
)

_QueryMetadata(execution_id='de61699d-6a18-46c6-8971-883447d5b384', dtype={'rows': 'Int64'}, parse_timestamps=[], parse_dates=[], converters={}, binaries=[], output_location='s3://destination-folder/unload_location/de61699d-6a18-46c6-8971-883447d5b384', manifest_location='s3://destination-folder/unload_location/de61699d-6a18-46c6-8971-883447d5b384-manifest.csv', raw_payload={'QueryExecutionId': 'de61699d-6a18-46c6-8971-883447d5b384', 'Query': "UNLOAD (SELECT t.id, t.e_mail,\n                            salary,occupation,j._distance_covered, \n                            j._personal_jet\n                            FROM their_jets as j\n                            JOIN top_earners as t \n                            ON j.id = t.id) TO 's3://destination-folder/unload_location' WITH (  format='PARQUET')", 'StatementType': 'DML', 'ResultConfiguration': {'OutputLocation': 's3://destination-folder/unload_location/de61699d-6a18-46c6-8971-883447d5b384'}, 'QueryExecutionContext': {'Database': 'l

#### Cleaning up the files

In [36]:
# Remove the tutorial database from the Glue catalog.
wr.catalog.delete_database(
    name='learning_db',
    boto3_session=your_session,
)

In [37]:
# Delete every object stored under the destination path.
wr.s3.delete_objects(
    boto3_session=your_session,
    path=destination_bucket,
)

In [35]:
# List the unload location; once the clean-up has run it should be empty.
wr.s3.list_objects(
    path=destination_bucket+"/unload_location",
    boto3_session=your_session,
)

[]

In [19]:
# Read one of the Parquet files produced by the UNLOAD query back into pandas.
# NOTE: the object name belongs to one specific run, and the file is only there
# until the clean-up cells delete the destination contents.
new_parquet = (
    destination_bucket
    + '/unload_location/20230220_075448_00032_zdr8m_f96cdc33-6a2f-4713-9d9c-fbb6358d1cf9'
)
joined_table = wr.s3.read_parquet(
    boto3_session=your_session,
    path=new_parquet,
)
joined_table

Unnamed: 0,id,e_mail,salary,occupation,_distance_covered,_personal_jet
0,1,Joel@Athena.com,187069,Mathematician,589865,Jet 777x
1,2,Afro@Glue.aws.in,752689,Physicist,7987856,Global 7500
2,3,beatles@lambda.com,975682,Algorist,5125768,Falcon 8X
3,4,snoopy@apigateway.com,752689,Artificial Rapper,312687,Phenom 300
