### Querying Athena and Glue through AWS Wrangler

In [13]:
#import below libraries
import awswrangler as wr
import pandas as pd
import boto3
import warnings 
warnings.filterwarnings('ignore')
import configparser

In [14]:
# Create an empty ConfigParser that will hold the AWS credentials, so the
# secrets live in a local config file rather than in the notebook itself.
credents = configparser.ConfigParser()

In [15]:
# Load credentials.config into the parser. A context manager is used so the
# file handle is closed deterministically as soon as parsing finishes,
# instead of staying open for the lifetime of the kernel.
with open('credentials.config') as credentials_file:
    credents.read_file(credentials_file)

In [16]:
# Copy the AWS settings from the parsed config into Python variables, so the
# secret values never appear in the notebook source.
aws_key, aws_secret, region = (
    credents["AWS"][option] for option in ("KEY", "SECRET", "REGION")
)

In [17]:
# Build an explicit boto3 session from the loaded credentials; the
# awswrangler calls in this notebook receive it through `boto3_session`.
your_session = boto3.Session(
    region_name=region,
    aws_secret_access_key=aws_secret,
    aws_access_key_id=aws_key,
)

In [18]:
# Root S3 location that the notebook's S3 reads, writes and deletes are built from.
destination_bucket = 's3://destination-folder'

In [19]:
# List every object currently stored under the destination bucket.
wr.s3.list_objects(path=destination_bucket, boto3_session=your_session)

['s3://destination-folder/earners/b9980b5cab0b43c3bb7665559e14a76d.csv',
 's3://destination-folder/jets/1da4dfabecb9438faa7d47cb1f46e7ee.csv']

### Writing pandas DataFrames to S3

In [20]:
# Show the local files available in the source folder (IPython shell escape).
!ls source_folder/

earners_jets.csv      top_earners_list.txt
top_earners_list.csv  top_earners_list.xlsx


In [21]:
# Load the top-earners CSV from the local source folder and display it.
earners_csv = pd.read_csv(filepath_or_buffer="source_folder/top_earners_list.csv")
earners_csv

Unnamed: 0,id,name,E-mail,Salary,occupation
0,1,Joel,Joel@Athena.com,187069,Mathematician
1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,3,Beatles,beatles@lambda.com,975682,Algorist
3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


In [22]:
# Load the jets CSV from the local source folder and display it.
earners_jets = pd.read_csv(filepath_or_buffer="source_folder/earners_jets.csv")
earners_jets

Unnamed: 0,id,personal_jet,distance_covered
0,1,Jet 777x,589865
1,2,Global 7500,7987856
2,3,Falcon 8X,5125768
3,4,Phenom 300,312687


## Create new database

In [23]:
# List the databases currently registered in the Glue Catalog.
wr.catalog.databases(boto3_session=your_session)

Unnamed: 0,Database,Description
0,default,Default Hive database
1,learning_db,
2,localhivedb,
3,youtube_data,


In [24]:
# Create the learning_db database; exist_ok=True keeps the call from failing
# when the database is already present, so the cell can be re-run.
wr.catalog.create_database(
    boto3_session=your_session,
    name='learning_db',
    exist_ok=True,
)

In [14]:
# List the tables registered in learning_db; the recorded output is empty
# because nothing had been written to the database at that point.
wr.catalog.tables(boto3_session=your_session, database='learning_db')

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions


#### Writing the two DataFrames above to S3 and registering them in the Glue Catalog

In [30]:
# Write the earners DataFrame to S3 as a CSV dataset and register it in the
# Glue Catalog as learning_db.top_earners.
# mode='overwrite' replaces whatever already sits under the target prefix, so
# re-running this cell does not append a second copy of the rows to the table
# (with dataset=True the default mode is "append").
wr.s3.to_csv(
    df=earners_csv,
    path=destination_bucket + '/earners',
    dataset=True,
    mode='overwrite',
    database='learning_db',
    table='top_earners',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/earners/d16cd6aaca6c4589badc29eef70e832a.csv'],
 'partitions_values': {}}

In [31]:
# Write the jets DataFrame to S3 as a CSV dataset and register it in the
# Glue Catalog as learning_db.their_jets.
# mode='overwrite' replaces whatever already sits under the target prefix, so
# re-running this cell does not append a second copy of the rows to the table
# (with dataset=True the default mode is "append").
wr.s3.to_csv(
    df=earners_jets,
    path=destination_bucket + '/jets',
    dataset=True,
    mode='overwrite',
    database='learning_db',
    table='their_jets',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/jets/2049a5146368455dac43924caab6d2a4.csv'],
 'partitions_values': {}}

In [28]:
# Delete every object under the destination bucket.
# NOTE(review): the recorded execution counter (In [28]) shows this ran BEFORE
# the two to_csv cells (In [30], In [31]). Executed top-to-bottom it would
# remove the files those cells just wrote -- confirm the intended position.
wr.s3.delete_objects(path=destination_bucket,boto3_session=your_session)

In [32]:
# Confirm the CSV files landed in S3 by listing the bucket contents.
wr.s3.list_objects(path=destination_bucket, boto3_session=your_session)

['s3://destination-folder/earners/d16cd6aaca6c4589badc29eef70e832a.csv',
 's3://destination-folder/jets/2049a5146368455dac43924caab6d2a4.csv']

### Bringing in Athena 

## Very important: these queries are billed by AWS.

1) Describing the above tables

In [33]:
# Show the column names and types of the their_jets table.
wr.athena.describe_table(
    table='their_jets',
    database='learning_db',
    boto3_session=your_session,
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,__index_level_0__,bigint,False,
1,id,bigint,False,
2,_personal_jet,string,False,
3,_distance_covered,bigint,False,


In [34]:
# Show the column names and types of the top_earners table.
wr.athena.describe_table(
    table='top_earners',
    database='learning_db',
    boto3_session=your_session,
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,__index_level_0__,bigint,False,
1,id,bigint,False,
2,name,string,False,
3,e_mail,string,False,
4,salary,bigint,False,
5,occupation,string,False,


In [35]:
# Run a full-table SELECT against top_earners through Athena and display the
# rows as a DataFrame.
wr.athena.read_sql_query(
    sql='SELECT * FROM top_earners',
    database='learning_db',
    boto3_session=your_session,
)

Unnamed: 0,__index_level_0__,id,name,e_mail,salary,occupation
0,0,1,Joel,Joel@Athena.com,187069,Mathematician
1,1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,2,3,Beatles,beatles@lambda.com,975682,Algorist
3,3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


In [40]:
# Fetch the list of Athena query execution IDs.
list_execution = wr.athena.list_query_executions(boto3_session=your_session)

In [41]:
# Number of Athena query executions returned in list_execution.
len(list_execution)

113

In [38]:
# Take the first entry of the listing as the latest query ID
# (assumes the listing is ordered most-recent-first -- TODO confirm).
list_execution[0]

'fce9d9ec-0c32-4e26-854d-04929e022501'

In [39]:
# Re-fetch the results of an already executed Athena query by its execution ID.
# The ID is taken from list_execution rather than pasted in as a literal: a
# literal ID only exists in the account and query history it was copied from,
# so the cell would fail for anyone else re-running the notebook.
# Assumes list_execution[0] is the most recent query -- TODO confirm ordering.
wr.athena.get_query_results(
    query_execution_id=list_execution[0],
    boto3_session=your_session,
)

Unnamed: 0,__index_level_0__,id,name,e_mail,salary,occupation
0,0,1,Joel,Joel@Athena.com,187069,Mathematician
1,1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,2,3,Beatles,beatles@lambda.com,975682,Algorist
3,3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


In [42]:
# Display the CREATE EXTERNAL TABLE statement behind the their_jets table.
wr.athena.show_create_table(
    database='learning_db',
    table='their_jets',
    boto3_session=your_session,
)

"CREATE EXTERNAL TABLE `their_jets`( `__index_level_0__` bigint, `id` bigint, `_personal_jet` string, `_distance_covered` bigint) ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' WITH SERDEPROPERTIES ( 'escape.delim'='\\\\') STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION 's3://destination-folder/jets' TBLPROPERTIES ( 'areColumnsQuoted'='false', 'classification'='csv', 'columnsOrdered'='true', 'compressionType'='none', 'delimiter'=',', 'projection.enabled'='false', 'typeOfData'='file')"

In [47]:
# List the objects stored under the joined_table prefix.
# NOTE(review): recorded as In [47], i.e. run AFTER the join query (In [43])
# that uses this prefix as its s3_output, although it appears before that
# query in the file -- confirm the intended position.
wr.s3.list_objects(
    boto3_session=your_session,
    path=destination_bucket+'/joined_table',
)

['s3://destination-folder/joined_table/tables/98776d83-13f7-4563-86fe-ab5c46688d6c-manifest.csv',
 's3://destination-folder/joined_table/tables/98776d83-13f7-4563-86fe-ab5c46688d6c.metadata',
 's3://destination-folder/joined_table/temp_table_94fef8978bfc43feb930bea9b87dd2c2/20230223_060506_00011_c7z7e_75760ae5-8cf7-415b-a954-3d7c20ef5790']

In [43]:
# Join their_jets and top_earners on id through Athena and load the result
# into a DataFrame; the query output is stored under the joined_table prefix.
joined_df = wr.athena.read_sql_query(
    database='learning_db',
    s3_output=destination_bucket+'/joined_table',
    boto3_session=your_session,
    sql="""SELECT t.id, t.e_mail,
                            salary,occupation,j._distance_covered, 
                            j._personal_jet
                            FROM their_jets as j
                            JOIN top_earners as t 
                            ON j.id = t.id""",
)

In [44]:
# Display the DataFrame returned by the join query.
joined_df

Unnamed: 0,id,e_mail,salary,occupation,_distance_covered,_personal_jet
0,1,Joel@Athena.com,187069,Mathematician,589865,Jet 777x
1,2,Afro@Glue.aws.in,752689,Physicist,7987856,Global 7500
2,3,beatles@lambda.com,975682,Algorist,5125768,Falcon 8X
3,4,snoopy@apigateway.com,752689,Artificial Rapper,312687,Phenom 300


In [13]:
# List the objects under the destination bucket.
# NOTE(review): the recorded output is empty and the counter is In [13], so
# this looks like it was run in a different kernel session, after a clean-up
# of the bucket -- confirm whether the cell still belongs here.
wr.s3.list_objects(boto3_session=your_session, path=destination_bucket)

[]

In [None]:
### This query will not work
#wr.athena.read_sql_query(sql='SELECT * FROM jets_earners',
 #                       boto3_session=your_session,
  #                       database='learning_db',
   #                      data_source=destination_bucket+"/joined_table")

In [45]:
# Run the same join through Athena UNLOAD so the result set is written to the
# unload_location prefix in S3; the call returns query metadata, not the rows.
wr.athena.unload(
    database='learning_db',
    path=destination_bucket+'/unload_location',
    boto3_session=your_session,
    sql="""SELECT t.id, t.e_mail,
                            salary,occupation,j._distance_covered, 
                            j._personal_jet
                            FROM their_jets as j
                            JOIN top_earners as t 
                            ON j.id = t.id""",
)

_QueryMetadata(execution_id='f7b0821b-52b2-4b55-803e-4424e413b55d', dtype={'rows': 'Int64'}, parse_timestamps=[], parse_dates=[], converters={}, binaries=[], output_location='s3://destination-folder/unload_location/f7b0821b-52b2-4b55-803e-4424e413b55d', manifest_location='s3://destination-folder/unload_location/f7b0821b-52b2-4b55-803e-4424e413b55d-manifest.csv', raw_payload={'QueryExecutionId': 'f7b0821b-52b2-4b55-803e-4424e413b55d', 'Query': "UNLOAD (SELECT t.id, t.e_mail,\n                            salary,occupation,j._distance_covered, \n                            j._personal_jet\n                            FROM their_jets as j\n                            JOIN top_earners as t \n                            ON j.id = t.id) TO 's3://destination-folder/unload_location' WITH (  format='PARQUET')", 'StatementType': 'DML', 'ResultConfiguration': {'OutputLocation': 's3://destination-folder/unload_location/f7b0821b-52b2-4b55-803e-4424e413b55d'}, 'QueryExecutionContext': {'Database': 'l

#### Cleaning up the files

In [50]:
# Remove the learning_db database from the Glue Catalog.
# NOTE(review): recorded as In [50], i.e. run AFTER the unload results were
# listed and read (In [46], In [47]), although those cells appear further down
# the file -- confirm the intended position of the clean-up.
wr.catalog.delete_database(name='learning_db', boto3_session=your_session)

In [52]:
# Delete every object under the destination bucket.
# NOTE(review): recorded as In [52], but the cells that follow in the file
# (In [46], In [47]) list and read files under unload_location; executed
# top-to-bottom this would delete those files first -- confirm the position.
wr.s3.delete_objects(path=destination_bucket,boto3_session=your_session)

In [46]:
# Confirm the UNLOAD output was written by listing the unload_location prefix.
wr.s3.list_objects(
    path=destination_bucket+"/unload_location",
    boto3_session=your_session,
)

['s3://destination-folder/unload_location/20230223_071448_00027_5d2kf_e8525a41-f7ec-4926-a355-b547150c9d0c',
 's3://destination-folder/unload_location/f7b0821b-52b2-4b55-803e-4424e413b55d-manifest.csv',
 's3://destination-folder/unload_location/f7b0821b-52b2-4b55-803e-4424e413b55d.metadata']

In [47]:
# Read the UNLOAD output back from S3 as a DataFrame and display it.
# The data file name is generated on every UNLOAD run, so the whole
# unload_location prefix is read instead of one pasted-in object key; the
# manifest and metadata side files stored next to the Parquet data are skipped.
new_parquet = destination_bucket + '/unload_location/'
joined_table = wr.s3.read_parquet(
    path=new_parquet,
    path_ignore_suffix=['-manifest.csv', '.metadata'],
    boto3_session=your_session,
)
joined_table

Unnamed: 0,id,e_mail,salary,occupation,_distance_covered,_personal_jet
0,1,Joel@Athena.com,187069,Mathematician,589865,Jet 777x
1,2,Afro@Glue.aws.in,752689,Physicist,7987856,Global 7500
2,3,beatles@lambda.com,975682,Algorist,5125768,Falcon 8X
3,4,snoopy@apigateway.com,752689,Artificial Rapper,312687,Phenom 300


In [49]:
# Final clean-up: delete every object under the destination bucket.
wr.s3.delete_objects(path=destination_bucket,boto3_session=your_session)