### Reading the files into AWS Wrangler and writing to S3

In [1]:
#import below libraries
import awswrangler as wr
import pandas as pd
import boto3
import warnings 
warnings.filterwarnings('ignore')
import configparser

In [2]:
# Create an empty ConfigParser; the credentials file is loaded into it in the next cell.
credents = configparser.ConfigParser()

In [3]:
# Parse credentials.config into the parser. The context manager closes the
# file handle as soon as parsing finishes instead of leaving it open for the
# lifetime of the kernel.
with open('credentials.config') as config_file:
    credents.read_file(config_file)

In [4]:
# Copy the AWS settings out of the parsed config into plain variables, so the
# secret values are never typed into the notebook itself.
aws_section = credents["AWS"]
aws_key = aws_section["KEY"]
aws_secret = aws_section["SECRET"]
region = aws_section["REGION"]

In [5]:
# Build one boto3 session from the configured key, secret and region; it is
# passed to the awswrangler calls below through boto3_session.
your_session = boto3.Session(
    aws_access_key_id=aws_key,
    aws_secret_access_key=aws_secret,
    region_name=region,
)

In [6]:
# S3 URI that every write, listing and delete below is rooted at.
destination_bucket = "s3://destination-folder"

### Writing pandas DataFrame to S3

In [8]:
# Load the top-earners CSV from the source folder and display it.
earners_csv = pd.read_csv("source_folder/top_earners_list.csv")
earners_csv

Unnamed: 0,id,name,E-mail,Salary,occupation
0,1,Joel,Joel@Athena.com,187069,Mathematician
1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,3,Beatles,beatles@lambda.com,975682,Algorist
3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


In [9]:
# Load the jets CSV from the source folder and display it.
# NOTE(review): earners_jets is not referenced by any later cell visible here.
earners_jets = pd.read_csv("source_folder/earners_jets.csv")
earners_jets

Unnamed: 0,id,personal_jet,distance_covered
0,1,Jet 777x,589865
1,2,Global 7500,7987856
2,3,Falcon 8X,5125768
3,4,Phenom 300,312687


### Let's check the Glue Catalog

1. Check the Glue Catalog

2. Check the database names

In [11]:
# List the databases currently in the Glue Catalog.
wr.catalog.databases(boto3_session=your_session)

Unnamed: 0,Database,Description
0,airbnbtables,
1,default,Default Hive database
2,localhivedb,
3,rdsdatabase,
4,tabmcqhoc,
5,youtube_data,


In [13]:
# List the tables registered under the existing youtube_data database.
wr.catalog.tables(
    database='youtube_data',
    boto3_session=your_session,
)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,youtube_data,combined_table,,EXTERNAL_TABLE,"video_id, trending_date, title, channel_title,...",
1,youtube_data,combined_table_yt,,EXTERNAL_TABLE,"cat_reg_count, category_id, region",
2,youtube_data,youtube_rawcsv,,EXTERNAL_TABLE,"video_id, trending_date, title, channel_title,...",region


In [15]:
# Create the database that the tables written below are registered in.
# exist_ok=True keeps this cell re-runnable: without it, running the cell a
# second time fails because the database already exists.
wr.catalog.create_database(
    name='loading_file',
    exist_ok=True,
    boto3_session=your_session,
)

In [17]:
# List the tables in the newly created database; it is expected to be empty.
wr.catalog.tables(
    database='loading_file',
    boto3_session=your_session,
)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions


#### Writing a CSV file to S3 and registering it in the Glue Catalog

In [18]:
# Write the DataFrame to S3 as CSV and register it as table top_learners_csv
# in the loading_file database. With dataset=True the recorded output shows
# the object written under a write_top_earners.csv/ prefix with a generated
# file name, not as a single file at that exact key.
# NOTE(review): the catalog listing recorded later shows an __index_level_0__
# column for this table; consider index=False if the index should not be
# stored - TODO confirm.
# NOTE(review): the table name says "learners" while the data and the Parquet
# table say "earner" - confirm the spelling is intended.
wr.s3.to_csv(
    df=earners_csv,
    path=destination_bucket + '/csv/write_top_earners.csv',
    dataset=True,
    database='loading_file',
    table='top_learners_csv',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/csv/write_top_earners.csv/6b31a514686f478da14844d9f9a4d293.csv'],
 'partitions_values': {}}

In [19]:
# Confirm the CSV object now exists by listing everything under the bucket.
wr.s3.list_objects(
    destination_bucket,
    boto3_session=your_session,
)

['s3://destination-folder/csv/write_top_earners.csv/6b31a514686f478da14844d9f9a4d293.csv',
 's3://destination-folder/top_earners_list.csv',
 's3://destination-folder/top_earners_list.txt']

In [21]:
# Confirm the CSV table is now registered in the Glue Catalog.
wr.catalog.tables(
    database='loading_file',
    boto3_session=your_session,
)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,loading_file,top_learners_csv,,EXTERNAL_TABLE,"__index_level_0__, id, name, e_mail, salary, o...",


#### Writing as JSON file

In [23]:
# Write the same DataFrame as JSON under the json/ prefix and register it as
# table top_learners_json in the loading_file database.
# NOTE(review): the catalog listing recorded later shows an __index_level_0__
# column for this table as well - confirm the index is meant to be stored.
wr.s3.to_json(
    df=earners_csv,
    path=destination_bucket + '/json',
    dataset=True,
    database='loading_file',
    table='top_learners_json',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/json/9b98665ad1074525b542d1d38c0b325e.json'],
 'partitions_values': {}}

In [24]:
# Confirm the JSON object now sits alongside the CSV object in the bucket.
wr.s3.list_objects(
    destination_bucket,
    boto3_session=your_session,
)

['s3://destination-folder/csv/write_top_earners.csv/6b31a514686f478da14844d9f9a4d293.csv',
 's3://destination-folder/json/9b98665ad1074525b542d1d38c0b325e.json',
 's3://destination-folder/top_earners_list.csv',
 's3://destination-folder/top_earners_list.txt']

In [25]:
# Confirm the JSON table has been added to the Glue Catalog.
wr.catalog.tables(
    database='loading_file',
    boto3_session=your_session,
)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,loading_file,top_learners_csv,,EXTERNAL_TABLE,"__index_level_0__, id, name, e_mail, salary, o...",
1,loading_file,top_learners_json,,EXTERNAL_TABLE,"id, name, e_mail, salary, occupation, __index_...",


#### Writing as Parquet file

In [26]:
# Write the same DataFrame as Parquet under the parquet/ prefix and register
# it as table top_earner_pqt in the loading_file database.
wr.s3.to_parquet(
    df=earners_csv,
    path=destination_bucket + '/parquet/',
    dataset=True,
    database='loading_file',
    table='top_earner_pqt',
    boto3_session=your_session,
)

{'paths': ['s3://destination-folder/parquet/cffaea488ab84ecab8d73ebc4f344d10.snappy.parquet'],
 'partitions_values': {}}

In [27]:
# Confirm the Parquet object now exists next to the CSV and JSON objects.
wr.s3.list_objects(
    destination_bucket,
    boto3_session=your_session,
)

['s3://destination-folder/csv/write_top_earners.csv/6b31a514686f478da14844d9f9a4d293.csv',
 's3://destination-folder/json/9b98665ad1074525b542d1d38c0b325e.json',
 's3://destination-folder/parquet/cffaea488ab84ecab8d73ebc4f344d10.snappy.parquet',
 's3://destination-folder/top_earners_list.csv',
 's3://destination-folder/top_earners_list.txt']

In [28]:
# Confirm the Parquet table has been added to the Glue Catalog.
wr.catalog.tables(
    database='loading_file',
    boto3_session=your_session,
)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,loading_file,top_earner_pqt,,EXTERNAL_TABLE,"id, name, e_mail, salary, occupation",
1,loading_file,top_learners_csv,,EXTERNAL_TABLE,"__index_level_0__, id, name, e_mail, salary, o...",
2,loading_file,top_learners_json,,EXTERNAL_TABLE,"id, name, e_mail, salary, occupation, __index_...",


#### Cleaning up the Glue Catalog

In [29]:
# Remove the loading_file database from the Glue Catalog.
wr.catalog.delete_database(
    name='loading_file',
    boto3_session=your_session,
)

#### Cleaning up the files

In [30]:
# Delete the objects under destination_bucket. The listing recorded after this
# cell is empty, so this also removed top_earners_list.csv and
# top_earners_list.txt, which no write cell visible here created.
# NOTE(review): confirm that wiping the whole bucket is intended.
wr.s3.delete_objects(
    path=destination_bucket,
    boto3_session=your_session,
)

In [31]:
# Verify the objects were deleted: the listing should now be empty.
wr.s3.list_objects(
    destination_bucket,
    boto3_session=your_session,
)

[]