### Reading the files into AWS Wrangler and writing to S3

In [1]:
#import below libraries
import awswrangler as wr
import pandas as pd
import boto3
import warnings 
warnings.filterwarnings('ignore')
import configparser

In [2]:
#reading the credentials securely.
credents = configparser.ConfigParser()

In [3]:
#use read_file method
credents.read_file(open('credentials.config'))

In [4]:
#Reading in the credentials into Python variables. No can see them
aws_key = credents["AWS"]["KEY"]
aws_secret = credents["AWS"]["SECRET"]
region = credents["AWS"]["REGION"]

In [5]:
#Creating the Session
your_session = boto3.Session(aws_access_key_id=aws_key,
                            aws_secret_access_key=aws_secret,
                            region_name=region)

### Since we are reading from S3. The bucket will be source

In [6]:
source_bucket = "s3://destination-folder"

In [7]:
wr.s3.list_directories(path=source_bucket,boto3_session=your_session)

[]

In [37]:
csv_folder = "/csv"
excel_folder = "/excel"
parquet_folder = "/parquet"
json_folder = '/json'

In [39]:
#Always build the path for making it easy for programming
csv_path = source_bucket + csv_folder
csv_path

's3://destination-folder/csv'

### Reading CSV file inside S3

wr.s3.read_csv( path: Union[str, List[str]],
    
    path_suffix: Union[str, List[str], NoneType] = None,
    
    path_ignore_suffix: Union[str, List[str], NoneType] = None,
    
    boto3_session: Optional[boto3.session.Session] = None,
    
    chunksize: Optional[int] = None,
    
    dataset: bool = False) 

In [40]:
read_wr_csv = wr.s3.read_csv(path=csv_path,
                             boto3_session=your_session,)

In [41]:
read_wr_csv

Unnamed: 0.1,Unnamed: 0,id,name,E-mail,Salary,occupation
0,0,1,Joel,Joel@Athena.com,187069,Mathematician
1,1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,2,3,Beatles,beatles@lambda.com,975682,Algorist
3,3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


In [42]:
type(read_wr_csv)

pandas.core.frame.DataFrame

In [43]:
read_wr_csv[read_wr_csv.Salary > 900000]

Unnamed: 0.1,Unnamed: 0,id,name,E-mail,Salary,occupation
2,2,3,Beatles,beatles@lambda.com,975682,Algorist


### Commonly used params in wr.s3.to_csv()
wr.s3.to_csv(df: pandas.core.frame.DataFrame,
    
    path: Optional[str] = None,
    
    index: bool = True,
    
    boto3_session: Optional[boto3.session.Session] = None,
    
    dataset: bool = False,
    
    filename_prefix: Optional[str] = None,
    
    partition_cols: Optional[List[str]] = None,
    
    bucketing_info: Optional[Tuple[List[str], int]] = None,
    
    database: Optional[str] = None,
    
    table: Optional[str] = None)

#### Reading xls file : Need to give full path

In [44]:
#Always build the path for making it easy for programming
excel_path = source_bucket + excel_folder
excel_path

's3://destination-folder/excel'

### You have to provide the full path for the XLS files with Prefix

In [45]:
read_excel = wr.s3.read_excel(path=excel_path + '/write_top_earners.xls',
                              boto3_session=your_session)
read_excel

Unnamed: 0.1,Unnamed: 0,id,name,E-mail,Salary,occupation
0,0,1,Joel,Joel@Athena.com,187069,Mathematician
1,1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,2,3,Beatles,beatles@lambda.com,975682,Algorist
3,3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


#### reading parquet file

In [46]:
source = source_bucket+parquet_folder
source

's3://destination-folder/parquet'

In [47]:
wr.s3.list_objects(path=source,boto3_session=your_session)

['s3://destination-folder/parquet/db1c4a0fa56c4db9b9d6044cd94cd5e1.snappy.parquet',
 's3://destination-folder/parquet_exercise/7530e940483a46ce8ca246de57a3291c.snappy.parquet']

In [49]:
read_parquet = wr.s3.read_parquet(path=source+"/db1c4a0fa56c4db9b9d6044cd94cd5e1.snappy.parquet",
                              boto3_session=your_session)
read_parquet

Exception ignored in: <function _S3ObjectBase.__del__ at 0x7f88a1e1b910>
Traceback (most recent call last):
  File "/home/solverbot/.local/lib/python3.10/site-packages/awswrangler/s3/_fs.py", line 243, in __del__
    self.close()
  File "/home/solverbot/.local/lib/python3.10/site-packages/awswrangler/s3/_fs.py", line 474, in close
    _utils.try_it(
  File "/home/solverbot/.local/lib/python3.10/site-packages/awswrangler/_utils.py", line 348, in try_it
    return f(**kwargs)
  File "/home/solverbot/.local/lib/python3.10/site-packages/botocore/client.py", line 507, in _api_call
    return self._make_api_call(operation_name, kwargs)
  File "/home/solverbot/.local/lib/python3.10/site-packages/botocore/client.py", line 902, in _make_api_call
    request_dict = self._convert_to_request_dict(
  File "/home/solverbot/.local/lib/python3.10/site-packages/botocore/client.py", line 973, in _convert_to_request_dict
    request_dict = self._serializer.serialize_to_request(
  File "/home/solverbot/.l

Unnamed: 0,id,name,E-mail,Salary,occupation
0,1,Joel,Joel@Athena.com,187069,Mathematician
1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,3,Beatles,beatles@lambda.com,975682,Algorist
3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


#### reading json file

In [50]:
source = source_bucket+ json_folder
source

's3://destination-folder/json'

In [51]:
json_read = wr.s3.read_json(path= source,
    boto3_session=your_session)

In [52]:
json_read

Unnamed: 0,id,name,E-mail,Salary,occupation
0,1,Joel,Joel@Athena.com,187069,Mathematician
1,2,Afro,Afro@Glue.aws.in,752689,Physicist
2,3,Beatles,beatles@lambda.com,975682,Algorist
3,4,Snoop Dog,snoopy@apigateway.com,752689,Artificial Rapper


#### reading fwf file

In [9]:
source = source_bucket
source

's3://destination-folder'

In [53]:
!ls source_folder/

earners_jets.csv      top_earners_list.txt
top_earners_list.csv  top_earners_list.xlsx


In [54]:
!cat source_folder/top_earners_list.txt

id	name	E-mail	Salary	occupation
1	Joel	Joel@Athena.com	187069	Mathematician
2	Afro	Afro@Glue.aws.in	752689	Physicist
3	Beatles	beatles@lambda.com	975682	Algorist
4	Snoop Dog	snoopy@apigateway.com	752689	Artificial Rapper


In [12]:
wr.s3.upload(local_file='source_folder/top_earners_list.txt',path=source+'/top_earners_list.txt',
             boto3_session=your_session,)

In [31]:
pd.read_fwf('source_folder/top_earners_list.txt',infer_nrows=True,colspecs='infer',)

Unnamed: 0,id,name,E-mail,Salary,occupation
0,1,oel,oel@At,ena.co,187069\tMa
1,2,fro,fro@Gl,e.aws.,n\t752689\tP
2,3,eatl,s\tbeat,es@lam,da.com\t975
3,4,noop,Dog\tsn,opy@ap,gateway.co


In [27]:
fwf_read = wr.s3.read_fwf(path= source +'/top_earners_list.txt',
                          boto3_session=your_session,ignore_empty=True,)

In [28]:
fwf_read

Unnamed: 0,id\tname\tE-mail\tSalary\toccupation,Unnamed: 1
0,1\tJoel\tJoel@Athena.com\t187069\tMathematician,
1,2\tAfro\tAfro@Glue.aws.in\t752689\tPhysicist,
2,3\tBeatles\tbeatles@lambda.com\t975682\tAlgorist,
3,4\tSnoop Dog\tsnoopy@apigateway.com\t752689\tA...,Rapper


In [36]:
wr.s3.download(path=source+"/top_earners_list.txt",boto3_session=your_session,
              local_file="local_destination/top_earners_download.txt")

#### Cleaning up

In [60]:
wr.s3.delete_objects(path=source_bucket,boto3_session=your_session)

In [62]:
# Verify the data is written using list_objects

wr.s3.list_objects(source_bucket,boto3_session=your_session)

[]