In [1]:
from datetime import datetime, timedelta
import pandas as pd
import boto3
import awswrangler as wr
import os

In [2]:
# Resource-level boto3 handle for S3, used later to obtain Bucket objects.
s3 = boto3.resource(service_name='s3')

In [7]:
def s3_write(df, filename, type:str, bucket:str = 'jacobsbucket97', prefix:str = 'sample_files'):
    """Write a DataFrame to S3 as a CSV, JSON, or Parquet object.

    The object is written to ``s3://{bucket}/{prefix}/{filename}.{type}``.
    This is a best-effort helper: ordinary failures (including an unsupported
    ``type``) are reported on stdout instead of being raised.

    Args:
        df: DataFrame handed to the awswrangler writer.
        filename: Object name without extension.
        type: One of ``'csv'``, ``'json'`` or ``'parquet'``; selects both the
            writer and the file extension.
        bucket: Target bucket name. Defaults to the bucket this notebook uses.
        prefix: Key prefix (folder) inside the bucket.

    Returns:
        None.
    """
    try:
        # Validate before touching S3 so a bad format never issues a request.
        if type not in ('csv', 'json', 'parquet'):
            raise ValueError('Please select 1 of csv, json, or parquet for type')

        path = f's3://{bucket}/{prefix}/{filename}.{type}'
        # The writers used here are wr.s3.to_csv / to_json / to_parquet, so the
        # validated format name selects the function directly.
        writer = getattr(wr.s3, f'to_{type}')
        writer(df=df, path=path)
    except Exception as e:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still stop the notebook instead of being printed away.
        print(f"Error occured, {e}")

In [37]:
def s3_read(filename, type:str, bucket:str = 'jacobsbucket97', prefix:str = 'sample_files'):
    """Read a CSV, JSON, or Parquet object from S3 into a DataFrame.

    The object is read from ``s3://{bucket}/{prefix}/{filename}.{type}``.
    This is a best-effort helper: ordinary failures (including an unsupported
    ``type``) are reported on stdout instead of being raised.

    Args:
        filename: Object name without extension.
        type: One of ``'csv'``, ``'json'`` or ``'parquet'``; selects both the
            reader and the file extension.
        bucket: Source bucket name. Defaults to the bucket this notebook uses.
        prefix: Key prefix (folder) inside the bucket.

    Returns:
        The loaded DataFrame, or ``None`` if the read failed.
    """
    try:
        # Validate before touching S3 so a bad format never issues a request.
        if type not in ('csv', 'json', 'parquet'):
            raise ValueError('Please select 1 of csv, json, or parquet for type')

        path = f's3://{bucket}/{prefix}/{filename}.{type}'
        # The readers used here are wr.s3.read_csv / read_json / read_parquet,
        # so the validated format name selects the function directly.
        reader = getattr(wr.s3, f'read_{type}')
        df = reader(path=path)
        print(f"Returning {filename}.{type} object with {len(df)} rows ...")
        return df
    except Exception as e:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still stop the notebook instead of being printed away.
        print(f"Error occured, {e}")
        # Make the failure result explicit for callers.
        return None

In [39]:
# Smoke-test the reader helper against one CSV object and one JSON object.
bby = s3_read(filename='file1', type='csv')
bby2 = s3_read(filename='file5', type='json')

Returning file1.csv object with 3 rows ...
Returning file5.json object with 3 rows ...


In [8]:
# NOTE(review): s3_write is defined in this notebook as s3_write(df, filename, type),
# so this one-argument call would raise TypeError for the missing arguments on a
# fresh "Restart & Run All". The recorded output presumably comes from an earlier
# definition of the function — confirm, then either pass all three arguments or
# remove this cell.
s3_write('parqsfsdf')

Exception: Please select 1 of csv, json, or parquet for type

In [19]:
# Bucket resource for the notebook's sample-data bucket.
bucket = s3.Bucket(name='jacobsbucket97')

In [20]:
def write_to_s3(file_type, df, bucket = None, date = None):
    """Store a DataFrame in S3 as a date-stamped Parquet object.

    The object is written to
    ``s3://{bucket}/{file_type}/{file_type}-{date}.parquet`` without the
    DataFrame index. This is a best-effort helper: ordinary failures are
    reported on stdout instead of being raised.

    Args:
        file_type: Logical dataset name; used as both the key prefix and the
            file-name stem.
        df: DataFrame to store.
        bucket: Target bucket name. When omitted, the ``S3_BUCKET`` environment
            variable is read at call time.
        date: Date stamp for the object name. Defaults to the current date;
            pass the date of the data itself when it differs from today.

    Returns:
        None.
    """
    try:
        # Resolve the bucket per call. An environment lookup in the signature
        # is evaluated once at definition time and can silently freeze None.
        if bucket is None:
            bucket = os.environ.get('S3_BUCKET')
        if not bucket:
            raise ValueError('no S3 bucket given and S3_BUCKET is not set')

        if date is None:
            date = datetime.now().date()

        # Build the path once so the log line reports the exact object written.
        path = f"s3://{bucket}/{file_type}/{file_type}-{date}.parquet"
        wr.s3.to_parquet(
            df = df,
            path = path,
            index = False
        )
        print(f"Storing {len(df)} {file_type} rows to S3 ({path})")
    except Exception as error:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still stop the notebook instead of being printed away.
        print(f"S3 Storage Function Failed for {file_type}, {error}")

In [25]:
# Load the five sample CSV objects (file1.csv .. file5.csv) in one pass instead
# of five copy-pasted calls; the frame names df1..df5 are kept for later cells.
df1, df2, df3, df4, df5 = [
    wr.s3.read_csv(f's3://jacobsbucket97/sample_files/file{file_number}.csv')
    for file_number in range(1, 6)
]

In [27]:
# Write each sample frame to file1.parquet .. file5.parquet; one loop replaces
# five copy-pasted calls.
for file_number, frame in enumerate((df1, df2, df3, df4, df5), start=1):
    write_result = wr.s3.to_parquet(
        df=frame,
        path=f's3://jacobsbucket97/sample_files/file{file_number}.parquet',
    )

# Bare last expression so the cell still displays the final call's result.
write_result

{'paths': ['s3://jacobsbucket97/sample_files/file5.parquet'],
 'partitions_values': {}}

In [30]:
# file1 is read back from its Parquet copy; files 2-5 are read from CSV.
df1 = wr.s3.read_parquet('s3://jacobsbucket97/sample_files/file1.parquet')
df2, df3, df4, df5 = [
    wr.s3.read_csv(f's3://jacobsbucket97/sample_files/file{file_number}.csv')
    for file_number in range(2, 6)
]

In [29]:
# Write each sample frame to file1.json .. file5.json; one loop replaces five
# copy-pasted calls.
for file_number, frame in enumerate((df1, df2, df3, df4, df5), start=1):
    write_result = wr.s3.to_json(
        df=frame,
        path=f's3://jacobsbucket97/sample_files/file{file_number}.json',
    )

# Bare last expression so the cell still displays the final call's result.
write_result

['s3://jacobsbucket97/sample_files/file5.json']

In [None]:
# Reload the five sample CSV objects (file1.csv .. file5.csv) in one pass
# instead of five copy-pasted calls; the frame names df1..df5 are kept.
df1, df2, df3, df4, df5 = [
    wr.s3.read_csv(f's3://jacobsbucket97/sample_files/file{file_number}.csv')
    for file_number in range(1, 6)
]

In [3]:
# Load the reddit comments dataset from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="reddit_comments.csv")

In [10]:
# Upload the dataset as a gzip-compressed CSV ...
wr.s3.to_csv(df=df, path="s3://jacobsbucket97/sample_files/reddit_comments_gzip.csv", compression='gzip')

# ... and as a snappy-compressed Parquet file. This is the cell's last
# expression, so its return value is what gets displayed.
wr.s3.to_parquet(df=df, path="s3://jacobsbucket97/sample_files/reddit_comments_snappy.parquet", compression='snappy')

{'paths': ['s3://jacobsbucket97/sample_files/reddit_comments_gzip.csv'],
 'partitions_values': {}}

In [8]:
# Object sizes noted for the reddit_comments uploads:
#   raw csv        - 4.7 MB
#   zip csv        - 1.9 MB
#   raw parquet    - 2.9 MB
#   zip parquet    - 1.8 MB
#   snappy parquet - 2.9 MB

# Read every stored variant back into its own frame.
df1 = wr.s3.read_parquet(path='s3://jacobsbucket97/sample_files/reddit_comments_gzip.parquet')
df2 = wr.s3.read_parquet(path='s3://jacobsbucket97/sample_files/reddit_comments_snappy.parquet')
df3 = wr.s3.read_parquet(path='s3://jacobsbucket97/sample_files/reddit_comments_snappy.snappy')
df4 = wr.s3.read_parquet(path='s3://jacobsbucket97/sample_files/reddit_comments.parquet')
df5 = wr.s3.read_csv(path='s3://jacobsbucket97/sample_files/reddit_comments_gzip.csv', compression='gzip')

In [20]:

# 0.8
# df_csv = wr.s3.read_csv('s3://jacobsbucket97/sample_files/reddit_comments.csv')

# df_zipped_csv = wr.s3.read_csv('s3://jacobsbucket97/sample_files/reddit_comments_gzip.csv', compression = 'gzip')

# df_parquet = wr.s3.read_parquet('s3://jacobsbucket97/sample_files/reddit_comments.parquet')

# df_zipped_parquet = wr.s3.read_parquet('s3://jacobsbucket97/sample_files/reddit_comments_gzip.parquet')

In [4]:
# Pull one cost-report Parquet part from the dev bucket and save a local CSV copy.
df = wr.s3.read_parquet(
    path='s3://jacobsbucket97-dev/cost-reports/jyablonski-test-report/20220501-20220601/20220503T013620Z/jyablonski-test-report-00001.snappy.parquet'
)
df.to_csv(path_or_buf='billing_report_2022-05.csv')