In [1]:
import awswrangler as wr

In [2]:
# Name of the S3 bucket used by this tutorial; replace with a bucket you can write to.
bucket = "my-bucket-556-115-65"
# S3 prefix derived from the bucket name.
path = f"s3://{bucket}/data/"

In [3]:
# Create the Glue database used by this tutorial.
# exist_ok=True keeps the cell safe to re-run when the database already exists,
# without depending on a listing of the catalog (which is size-limited and whose
# `.values` also contains the description column, not only database names).
wr.catalog.create_database("awswrangler_test", exist_ok=True)

In [4]:
# Column names to assign to the CSV columns, in file order.
cols = [
    "id",
    "dt",
    "element",
    "value",
    "m_flag",
    "q_flag",
    "s_flag",
    "obs_time",
]

# Read 10 files from the 1890 decade (~1GB); "dt" and "obs_time" are parsed as dates.
df = wr.s3.read_csv(
    path="s3://noaa-ghcn-pds/csv/189",
    names=cols,
    parse_dates=["dt", "obs_time"],
)

# Preview the first rows.
df.head()

Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time
0,AGE00135039,1890-01-01,TMAX,160,,,E,
1,AGE00135039,1890-01-01,TMIN,30,,,E,
2,AGE00135039,1890-01-01,PRCP,45,,,E,
3,AGE00147705,1890-01-01,TMAX,140,,,E,
4,AGE00147705,1890-01-01,TMIN,74,,,E,


In [5]:
# Write the DataFrame to `path` as a Parquet dataset (overwrite mode) and
# register it as table "noaa" in the "awswrangler_test" database.
# The trailing ";" suppresses the cell's output.
wr.s3.to_parquet(
    df=df,
    path=path,
    database="awswrangler_test",
    table="noaa",
    dataset=True,
    mode="overwrite",
);

In [6]:
# Display the catalog metadata (column names, types, partition flags) of the table.
wr.catalog.table(
    database="awswrangler_test",
    table="noaa",
)

Unnamed: 0,Column Name,Type,Partition,Comment
0,id,string,False,
1,dt,timestamp,False,
2,element,string,False,
3,value,bigint,False,
4,m_flag,string,False,
5,q_flag,string,False,
6,s_flag,string,False,
7,obs_time,string,False,


In [8]:
%%time
# Reading with ctas_approach=False
wr.athena.read_sql_query("SELECT * FROM noaa", database="awswrangler_test", ctas_approach=False)

CPU times: user 1min 36s, sys: 5.49 s, total: 1min 41s
Wall time: 3min 53s


Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time
0,AGE00135039,1890-01-01,TMAX,160,,,E,
1,AGE00135039,1890-01-01,TMIN,30,,,E,
2,AGE00135039,1890-01-01,PRCP,45,,,E,
3,AGE00147705,1890-01-01,TMAX,140,,,E,
4,AGE00147705,1890-01-01,TMIN,74,,,E,
...,...,...,...,...,...,...,...,...
29567911,UZM00038457,1899-12-31,PRCP,16,,,r,
29567912,UZM00038457,1899-12-31,TAVG,-73,,,r,
29567913,UZM00038618,1899-12-31,TMIN,-76,,,r,
29567914,UZM00038618,1899-12-31,PRCP,0,,,r,


In [9]:
%%time
# Default with ctas_approach=True - 13x faster (default)
wr.athena.read_sql_query("SELECT * FROM noaa", database="awswrangler_test")

CPU times: user 47.2 s, sys: 11.4 s, total: 58.6 s
Wall time: 56.1 s


Unnamed: 0,id,dt,element,value,m_flag,q_flag,s_flag,obs_time
0,ASN00053001,1890-01-06,PRCP,0,,,a,
1,ASN00053005,1890-01-06,PRCP,0,,,a,
2,ASN00053009,1890-01-06,PRCP,18,,,a,
3,ASN00053010,1890-01-06,PRCP,0,,,a,
4,ASN00053027,1890-01-06,PRCP,0,,,a,
...,...,...,...,...,...,...,...,...
29567911,USC00450569,1899-12-31,SNOW,0,,,6,
29567912,USC00450872,1899-12-31,TMAX,78,,,6,
29567913,USC00450872,1899-12-31,TMIN,44,,,6,
29567914,USC00450872,1899-12-31,PRCP,25,,,6,


In [None]:
%%time
wr.athena.read_sql_query("SELECT * FROM noaa", 
                         database="awswrangler_test", 
                         ctas_approach=False, unload_approach=True, 
                         s3_output=f"s3://{bucket}/unload/"
)

### Batching (Good for restricted memory environments)

In [None]:
%%time

dfs = wr.athena.read_sql_query(
    "SELECT * FROM noaa",
    database="awswrangler_test",
    chunksize=True  # Chunksize calculated automatically for ctas_approach.
)

for df in dfs:  # Batching
    print(len(df.index))

In [None]:
%%time

dfs = wr.athena.read_sql_query(
    "SELECT * FROM noaa",
    database="awswrangler_test",
    chunksize=100_000_000
)

for df in dfs:  # Batching
    print(len(df.index))