In [None]:
!pip install awswrangler geopandas

In [1]:
import awswrangler as wr
import geopandas
from shapely import wkb
# Athena only supports ST_GEOMETRY_FROM_TEXT(WKT), so geometries are converted from WKB to WKT below
import pandas as pd
import boto3

In [2]:
# Database name passed to every Athena query and catalog write in this notebook.
dbname = "accommodations-20200923"

# Zip-code areas: source table, local WKT CSV export, and WKT table name.
tb_zip, fp_zip_wkt, tb_zip_wkt = "zipcode", "zipcode_wkt.csv", "zipcode_wkt"

# Accommodations: source table, local WKT CSV export, and WKT table name.
tb_acc, fp_acc_wkt, tb_acc_wkt = (
    "accommodations",
    "accommodations_wkt.csv",
    "accommodations_wkt",
)

# Reading data from Athena

- **ctas_approach=True** (Default)

    Wraps the query with a CTAS and then reads the table data as parquet directly from s3.
    
    * `PROS`:
        - Faster for mid and big result sizes.
        - Can handle some level of nested types.
    * `CONS`:
         - Requires create/delete table permissions on Glue.
         - Does not support timestamp with time zone.
         - Does not support columns with repeated names.
         - Does not support columns with undefined data types.
         - A temporary table will be created and then deleted immediately.


- **ctas_approach=False**

    Runs a regular query on Athena and parses the regular CSV result on S3.
    
    * `PROS`:
        - Faster for small result sizes (less latency).
        - Does not require create/delete table permissions on Glue
        - Supports timestamp with time zone.
    * `CONS`:
        - Slower (but still faster than other libraries that use the regular Athena API).
        - Does not handle nested types at all.

In [None]:
%%time
wr.athena.read_sql_query(f'SELECT * FROM {tb_acc}', database=dbname, ctas_approach=False)

In [None]:
%%time
wr.athena.read_sql_query(f'SELECT * FROM {tb_acc}', database=dbname)

# Geometry Object to WKT format

Fetch the geometry objects in well-known text (WKT) format, together with additional attributes such as the zip code of each area.

Athena geospatial function [list](https://docs.aws.amazon.com/athena/latest/ug/geospatial-functions-list.html)

In [3]:
# Load the raw zip-code table from Athena.
df_zip = wr.athena.read_sql_query(f'SELECT * FROM {tb_zip}', database=dbname)

# Decode the hex-encoded WKB column into shapely geometries.
# Applying over the single column (instead of DataFrame.apply(axis=1)) avoids
# building a Series per row and still yields a Series for an empty result.
df_zip['geometry'] = df_zip['wkb_geometry'].apply(
    lambda hex_wkb: wkb.loads(hex_wkb, hex=True)
)

# Wrap in a GeoDataFrame and drop the raw WKB column now that it is decoded.
gdf = geopandas.GeoDataFrame(df_zip).drop(columns='wkb_geometry')
gdf.head()

Unnamed: 0,ogc_field,gml_id,spatial_name,spatial_alias,spatial_type,geometry
0,0,re_postleit.10115,10115,10115,Polygon,"POLYGON ((13.36586 52.53566, 13.36829 52.53330..."
1,4,re_postleit.10117,10117,10117,Polygon,"POLYGON ((13.37663 52.50819, 13.37682 52.50815..."
2,16,re_postleit.10179,10179,10179,Polygon,"POLYGON ((13.40306 52.51217, 13.40261 52.51186..."
3,28,re_postleit.10247,10247,10247,Polygon,"POLYGON ((13.45253 52.51702, 13.45271 52.51700..."
4,36,re_postleit.10315,10315,10315,Polygon,"POLYGON ((13.51385 52.50378, 13.51352 52.50376..."


In [4]:
# save the wkt file
# Export the zip-code GeoDataFrame to a ';'-separated CSV, omitting the index.
gdf.to_csv(fp_zip_wkt, index=False, sep=';')

In [None]:
# NOTE(review): disabled scratch cell - every line below is commented out, so nothing runs.
# df_zip['geometry']=df_zip.apply(lambda x: wkb.loads(x['wkb_geometry'], hex=True), axis=1)
# gdf = geopandas.GeoDataFrame(df_zip)
# gdf=gdf.drop(columns='wkb_geometry')
# gdf.head()
# !head -n 2 {fp_zipcode}
# NOTE(review): `fp_zipcode` is not defined in the visible cells; `fp_zip_wkt` is
# presumably what was meant - confirm before re-enabling.

### Parquet, S3, and the Athena catalog

In [6]:
# Reload the zip-code data from the local ';'-separated WKT CSV.
df_zip = pd.read_csv(fp_zip_wkt, sep=';')

# Region of the default boto3 session; this is None when no region is configured.
region = boto3.session.Session().region_name
if region is None:
    # Fail fast: otherwise the text "None" would be interpolated into the bucket name
    # and the error would only surface later, when writing to S3.
    raise RuntimeError(
        "No AWS region is configured for the default boto3 session; "
        "set AWS_DEFAULT_REGION or configure a region in your AWS profile."
    )

# Target S3 location for the zip-code parquet dataset.
bucket = f'beyoung-{region}-20200823'
prefix = 'geo-spatial-tutorial'
s3_zip_wkt = f's3://{bucket}/{prefix}/zip_wkt/'

In [7]:
# Description and key/value parameters passed along with the zip-code WKT table.
desc = "zip code with WKT."
param = {"source": "EU-DE", "class": "e-commerce"}

# Write the frame to S3 as a parquet dataset (mode="overwrite") under the
# given database/table names.
res = wr.s3.to_parquet(
    df=df_zip,
    path=s3_zip_wkt,
    dataset=True,
    mode="overwrite",
    database=dbname,
    table=tb_zip_wkt,
    description=desc,
    parameters=param,
)

### Verify the loaded data in Athena

In [8]:
# Read the zip-code WKT table back through Athena and preview the first rows.
df_zipwkt = wr.athena.read_sql_query(
    sql=f'SELECT * FROM {tb_zip_wkt}',
    database=dbname,
)
df_zipwkt.head()

Unnamed: 0,ogc_field,gml_id,spatial_name,spatial_alias,spatial_type,geometry
0,0,re_postleit.10115,10115,10115,Polygon,"POLYGON ((13.36585984657949 52.53565958104801,..."
1,4,re_postleit.10117,10117,10117,Polygon,"POLYGON ((13.37663404780158 52.50819300592241,..."
2,16,re_postleit.10179,10179,10179,Polygon,"POLYGON ((13.4030566190892 52.51216766131019, ..."
3,28,re_postleit.10247,10247,10247,Polygon,"POLYGON ((13.45252954534141 52.51702256953737,..."
4,36,re_postleit.10315,10315,10315,Polygon,"POLYGON ((13.51385168756656 52.50377523916661,..."


### accommodations

In [9]:
# NOTE(review): disabled - the same query is executed by the next cell.
# df_acc = wr.athena.read_sql_query(f'SELECT * FROM {tb_acc}', database=dbname)

In [10]:
# Load the raw accommodations table from Athena.
df_acc = wr.athena.read_sql_query(f'SELECT * FROM {tb_acc}', database=dbname)

# Decode the hex-encoded WKB in the 'shape' column into shapely geometries.
# Applying over the single column (instead of DataFrame.apply(axis=1)) avoids
# building a Series per row and still yields a Series for an empty result.
df_acc['geometry'] = df_acc['shape'].apply(
    lambda hex_wkb: wkb.loads(hex_wkb, hex=True)
)

# Wrap in a GeoDataFrame and drop the raw WKB column now that it is decoded.
gdf_acc = geopandas.GeoDataFrame(df_acc).drop(columns='shape')

# Save the result as a ';'-separated CSV, omitting the index.
gdf_acc.to_csv(fp_acc_wkt, sep=';', index=False)
gdf_acc.head()

Unnamed: 0,id,name,host_name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,geometry
0,7071,BrightRoom with sunny greenview!,Bright,Pankow,Helmholtzplatz,Private room,42,2,197,2018-11-04,1.75,1,26,POINT (13.41509 52.54316)
1,28268,Cozy Berlin Friedrichshain for1/6 p,Elena,Friedrichshain-Kreuzberg,Frankfurter Allee Sued FK,Entire home/apt,90,5,30,2017-08-02,0.33,1,55,POINT (13.46995 52.51339)
2,42742,Spacious 35m2 in Central Apartment,Desiree,Friedrichshain-Kreuzberg,suedliche Luisenstadt,Private room,36,1,25,2018-10-01,0.32,2,217,POINT (13.44034 52.49667)
3,57792,Bungalow mit Garten in Berlin Zehlendorf,Jo,Steglitz - Zehlendorf,Ostpreußendamm,Entire home/apt,49,2,3,2017-02-12,0.03,1,0,POINT (13.32806 52.42531)
4,81081,Beautiful Prenzlauer Berg Apt,Bernd+Katja :-),Pankow,Prenzlauer Berg Nord,Entire home/apt,66,3,238,2018-10-28,2.59,1,69,POINT (13.41528 52.54968)


In [11]:
# Target S3 location for the accommodations parquet dataset.
s3_acc_wkt = f"s3://{bucket}/{prefix}/acc_wkt/"

# Reload the accommodations data from the local ';'-separated WKT CSV.
df_acc = pd.read_csv(fp_acc_wkt, sep=";")

In [12]:
# Description and key/value parameters passed along with the accommodations WKT table.
desc = "accommodations code with WKT."
param = {"source": "airbnb", "class": "e-commerce"}

# Write the frame to S3 as a parquet dataset (mode="overwrite") under the
# given database/table names.
res = wr.s3.to_parquet(
    df=df_acc,
    path=s3_acc_wkt,
    dataset=True,
    mode="overwrite",
    database=dbname,
    table=tb_acc_wkt,
    description=desc,
    parameters=param,
)

In [13]:
# Verify in Athena: read the accommodations WKT table back and preview the first rows.
df_acc_wkt = wr.athena.read_sql_query(
    sql=f'SELECT * FROM {tb_acc_wkt}',
    database=dbname,
)
df_acc_wkt.head()

Unnamed: 0,id,name,host_name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,geometry
0,7071,BrightRoom with sunny greenview!,Bright,Pankow,Helmholtzplatz,Private room,42,2,197,2018-11-04,1.75,1,26,POINT (13.41509110451571 52.5431572633131)
1,28268,Cozy Berlin Friedrichshain for1/6 p,Elena,Friedrichshain-Kreuzberg,Frankfurter Allee Sued FK,Entire home/apt,90,5,30,2017-08-02,0.33,1,55,POINT (13.46994750777965 52.51338523184102)
2,42742,Spacious 35m2 in Central Apartment,Desiree,Friedrichshain-Kreuzberg,suedliche Luisenstadt,Private room,36,1,25,2018-10-01,0.32,2,217,POINT (13.44033972223182 52.4966675178124)
3,57792,Bungalow mit Garten in Berlin Zehlendorf,Jo,Steglitz - Zehlendorf,Ostpreußendamm,Entire home/apt,49,2,3,2017-02-12,0.03,1,0,POINT (13.3280616113794 52.42530982912259)
4,81081,Beautiful Prenzlauer Berg Apt,Bernd+Katja :-),Pankow,Prenzlauer Berg Nord,Entire home/apt,66,3,238,2018-10-28,2.59,1,69,POINT (13.41527791046659 52.54968096201051)


# Find the most expensive accommodation

Get the most expensive accommodation (price=9000) and show which zip code it is in.

```
SELECT a.price, a.name, a.geometry, z.spatial_name
FROM accommodations_wkt a, zipcode_wkt z 
WHERE price = 9000 AND ST_Within(ST_GEOMETRY_FROM_TEXT(a.geometry), ST_GEOMETRY_FROM_TEXT(z.geometry));
```

ST_WITHIN (geometry, geometry)
Returns TRUE if and only if the left geometry is within the right geometry.



In [16]:
# Spatial join: keep accommodations priced at 9000 whose point geometry lies
# within a zip-code polygon. Both WKT columns are converted with
# ST_GEOMETRY_FROM_TEXT before ST_Within is applied.
sql = (
    "\n"
    "SELECT a.price, a.name, a.geometry, z.spatial_name\n"
    "FROM accommodations_wkt a, zipcode_wkt z \n"
    "WHERE price = 9000 AND ST_Within(ST_GEOMETRY_FROM_TEXT(a.geometry), ST_GEOMETRY_FROM_TEXT(z.geometry));\n"
)

In [17]:
# Execute the spatial-join query on Athena; the resulting frame is the cell output.
wr.athena.read_sql_query(
    sql=sql,
    database=dbname,
)

Unnamed: 0,price,name,geometry,spatial_name
0,9000,Ueber den Dächern Berlins Zentrum,POINT (13.33443698501297 52.49797795015381),10777


## Convert text into a geometry data type in Athena

ST_GEOMETRY_FROM_TEXT (varchar)
Converts text into a geometry data type. Returns a value in a geometry data type, which is a binary representation of the geometry data type.

In [None]:
# NOTE(review): disabled example - every line below is commented out, so nothing runs.
# If re-enabled, it would rebind `sql` and select zipcode_wkt rows with the WKT
# column passed through ST_GEOMETRY_FROM_TEXT.
# sql='''
# select ogc_field, spatial_name, spatial_type, ST_GEOMETRY_FROM_TEXT(geometry) as geo
# from zipcode_wkt order by spatial_name
# '''
# wr.athena.read_sql_query(sql, database=dbname)