In [None]:
!pip install awswrangler

In [1]:
import awswrangler as wr

In [14]:
import geopandas
from shapely import wkb

In [35]:
# Athena only support ST_GEOMETRY_FROM_TEXT(WKT)
import pandas as pd
import boto3

In [3]:
dbname = 'accommodations-20200923'
tb_acc = 'accommodations'
tb_zip = 'zipcode'

# Reading data from Athena

- **ctas_approach=True** (Default)

    Wraps the query with a CTAS and then reads the table data as parquet directly from s3.
    
    * `PROS`:
        - Faster for mid and big result sizes.
        - Can handle some level of nested types.
    * `CONS`:
         - Requires create/delete table permissions on Glue.
         - Does not support timestamp with time zone
         - Does not support columns with repeated names.
         - Does not support columns with undefined data types.
         - A temporary table will be created and then deleted immediately.


- **ctas_approach=False**

    Does a regular query on Athena and parse the regular CSV result on s3.
    
    * `PROS`:
        - Faster for small result sizes (less latency).
        - Does not require create/delete table permissions on Glue
        - Supports timestamp with time zone.
    * `CONS`:
        - Slower (But stills faster than other libraries that uses the regular Athena API)
        - Does not handle nested types at all.

In [4]:
%%time
wr.athena.read_sql_query(f'SELECT * FROM {tb_acc}', database=dbname, ctas_approach=False)

CPU times: user 863 ms, sys: 45.8 ms, total: 909 ms
Wall time: 5.06 s


Unnamed: 0,id,shape,name,host_name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,7071,0101000020E6100000CE823DD286D42A40DF415D2D8645...,BrightRoom with sunny greenview!,Bright,Pankow,Helmholtzplatz,Private room,42,2,197,2018-11-04,1.75,1,26
1,28268,0101000020E61000002780B1F59CF02A40D780769BB641...,Cozy Berlin Friedrichshain for1/6 p,Elena,Friedrichshain-Kreuzberg,Frankfurter Allee Sued FK,Entire home/apt,90,5,30,2017-08-02,0.33,1,55
2,42742,0101000020E6100000163B443574E12A40B0FE1CCD923F...,Spacious 35m2 in Central Apartment,Desiree,Friedrichshain-Kreuzberg,suedliche Luisenstadt,Private room,36,1,25,2018-10-01,0.32,2,217
3,57792,0101000020E610000029E507B1F7A72A40DB5F6F8D7036...,Bungalow mit Garten in Berlin Zehlendorf,Jo,Steglitz - Zehlendorf,Ostpreußendamm,Entire home/apt,49,2,3,2017-02-12,0.03,1,0
4,81081,0101000020E61000001269684E9FD42A40D3881DF25B46...,Beautiful Prenzlauer Berg Apt,Bernd+Katja :-),Pankow,Prenzlauer Berg Nord,Entire home/apt,66,3,238,2018-10-28,2.59,1,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22243,29800049,0101000020E610000083A934C316F12A408DF6974F8C41...,Cozy 1-room-apartment near Boxhagener Platz,Anna,Friedrichshain-Kreuzberg,Frankfurter Allee Sued FK,Entire home/apt,60,3,0,,,2,4
22244,29814822,0101000020E6100000B011F546BAB22A40B58908F34B3F...,helle ruhige und zentrale Wohnung mit 2 Balkone,Maria,Tempelhof - Schoeneberg,Schoeneberg-Nord,Entire home/apt,80,2,0,,,2,90
22245,29827304,0101000020E61000006B74707951E42A40E348C162B735...,Berlin Multi-unit building,Anisa,Neukoelln,Buckow,Private room,60,1,0,,,1,72
22246,29848903,0101000020E61000003FA1F905FDDF2A40A3AB439C8244...,Sonnige Familienwohnung in Prenzlauer Berg,Juliane,Pankow,Prenzlauer Berg Ost,Entire home/apt,30,21,0,,,1,31


In [5]:
%%time
wr.athena.read_sql_query(f'SELECT * FROM {tb_acc}', database=dbname)

CPU times: user 833 ms, sys: 68.4 ms, total: 901 ms
Wall time: 8.07 s


Unnamed: 0,id,shape,name,host_name,neighbourhood_group,neighbourhood,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,7071,0101000020E6100000CE823DD286D42A40DF415D2D8645...,BrightRoom with sunny greenview!,Bright,Pankow,Helmholtzplatz,Private room,42,2,197,2018-11-04,1.75,1,26
1,28268,0101000020E61000002780B1F59CF02A40D780769BB641...,Cozy Berlin Friedrichshain for1/6 p,Elena,Friedrichshain-Kreuzberg,Frankfurter Allee Sued FK,Entire home/apt,90,5,30,2017-08-02,0.33,1,55
2,42742,0101000020E6100000163B443574E12A40B0FE1CCD923F...,Spacious 35m2 in Central Apartment,Desiree,Friedrichshain-Kreuzberg,suedliche Luisenstadt,Private room,36,1,25,2018-10-01,0.32,2,217
3,57792,0101000020E610000029E507B1F7A72A40DB5F6F8D7036...,Bungalow mit Garten in Berlin Zehlendorf,Jo,Steglitz - Zehlendorf,Ostpreußendamm,Entire home/apt,49,2,3,2017-02-12,0.03,1,0
4,81081,0101000020E61000001269684E9FD42A40D3881DF25B46...,Beautiful Prenzlauer Berg Apt,Bernd+Katja :-),Pankow,Prenzlauer Berg Nord,Entire home/apt,66,3,238,2018-10-28,2.59,1,69
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,4543640,0101000020E6100000C8EF8909B9D62A405E88A9487240...,Few months in Kreuzberg.,Stefano,Friedrichshain-Kreuzberg,noerdliche Luisenstadt,Entire home/apt,35,91,7,2018-04-29,0.15,1,5
1020,4570231,0101000020E61000006919089667D02A40014F51C59648...,Zimmer mit Balkon mit gruenem Ausblick.,Sandra,Pankow,Pankow Zentrum,Private room,40,3,3,2018-05-27,0.47,2,220
1021,4602829,0101000020E61000006E6FFA170DAA2A40664951F5953F...,West Berlin Apartment A 212,Jurij,Charlottenburg-Wilm.,Duesseldorfer Straße,Entire home/apt,31,90,1,2015-03-30,0.02,5,251
1022,4643340,0101000020E6100000A4E74CDC95C72A4080D86E7DAC43...,Heart of Berlin Mitte • Brandenburger Gate • 6 P,Andrea,Mitte,Brunnenstr. Sued,Entire home/apt,70,1,170,2018-11-03,4.91,4,227


# Geometry Object to WKT format

Fetch geometry object in well known text (WKT) format and additional attributes such as the zip code of this area.

Athena geospatial function [list](https://docs.aws.amazon.com/athena/latest/ug/geospatial-functions-list.html)

In [11]:
df_zip = wr.athena.read_sql_query(f'SELECT * FROM {tb_zip}', database=dbname)
df_zip.head()

Unnamed: 0,ogc_field,wkb_geometry,gml_id,spatial_name,spatial_alias,spatial_type
0,0,0103000020E6100000010000004B010000FCF457FB51BB...,re_postleit.10115,10115,10115,Polygon
1,4,0103000020E6100000010000004B0100005CBC8B2DD6C0...,re_postleit.10117,10117,10117,Polygon
2,16,0103000020E6100000010000000A01000051D9EA6F5DCE...,re_postleit.10179,10179,10179,Polygon
3,28,0103000020E610000001000000BD000000226EDBF3B1E7...,re_postleit.10247,10247,10247,Polygon
4,36,0103000020E61000000100000070010000812F82911707...,re_postleit.10315,10315,10315,Polygon


In [16]:
df_zip['geometry']=df_zip.apply(lambda x: wkb.loads(x['wkb_geometry'], hex=True), axis=1)

In [17]:
gdf = geopandas.GeoDataFrame(df_zip)
gdf.head()

Unnamed: 0,ogc_field,wkb_geometry,gml_id,spatial_name,spatial_alias,spatial_type,geometry
0,0,0103000020E6100000010000004B010000FCF457FB51BB...,re_postleit.10115,10115,10115,Polygon,"POLYGON ((13.36586 52.53566, 13.36829 52.53330..."
1,4,0103000020E6100000010000004B0100005CBC8B2DD6C0...,re_postleit.10117,10117,10117,Polygon,"POLYGON ((13.37663 52.50819, 13.37682 52.50815..."
2,16,0103000020E6100000010000000A01000051D9EA6F5DCE...,re_postleit.10179,10179,10179,Polygon,"POLYGON ((13.40306 52.51217, 13.40261 52.51186..."
3,28,0103000020E610000001000000BD000000226EDBF3B1E7...,re_postleit.10247,10247,10247,Polygon,"POLYGON ((13.45253 52.51702, 13.45271 52.51700..."
4,36,0103000020E61000000100000070010000812F82911707...,re_postleit.10315,10315,10315,Polygon,"POLYGON ((13.51385 52.50378, 13.51352 52.50376..."


In [24]:
gdf=gdf.drop(columns='wkb_geometry')
gdf.head()

Unnamed: 0,ogc_field,gml_id,spatial_name,spatial_alias,spatial_type,geometry
0,0,re_postleit.10115,10115,10115,Polygon,"POLYGON ((13.36586 52.53566, 13.36829 52.53330..."
1,4,re_postleit.10117,10117,10117,Polygon,"POLYGON ((13.37663 52.50819, 13.37682 52.50815..."
2,16,re_postleit.10179,10179,10179,Polygon,"POLYGON ((13.40306 52.51217, 13.40261 52.51186..."
3,28,re_postleit.10247,10247,10247,Polygon,"POLYGON ((13.45253 52.51702, 13.45271 52.51700..."
4,36,re_postleit.10315,10315,10315,Polygon,"POLYGON ((13.51385 52.50378, 13.51352 52.50376..."


In [28]:
fp_zipcode='zipcode_wkt.csv'
gdf.to_csv(fp_zipcode,sep=';',index=False)

In [None]:
!head -n 2 {fp_zipcode}

In [36]:
df_zip = pd.read_csv(fp_zipcode, sep=';')
region = boto3.session.Session().region_name
bucket = f'beyoung-{region}-20200823'
prefix = 'geo-spatial-tutorial'
fp_glue_db_zip = f's3://{bucket}/{prefix}/zip_wkt/'
dbname = 'accommodations-20200923'

In [37]:
desc = "zip code with WKT."
param = {
    "source": "EU-DE",
    "class": "e-commerce"
}

res = wr.s3.to_parquet(
    df=df_zip,
    path=fp_glue_db_zip,
    dataset=True,
    database=dbname,
    table="zipcode_wkt",
    mode="overwrite",
    description=desc,
    parameters=param
)

In [38]:
df_zipwkt = wr.athena.read_sql_query(f'SELECT * FROM zipcode_wkt', database=dbname)
df_zipwkt.head()

Unnamed: 0,ogc_field,gml_id,spatial_name,spatial_alias,spatial_type,geometry
0,0,re_postleit.10115,10115,10115,Polygon,"POLYGON ((13.36585984657949 52.53565958104801,..."
1,4,re_postleit.10117,10117,10117,Polygon,"POLYGON ((13.37663404780158 52.50819300592241,..."
2,16,re_postleit.10179,10179,10179,Polygon,"POLYGON ((13.4030566190892 52.51216766131019, ..."
3,28,re_postleit.10247,10247,10247,Polygon,"POLYGON ((13.45252954534141 52.51702256953737,..."
4,36,re_postleit.10315,10315,10315,Polygon,"POLYGON ((13.51385168756656 52.50377523916661,..."


## Converts text into a geometry data type

ST_GEOMETRY_FROM_TEXT (varchar)
Converts text into a geometry data type. Returns a value in a geometry data type, which is a binary representation of the geometry data type.

In [40]:
sql='''
select ogc_field, spatial_name, spatial_type, ST_GEOMETRY_FROM_TEXT(geometry) as geo
from zipcode_wkt order by spatial_name
'''
wr.athena.read_sql_query(sql, database=dbname)

Unnamed: 0,ogc_field,spatial_name,spatial_type,geo
0,752,14197,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00\x10\xcc...
1,756,14199,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00;\xe6FX\...
0,548,13159,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00\xcf\xad...
1,552,13187,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00\x86\xa5...
2,556,13189,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00{\x12\x1...
...,...,...,...,...
17,212,10963,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00\xb1\xa5...
18,216,10965,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00-\x8c\xb...
19,220,10967,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00\xfc\xd1...
20,224,10969,Polygon,b'\x00\x00\x00\x00\x03\x05\x00\x00\x00\xc4G\xe...


# Find the most exepensive hotel

Get the most expensive accommodation (price=9000) and show me in which zip code it is.

```
SELECT 
  a.price, a.name, ST_AsText(a.shape), 
  z.spatial_name, ST_AsText(z.wkb_geometry) 
FROM accommodations a, zipcode z 
WHERE price = 9000 AND ST_Within(a.shape, z.wkb_geometry);
```

ST_WITHIN (geometry, geometry)
Returns TRUE if and only if the left geometry is within the right geometry.

