In [117]:
import boto3
import sagemaker

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [118]:
ingest_create_athena_table_tsv_passed = False

In [119]:
%store -r ingest_create_athena_db_passed

In [120]:
try:
    ingest_create_athena_db_passed
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

In [121]:
print(ingest_create_athena_db_passed)

True


In [122]:
if not ingest_create_athena_db_passed:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


In [144]:
s3_reviews_csv = "s3://aurelia-resort-data/airbnb/reviews"

In [145]:
from pyathena import connect

In [146]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [147]:
# Set Athena parameters
database_name = "travel_airbnb"
table_name_csv = "reviews_csv"

In [148]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [240]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
         listing_id string,
         id string,
         date string,
         reviewer_id string,
         reviewer_name string,
         comments string
) 
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   'separatorChar' = ',',
   'quoteChar' = '\"',
   'escapeChar' = '\\\\'
   )
STORED AS TEXTFILE
LOCATION '{}'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1');""".format(
    database_name, table_name_csv, s3_reviews_csv
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS travel_airbnb.reviews_csv (
         listing_id string,
         id string,
         date string,
         reviewer_id string,
         reviewer_name string,
         comments string
) 
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
   'separatorChar' = ',',
   'quoteChar' = '"',
   'escapeChar' = '\\'
   )
STORED AS TEXTFILE
LOCATION 's3://aurelia-resort-data/airbnb/reviews'
TBLPROPERTIES ('compressionType'='gzip', 'skip.header.line.count'='1');


In [241]:
import pandas as pd

pd.read_sql(statement, conn)

In [242]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,tab_name
0,reviews_csv


In [243]:
if table_name_tsv in df_show.values:
    ingest_create_athena_table_tsv_passed = True

In [244]:
%store ingest_create_athena_table_tsv_passed

Stored 'ingest_create_athena_table_tsv_passed' (bool)


In [245]:
statement = """SELECT * FROM {}.{}
            LIMIT 100;""".format(
    database_name, table_name_csv
)

print(statement)

SELECT * FROM travel_airbnb.reviews_csv
            LIMIT 100;


In [248]:
df = pd.read_sql(statement, conn)
df.head(20)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,28717282,611311066,2020-02-26,144251887,Danny,"This was our last stop in our US road trip, ha..."
1,28717282,612497481,2020-02-29,330825082,William,10/10 would stay here again!
2,28717282,613713908,2020-03-02,148439563,Loy,Very nice
3,28717282,615207226,2020-03-07,300366116,Sarah,Apartment is easily accessed from many histori...
4,28717282,616756958,2020-03-10,66650026,Nadia,Jamie's place was excellent! It was incredibly...
5,28717282,626646192,2020-05-29,147449285,Claire,"Very beautiful and calm!! Home away from away,..."
6,28717282,627827073,2020-06-05,147449285,Claire,Totally in love with Jamie’s cozy spot! Clean ...
7,28717282,629048190,2020-06-12,147449285,Claire,"Quiet, comfortable , cozy spot in the heart of..."
8,28717282,707862242,2020-11-09,229882831,Markos,Jamie and Robert were both exceptional hosts. ...
9,28717282,708493969,2020-11-13,315689686,Julius,"Awesome place to stay, super easy!"


In [249]:
if not df.empty:
    print("[OK]")
else:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOUR DATA HAS NOT BEEN REGISTERED WITH ATHENA. LOOK IN PREVIOUS CELLS TO FIND THE ISSUE.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++")

[OK]


In [251]:
%store

Stored variables and their in-db values:
ingest_create_athena_db_passed                    -> True
ingest_create_athena_table_tsv_passed             -> False


In [252]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}