## Register Data with Athena

In [2]:
# import boto3
# import sagemaker

# sess = sagemaker.Session()
# bucket = sess.default_bucket()
# role = sagemaker.get_execution_role()
# region = boto3.Session().region_name


import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [3]:
ingest_create_athena_table_csv_passed = False

In [4]:
%store -r ingest_create_athena_db_passed

In [5]:
try:
    ingest_create_athena_db_passed
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

In [6]:
print(ingest_create_athena_db_passed)

True


In [7]:
if not ingest_create_athena_db_passed:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


In [8]:
s3_hotels_csv = "s3://aurelia-resort-data/Hotel bookings/"


In [9]:
print(ingest_create_athena_db_passed)

True


In [10]:
from pyathena import connect

In [11]:
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [12]:
database_name = "hotels"
table_name_res = "hotel_res"

In [13]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# Create Schema for Hotel Reservations

In [14]:
# # SQL statement to execute

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
  `Booking_ID` char(100),
  `no_of_adults` int,
  `no_of_children` int,
  `no_of_weekend_nights` int,
  `type_of_meal_plan` char(100),
  `required_car_parking_space` int,
  `room_type_reserved` char(100),
  `lead_time` int, 
  `arrival_year` int,
  `arrival_month` int,
  `arrival_date` int, 
  `market_segment_type` char(100),
  `repeated_guest` int, 
  `no_of_previous_cancellations` int, 
  `no_of_previous_bookings_not_canceled` int,
  `avg_price_per_room` float,
  `no_of_special_request` int, 
  `booking_status` char(100)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('field.delim' = ',')
LOCATION '{}'
TBLPROPERTIES ('classification' = 'csv');""".format(
    database_name, table_name_res, s3_hotels_csv
)
print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS hotels.hotel_res (
  `Booking_ID` char(100),
  `no_of_adults` int,
  `no_of_children` int,
  `no_of_weekend_nights` int,
  `type_of_meal_plan` char(100),
  `required_car_parking_space` int,
  `room_type_reserved` char(100),
  `lead_time` int, 
  `arrival_year` int,
  `arrival_month` int,
  `arrival_date` int, 
  `market_segment_type` char(100),
  `repeated_guest` int, 
  `no_of_previous_cancellations` int, 
  `no_of_previous_bookings_not_canceled` int,
  `avg_price_per_room` float,
  `no_of_special_request` int, 
  `booking_status` char(100)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('field.delim' = ',')
LOCATION 's3://aurelia-resort-data/Hotel bookings/'
TBLPROPERTIES ('classification' = 'csv');


In [15]:
import pandas as pd

pd.read_sql(statement, conn)

In [16]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(10)

Unnamed: 0,tab_name
0,bookings
1,hotel_bookings
2,hotel_res


In [17]:
if table_name_res in df_show.values:
    ingest_create_athena_table_csv_passed = True

In [18]:
%store ingest_create_athena_table_csv_passed

Stored 'ingest_create_athena_table_csv_passed' (bool)


In [19]:
statement = """SELECT * FROM {}.{}
    LIMIT 10""".format(
    database_name, table_name_res
)

print(statement)

SELECT * FROM hotels.hotel_res
    LIMIT 10


In [20]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,booking_id,no_of_adults,no_of_children,no_of_weekend_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_request,booking_status
0,Booking_ID ...,,,,no_of_week_nights ...,,required_car_parking_space ...,,,,,arrival_date ...,,,,,,no_of_special_requests ...
1,INN00001 ...,2.0,0.0,1.0,2 ...,,0 ...,,224.0,2017.0,10.0,2 ...,,0.0,0.0,0.0,65.0,0 ...
2,INN00002 ...,2.0,0.0,2.0,3 ...,,0 ...,,5.0,2018.0,11.0,6 ...,,0.0,0.0,0.0,106.0,1 ...
3,INN00003 ...,1.0,0.0,2.0,1 ...,,0 ...,,1.0,2018.0,2.0,28 ...,,0.0,0.0,0.0,60.0,0 ...
4,INN00004 ...,2.0,0.0,0.0,2 ...,,0 ...,,211.0,2018.0,5.0,20 ...,,0.0,0.0,0.0,100.0,0 ...


## Release Resources

In [21]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [22]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>