## Register Data with Athena

In [2]:
# import boto3
# import sagemaker

# sess = sagemaker.Session()
# bucket = sess.default_bucket()
# role = sagemaker.get_execution_role()
# region = boto3.Session().region_name


import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

sm = boto3.Session().client(service_name="sagemaker", region_name=region)

In [3]:
ingest_create_athena_table_csv_passed = False

In [4]:
%store -r ingest_create_athena_db_passed

In [5]:
try:
    ingest_create_athena_db_passed
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")

In [6]:
print(ingest_create_athena_db_passed)

True


In [7]:
if not ingest_create_athena_db_passed:
    print("++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] YOU HAVE TO RUN ALL PREVIOUS NOTEBOOKS.  You did not create the Athena Database.")
    print("++++++++++++++++++++++++++++++++++++++++++++++")
else:
    print("[OK]")

[OK]


In [8]:
s3_hotels_csv = "s3://projectbucketvsbr22/hotel_other/"


In [9]:
print(ingest_create_athena_db_passed)

True


In [10]:
from pyathena import connect

In [11]:
s3_staging_dir = "s3://{0}/athena/staging".format(bucket)

In [12]:
database_name = "hotels"
table_name_booking = "bookings"

In [13]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

## Create Schema for Hotel Bookings


In [14]:

statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{} (
  `hotel` char(50),
  `is_canceled` int,
  `lead_time` int,
  `arrival_date_year` char(50),
  `arrival_date_month` char(50),
  `arrival_date_week_number` int,
  `arrival_date_day_of_month` int, 
  `stays_in_weekend_nights` int,
  `stays_in_week_nights` int,
  `adults` int,
  `children` int,
  `babies` int,
  `meal` char(10),
  `country` char(10),
  `market_segment` char(15),
  `distribution_channel` char(10),
  `is_repeated_guest` int,
  `previous_cancellations` int,
  `previous_bookings_not_canceled` int,
  `reserved_room_type` char(5),
  `assigned_room_type` char(5),
  `booking_changes` int, 
  `deposit_type` char(10),
  `agent` int, 
  `company` int, 
  `days_in_waiting_list` int, 
  `customer_type` char(15),
  `adr` float,
  `required_parking_spaces` int, 
  `total_of_special_request` int, 
  `reservation_status` char(10), 
  `reservation_status_date` date
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES ('field.delim' = ',')
LOCATION '{}'
TBLPROPERTIES ('classification' = 'csv');""".format(
    database_name, table_name_booking, s3_hotels_csv
)
print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS hotels.bookings (
  `hotel` char(50),
  `is_canceled` int,
  `lead_time` int,
  `arrival_date_year` char(50),
  `arrival_date_month` char(50),
  `arrival_date_week_number` int,
  `arrival_date_day_of_month` int, 
  `stays_in_weekend_nights` int,
  `stays_in_week_nights` int,
  `adults` int,
  `children` int,
  `babies` int,
  `meal` char(10),
  `country` char(10),
  `market_segment` char(15),
  `distribution_channel` char(10),
  `is_repeated_guest` int,
  `previous_cancellations` int,
  `previous_bookings_not_canceled` int,
  `reserved_room_type` char(5),
  `assigned_room_type` char(5),
  `booking_changes` int, 
  `deposit_type` char(10),
  `agent` int, 
  `company` int, 
  `days_in_waiting_list` int, 
  `customer_type` char(15),
  `adr` float,
  `required_parking_spaces` int, 
  `total_of_special_request` int, 
  `reservation_status` char(10), 
  `reservation_status_date` date
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
W

In [15]:
import pandas as pd

pd.read_sql(statement, conn)

In [16]:
statement = "SHOW TABLES in {}".format(database_name)

df_show = pd.read_sql(statement, conn)
df_show.head(10)

Unnamed: 0,tab_name
0,bookings
1,hotel_bookings
2,hotel_res


In [17]:
statement = """SELECT * FROM {}.{}
    LIMIT 10""".format(
    database_name, table_name_booking
)

print(statement)

SELECT * FROM hotels.bookings
    LIMIT 10


In [18]:
df = pd.read_sql(statement, conn)
df.head(5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_parking_spaces,total_of_special_request,reservation_status,reservation_status_date
0,hotel ...,,,arrival_date_year ...,arrival_date_month ...,,,,,,...,deposit_ty,,,,customer_type,,,,reservatio,
1,Resort Hotel ...,0.0,342.0,2015 ...,July ...,27.0,1.0,0.0,0.0,2.0,...,No Deposit,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
2,Resort Hotel ...,0.0,737.0,2015 ...,July ...,27.0,1.0,0.0,0.0,2.0,...,No Deposit,,,0.0,Transient,0.0,0.0,0.0,Check-Out,2015-07-01
3,Resort Hotel ...,0.0,7.0,2015 ...,July ...,27.0,1.0,0.0,1.0,1.0,...,No Deposit,,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02
4,Resort Hotel ...,0.0,13.0,2015 ...,July ...,27.0,1.0,0.0,1.0,1.0,...,No Deposit,304.0,,0.0,Transient,75.0,0.0,0.0,Check-Out,2015-07-02


## Release Resources

In [19]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [20]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>