# Packages 

In [1]:
import boto3
import sagemaker
from pyathena import connect

import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set_style = "seaborn-whitegrid"

sns.set(
    rc={
        "font.style": "normal",
        "axes.facecolor": "white",
        "grid.color": ".8",
        "grid.linestyle": "-",
        "figure.facecolor": "white",
        "figure.titlesize": 20,
        "text.color": "black",
        "xtick.color": "black",
        "ytick.color": "black",
        "axes.labelcolor": "black",
        "axes.grid": True,
        "axes.labelsize": 10,
        "xtick.labelsize": 10,
        "font.size": 10,
        "ytick.labelsize": 10,
    }
)

In [2]:
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [3]:
ingest_create_athena_db_passed = False

# Create Athena Database

In [4]:
# Set Athena database & table
database_name = "ads508"
table_name = "flight_departure_delays"

In [5]:
# Set S3 staging directory -- this is a temporary directory used for Athena queries
s3_staging_dir = "s3://{0}/ads508/athena/staging".format(bucket)

In [6]:
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

In [7]:
statement = "CREATE DATABASE IF NOT EXISTS {}".format(database_name)
print(statement)

CREATE DATABASE IF NOT EXISTS ads508


In [8]:
pd.read_sql(statement, conn)

# Verify The Database Has Been Created Succesfully

In [9]:
statement = "SHOW DATABASES"

df_show = pd.read_sql(statement, conn)
df_show.head(5)

Unnamed: 0,database_name
0,ads508
1,default
2,dsoaws


In [10]:
if database_name in df_show.values:
    ingest_create_athena_db_passed = True

In [11]:
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


# Download Data from Public S3 Bucket

In [12]:
# Public Flight Data
s3_client = boto3.client("s3")

BUCKET='ads-508-airline'
KEY='transformed/ON_TIME_REPORTING_12.csv'

response = s3_client.get_object(Bucket=BUCKET, Key=KEY)
dec_flight = pd.read_csv(response.get("Body"))
dec_flight.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_UNIQUE_CARRIER,TAIL_NUM,ORIGIN,DEST,DEP_DEL15,DEP_TIME_BLK,ARR_TIME_BLK,CANCELLED,CRS_ELAPSED_TIME,DISTANCE,DISTANCE_GROUP,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,8,7,WN,N8651A,STL,SAN,0.0,1100-1159,1300-1359,0.0,245.0,1557.0,7,0.0,0.0,18.0,0.0,0.0
1,8,7,WN,N939WN,STL,SAT,0.0,1200-1259,1400-1459,0.0,145.0,786.0,4,,,,,
2,8,7,WN,N7741C,STL,SAT,0.0,2100-2159,0001-0559,0.0,140.0,786.0,4,,,,,
3,8,7,WN,N550WN,STL,SEA,0.0,0900-0959,1200-1259,0.0,275.0,1709.0,7,,,,,
4,8,7,WN,N8319F,STL,SFO,1.0,1800-1859,2000-2059,0.0,270.0,1735.0,7,,,,,


In [13]:
dec_flight.to_csv('ON_TIME_REPORTING_12.csv', encoding='utf-8', index=False)

# Set S3 Destination Location(Our S3 Private Bucket)

In [14]:
s3_private_path_csv = "s3://{}/ads508/data".format(bucket)
print(s3_private_path_csv)

s3://sagemaker-us-east-1-229768475194/ads508/data


In [15]:
%store s3_private_path_csv

Stored 's3_private_path_csv' (str)


# Copy Downloaded Local Data to our Private S3 Bucket in this Account

In [16]:
file_path = 'ON_TIME_REPORTING_12.csv'

In [17]:
!aws s3 cp $file_path $s3_private_path_csv/ 

upload: ./ON_TIME_REPORTING_12.csv to s3://sagemaker-us-east-1-229768475194/ads508/data/ON_TIME_REPORTING_12.csv


In [18]:
!aws s3 ls $s3_private_path_csv/

2022-03-21 07:39:34   45671905 ON_TIME_REPORTING_12.csv


# Create Table in Database

In [25]:
# SQL statement to execute
statement = """CREATE EXTERNAL TABLE IF NOT EXISTS {}.{}(
         DAY_OF_MONTH int,
         DAY_OF_WEEK int,
         OP_UNIQUE_CARRIER string,
         TAIL_NUM string,
         ORIGIN string,
         DEST string,
         DEP_DEL15 int,
         DEP_TIME_BLK string,
         ARR_TIME_BLK string,
         CANCELLED int,
         CRS_ELAPSED_TIME int,
         DISTANCE int,
         DISTANCE_GROUP int,
         CARRIER_DELAY int,
         WEATHER_DELAY int,
         NAS_DELAY int,
         SECURITY_DELAY int,
         LATE_AIRCRAFT_DELAY int
         
        
) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
LOCATION '{}'
TBLPROPERTIES (skip.header.line.count'='1')""".format(
    database_name, table_name, s3_private_path_csv
)

print(statement)

CREATE EXTERNAL TABLE IF NOT EXISTS ads508.flight_departure_delays(
         DAY_OF_MONTH int,
         DAY_OF_WEEK int,
         OP_UNIQUE_CARRIER string,
         TAIL_NUM string,
         ORIGIN string,
         DEST string,
         DEP_DEL15 int,
         DEP_TIME_BLK string,
         ARR_TIME_BLK string,
         CANCELLED int,
         CRS_ELAPSED_TIME int,
         DISTANCE int,
         DISTANCE_GROUP int,
         CARRIER_DELAY int,
         WEATHER_DELAY int,
         NAS_DELAY int,
         SECURITY_DELAY int,
         LATE_AIRCRAFT_DELAY int
         
        
) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ','
LINES TERMINATED BY '
'
LOCATION 's3://sagemaker-us-east-1-229768475194/ads508/data'
TBLPROPERTIES (skip.header.line.count'='1')


In [26]:
pd.read_sql(statement, conn)

Failed to execute query.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/pyathena/common.py", line 417, in _execute
    **request
  File "/opt/conda/lib/python3.7/site-packages/pyathena/util.py", line 84, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 404, in __call__
    do = self.iter(retry_state=retry_state)
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 349, in iter
    return fut.result()
  File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 428, in result
    return self.__get_result()
  File "/opt/conda/lib/python3.7/concurrent/futures/_base.py", line 384, in __get_result
    raise self._exception
  File "/opt/conda/lib/python3.7/site-packages/tenacity/__init__.py", line 407, in __call__
    result = fn(*args, **kwargs)
  File "/opt/conda/lib/python3.7/site-packages/botocore/client.py", line 391, in _api_call
    return

DatabaseError: Execution failed on sql: CREATE EXTERNAL TABLE IF NOT EXISTS ads508.flight_departure_delays(
         DAY_OF_MONTH int,
         DAY_OF_WEEK int,
         OP_UNIQUE_CARRIER string,
         TAIL_NUM string,
         ORIGIN string,
         DEST string,
         DEP_DEL15 int,
         DEP_TIME_BLK string,
         ARR_TIME_BLK string,
         CANCELLED int,
         CRS_ELAPSED_TIME int,
         DISTANCE int,
         DISTANCE_GROUP int,
         CARRIER_DELAY int,
         WEATHER_DELAY int,
         NAS_DELAY int,
         SECURITY_DELAY int,
         LATE_AIRCRAFT_DELAY int
         
        
) 
ROW FORMAT DELIMITED 
FIELDS TERMINATED BY ','
LINES TERMINATED BY '
'
LOCATION 's3://sagemaker-us-east-1-229768475194/ads508/data'
TBLPROPERTIES (skip.header.line.count'='1')
An error occurred (InvalidRequestException) when calling the StartQueryExecution operation: line 1:8: mismatched input 'EXTERNAL'. Expecting: 'OR', 'SCHEMA', 'TABLE', 'VIEW'
unable to rollback

## What days of the month are best and worst for departure delays?

# Store Variables for the Next Notebooks

In [None]:
%store

# Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}