In [3]:
import sagemaker
import boto3

# Shared handles used by the rest of the notebook:
#   sagemaker_session - SageMaker SDK session object
#   role              - execution role returned by the SageMaker SDK
#   bucket            - the session's default S3 bucket (used below for the Athena staging path)
#   region            - region name of the ambient boto3 session
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml


## Download the CSV files from S3 and store them in the `Data` folder

In [4]:
# Recursively copy the objects under olist_data/ into ./Data/, excluding everything
# and then re-including only *.csv so that nothing but CSV files is downloaded.
!aws s3 cp s3://raw-olist-ecommerce/olist_data/ ./Data/ --recursive --exclude "*" --include "*.csv"

download: s3://raw-olist-ecommerce/olist_data/product_category_name_translation.csv to Data/product_category_name_translation.csv
download: s3://raw-olist-ecommerce/olist_data/sellers_dataset.csv to Data/sellers_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/customers_dataset.csv to Data/customers_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/order_payments_dataset.csv to Data/order_payments_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/products_dataset.csv to Data/products_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/order_items_dataset.csv to Data/order_items_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/order_reviews_dataset.csv to Data/order_reviews_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/orders_dataset.csv to Data/orders_dataset.csv
download: s3://raw-olist-ecommerce/olist_data/geolocation_dataset.csv to Data/geolocation_dataset.csv


## Set up the connection to the Athena DB

In [5]:
# Optional one-time install of the Athena driver imported in the next cell (pinned to 2.1.0).
# Left commented out; uncomment only if `pyathena` is missing from the kernel environment,
# and prefer `%pip` over `!pip` so the install targets this kernel's environment.
# !pip install --disable-pip-version-check -q PyAthena==2.1.0

In [6]:
import pandas as pd
from pyathena import connect

In [7]:
# Flag recording whether the Athena database exists after the CREATE DATABASE step.
# It is initialised under the name that the `%store` magic persists further down, so the
# stored value is well defined (False) even when the verification cell does not find the
# database; previously only `create_athena_db_passed` was initialised and the stored name
# was left undefined in that case.
ingest_create_athena_db_passed = False
# Original spelling of the flag, kept so anything still reading it keeps working.
create_athena_db_passed = False
# Name of the Athena database this notebook creates and fills with tables
database_name = "ads_508_team_5"

In [8]:
# Temporary S3 location, inside the session's default bucket, that Athena queries use for staging
s3_staging_dir = f"s3://{bucket}/athena/staging"

In [9]:
# Open the PyAthena connection every query in this notebook goes through
conn = connect(
    s3_staging_dir=s3_staging_dir,
    region_name=region,
)

In [10]:
# Build the DDL and echo it so the exact statement sent to Athena is visible in the output
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
print(statement)

CREATE DATABASE IF NOT EXISTS ads_508_team_5


In [11]:
# Submit the CREATE DATABASE statement held in `statement` over the Athena connection.
pd.read_sql(statement, conn)

## Verify the Database Has Been Created Successfully

In [12]:
# List the databases visible through the connection; `db_show` is checked in the next
# cell to confirm that the new database is among them.
statement = "SHOW DATABASES"

db_show = pd.read_sql(statement, conn)
# Display at most the first five rows as the cell output
db_show.head(5)

Unnamed: 0,database_name
0,ads_508_team_5
1,default


In [13]:
# Record the outcome unconditionally: True when the database name appears anywhere in the
# SHOW DATABASES result, False otherwise. Assigning in both cases guarantees the name exists
# for the `%store` that follows; the earlier `if`-only assignment left it undefined when the
# database was missing.
ingest_create_athena_db_passed = database_name in db_show.values

In [14]:
# Persist the flag across kernels with IPython's %store magic
# (it can be restored elsewhere with `%store -r ingest_create_athena_db_passed`).
%store ingest_create_athena_db_passed

Stored 'ingest_create_athena_db_passed' (bool)


## Create tables in Athena DB Dynamically

* We will be using the CSV files in the data folder and storing them as tables in Athena, so that we can then query them via SQL in the Refined layer notebook.

In [15]:
# Boto3 Athena client bound to the notebook's region.
# NOTE(review): no cell visible in this section uses this client -- confirm it is still needed.
athena_client = boto3.client("athena", region_name=region)

In [16]:
# Re-declare the database name (same value as set earlier) for the table-creation cells below
database_name = "ads_508_team_5"
# S3 prefix of the raw Olist data -- the same prefix the CSV files were downloaded from above.
# NOTE(review): the table LOCATIONs below use other prefixes of this bucket, not this path.
s3_path_olist = "s3://raw-olist-ecommerce/olist_data/"

In [17]:
# Build the Athena DDL statement for every Olist table.
#
# All nine tables share the same comma-delimited text layout with one header line, so a
# single helper renders the statement and each table only declares what is specific to it:
# its name, its columns (in file order) and its S3 location.
#
# NOTE(review): the sample rows queried from order_items_table further down still carry
# literal double quotes around some ids, which suggests the CSV files quote some fields and
# this plain delimited row format does not strip them -- confirm, and consider a CSV-aware
# SerDe if so.


def _external_csv_table_ddl(database, table, columns, location):
    """Render a CREATE EXTERNAL TABLE statement for a comma-delimited text table.

    Args:
        database: Name of the Athena database that owns the table.
        table: Name of the table to create.
        columns: Iterable of (column_name, column_type) pairs, in the order they
            appear in the data files.
        location: S3 prefix that holds the table's data files.

    Returns:
        The DDL statement as a string. The statement skips the first line of each
        file (the header row) and is a no-op when the table already exists.
    """
    column_lines = ",\n".join(f"    {name} {dtype}" for name, dtype in columns)
    return (
        f"CREATE EXTERNAL TABLE IF NOT EXISTS {database}.{table}(\n"
        f"{column_lines}\n"
        ")\n"
        "ROW FORMAT DELIMITED\n"
        "FIELDS TERMINATED BY ','\n"
        "STORED AS TEXTFILE\n"
        f"LOCATION '{location}'\n"
        'TBLPROPERTIES ("skip.header.line.count"="1");'
    )


customer_table_statement = _external_csv_table_ddl(
    database_name,
    "customers_table",
    [
        ("customer_id", "string"),
        ("customer_unique_id", "string"),
        ("customer_zip_code_prefix", "int"),
        ("customer_city", "string"),
        ("customer_state", "string"),
    ],
    "s3://raw-olist-ecommerce/customers_data/",
)

geolocation_table_statement = _external_csv_table_ddl(
    database_name,
    "geolocation_table",
    [
        ("geolocation_zip_code_prefix", "int"),
        ("geolocation_lat", "double"),
        ("geolocation_lng", "double"),
        ("geolocation_city", "string"),
        ("geolocation_state", "string"),
    ],
    "s3://raw-olist-ecommerce/geolocation_dataset/",
)

order_items_table_statement = _external_csv_table_ddl(
    database_name,
    "order_items_table",
    [
        ("order_id", "string"),
        ("order_item_id", "int"),
        ("product_id", "string"),
        ("seller_id", "string"),
        ("shipping_limit_date", "timestamp"),
        ("price", "double"),
        ("freight_value", "double"),
    ],
    "s3://raw-olist-ecommerce/order_items_data/",
)

order_reviews_table_statement = _external_csv_table_ddl(
    database_name,
    "order_reviews_table",
    [
        ("review_id", "string"),
        ("order_id", "string"),
        ("review_score", "int"),
        ("review_comment_message", "string"),
        ("review_creation_date", "timestamp"),
        ("review_answer_timestamp", "timestamp"),
    ],
    "s3://raw-olist-ecommerce/order_reviews_data/",
)

orders_table_statement = _external_csv_table_ddl(
    database_name,
    "orders_table",
    [
        ("order_id", "string"),
        ("customer_id", "string"),
        ("order_status", "string"),
        ("order_purchase_timestamp", "timestamp"),
        ("order_approved_at", "timestamp"),
        ("order_delivered_carrier_date", "timestamp"),
        ("order_delivered_customer_date", "timestamp"),
        ("order_estimated_delivery_date", "timestamp"),
    ],
    "s3://raw-olist-ecommerce/orders_data/",
)

# NOTE(review): "prder_payments_data" looks like a typo for "order_payments_data". The prefix
# is kept exactly as it was written -- confirm against the bucket before changing it.
order_payments_statement = _external_csv_table_ddl(
    database_name,
    "order_payments_table",
    [
        ("order_id", "string"),
        ("payment_sequential", "int"),
        ("payment_type", "string"),
        ("payment_installments", "int"),
        ("payment_value", "double"),
    ],
    "s3://raw-olist-ecommerce/prder_payments_data/",
)

product_category_statement = _external_csv_table_ddl(
    database_name,
    "product_category_table",
    [
        ("product_category_name", "string"),
        ("product_category_name_english", "string"),
    ],
    "s3://raw-olist-ecommerce/product_cat_name_data/",
)

products_statement = _external_csv_table_ddl(
    database_name,
    "product_table",
    [
        ("product_id", "string"),
        ("product_category_name", "string"),
        ("product_name_length", "int"),
        ("product_description_length", "int"),
        ("product_photos_qty", "int"),
        ("product_weight_g", "double"),
        ("product_length_cm", "double"),
        ("product_height_cm", "double"),
        ("product_width_cm", "double"),
    ],
    "s3://raw-olist-ecommerce/products_data/",
)

sellers_statement = _external_csv_table_ddl(
    database_name,
    "sellers_table",
    [
        ("seller_id", "string"),
        ("seller_zip_code_prefix", "int"),
        ("seller_city", "string"),
        ("seller_state", "string"),
    ],
    "s3://raw-olist-ecommerce/sellers_data/",
)


In [56]:
def _run_ddl(ddl):
    """Send one DDL statement through the shared Athena connection and return the result frame."""
    return pd.read_sql(ddl, conn)


# Create the tables one after another, in the same order the statements were defined;
# each result is kept under its own name.
write_cust_table = _run_ddl(customer_table_statement)
write_geo_table = _run_ddl(geolocation_table_statement)
write_order_items_table = _run_ddl(order_items_table_statement)
write_order_reviews_table = _run_ddl(order_reviews_table_statement)
write_orders_table = _run_ddl(orders_table_statement)
write_order_payments_table = _run_ddl(order_payments_statement)
write_prod_cat_table = _run_ddl(product_category_statement)
write_product_table = _run_ddl(products_statement)
write_seller_table = _run_ddl(sellers_statement)

In [59]:
# Smoke test: read five rows from one of the new tables to confirm it can be queried
test_statment = f"SELECT * FROM {database_name}.order_items_table limit 5"
test_output = pd.read_sql(test_statment, conn)
test_output

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,"""00010242fe8c5a6d1ba2dd792cb16214""",1,"""4244733e06e7ecb4970a6e2683c13e61""","""48436dade18ac8b2bce089ec2a041202""",2017-09-19 09:45:35,58.9,13.29
1,"""00018f77f2f0320c557190d7a144bdd3""",1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,"""000229ec398224ef6ca0657da4fc703e""",1,c777355d18b72b67abbeef9df44fd0fd,"""5b51032eddd242adc84c38acab88f23d""",2018-01-18 14:48:30,199.0,17.87
3,"""00024acbcdf0a6daa1e931b038114c75""",1,"""7634da152a4610f1595efa32f14722fc""","""9d7a1d34a5052409006425275ba1c2b4""",2018-08-15 10:10:18,12.99,12.79
4,"""00042b26cf59d7ce69dfabb4e55b4fd9""",1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14


In [60]:
%%html

<!-- Shows a notice and a hidden button bound to the "kernelmenu:shutdown" command;
     the script below clicks that button as soon as this output is rendered. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    // Click the first element carrying the sm-command-button class
    // (presumably the hidden button above -- assumes no earlier element uses that class).
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>

In [None]:
%%javascript

// Save a checkpoint, then delete the notebook session
// (presumably this releases the kernel where the `Jupyter` global exists -- confirm).
// Any error, including `Jupyter` being undefined, is swallowed by the catch below.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}