# Set Up Data in Athena

## Import libraries

In [1]:
import boto3
import os
import pandas as pd
import sagemaker
import awswrangler as wr
import warnings

from IPython.core.display import HTML
from pyathena import connect

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Filter out the specific UserWarning related to DBAPI2 objects
warnings.filterwarnings('ignore', message="pandas only supports SQLAlchemy connectable")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# List every variable currently persisted with the IPython %store magic, so the
# values saved by earlier set-up notebooks (e.g. bucket_name) can be checked
# before they are restored below.
%store

Stored variables and their in-db values:
bucket_name                            -> 'housing-dataset-5660'
set_up_dependencies_passed             -> True
set_up_s3_bucket_passed                -> True


## Set Up Table in Athena

In [3]:
# Capture the AWS / SageMaker context this notebook runs under
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name
account_id = boto3.client("sts").get_caller_identity().get("Account")

# S3 client bound to the region reported by the SageMaker session
s3 = boto3.client("s3", region_name=sagemaker_session.boto_region_name)

In [4]:
# Restore bucket_name from the IPython %store database into this kernel and
# print it to confirm which bucket the rest of the notebook will use.
%store -r bucket_name
print(bucket_name)

housing-dataset-5660


In [5]:
# Name of the Athena database that will hold the housing table
database_name = "housing"

# S3 prefix handed to the connection as its staging directory
s3_staging_dir = f"s3://{bucket_name}/athena/staging"

# DB-API connection to Athena (pyathena) in the notebook's region
conn = connect(region_name=region, s3_staging_dir=s3_staging_dir)

# Create the database if it doesn't exist.
# DDL produces no result set, so it is executed through a DB-API cursor rather
# than pd.read_sql (which is meant for reading query results into a DataFrame
# and warns when given a raw DBAPI2 connection).
statement = f"CREATE DATABASE IF NOT EXISTS {database_name}"
cursor = conn.cursor()
cursor.execute(statement);  # trailing ';' keeps the cursor repr out of the cell output

In [6]:
# List the catalog databases to confirm the new database shows up
databases = wr.catalog.databases()
df_show = pd.DataFrame(data=databases)
df_show.head(n=5)

Unnamed: 0,Database,Description
0,default,
1,housing,


In [7]:
# Name of the Athena table to (re)create
table_name = "data"

# S3 prefix that holds the processed dataset files backing the table
s3_path = f"s3://{bucket_name}/data/processed/"

# DDL produces no result set, so both statements below are executed through a
# DB-API cursor rather than pd.read_sql (which is meant for reading query
# results into a DataFrame and warns when given a raw DBAPI2 connection).
cursor = conn.cursor()

# Drop the table if it already exists so the definition below always takes effect
statement = f"DROP TABLE IF EXISTS {database_name}.{table_name}"
cursor.execute(statement)

# External table over the comma-delimited files at s3_path; the first line of
# each file is skipped as a header row (skip.header.line.count = 1).
statement = f"""CREATE EXTERNAL TABLE IF NOT EXISTS {database_name}.{table_name}(
    Id INTEGER,
    MSSubClass INTEGER,
    MSZoning VARCHAR(255),
    LotFrontage FLOAT,
    LotArea INTEGER,
    Street VARCHAR(255),
    Alley VARCHAR(255),
    LotShape VARCHAR(255),
    LandContour VARCHAR(255),
    Utilities VARCHAR(255),
    LotConfig VARCHAR(255),
    LandSlope VARCHAR(255),
    Neighborhood VARCHAR(255),
    Condition1 VARCHAR(255),
    Condition2 VARCHAR(255),
    BldgType VARCHAR(255),
    HouseStyle VARCHAR(255),
    OverallQual INTEGER,
    OverallCond INTEGER,
    YearBuilt INTEGER,
    YearRemodAdd INTEGER,
    RoofStyle VARCHAR(255),
    RoofMatl VARCHAR(255),
    Exterior1st VARCHAR(255),
    Exterior2nd VARCHAR(255),
    MasVnrType VARCHAR(255),
    MasVnrArea FLOAT,
    ExterQual VARCHAR(255),
    ExterCond VARCHAR(255),
    Foundation VARCHAR(255),
    BsmtQual VARCHAR(255),
    BsmtCond VARCHAR(255),
    BsmtExposure VARCHAR(255),
    BsmtFinType1 VARCHAR(255),
    BsmtFinSF1 FLOAT,
    BsmtFinType2 VARCHAR(255),
    BsmtFinSF2 FLOAT,
    BsmtUnfSF FLOAT,
    TotalBsmtSF FLOAT,
    Heating VARCHAR(255),
    HeatingQC VARCHAR(255),
    CentralAir VARCHAR(255),
    Electrical VARCHAR(255),
    FirstFlrSF INTEGER,
    SecondFlrSF INTEGER,
    LowQualFinSF INTEGER,
    GrLivArea INTEGER,
    BsmtFullBath FLOAT,
    BsmtHalfBath FLOAT,
    FullBath INTEGER,
    HalfBath INTEGER,
    BedroomAbvGr INTEGER,
    KitchenAbvGr INTEGER,
    KitchenQual VARCHAR(255),
    TotRmsAbvGrd INTEGER,
    Functional VARCHAR(255),
    Fireplaces INTEGER,
    FireplaceQu VARCHAR(255),
    GarageType VARCHAR(255),
    GarageYrBlt FLOAT,
    GarageFinish VARCHAR(255),
    GarageCars FLOAT,
    GarageArea FLOAT,
    GarageQual VARCHAR(255),
    GarageCond VARCHAR(255),
    PavedDrive VARCHAR(255),
    WoodDeckSF INTEGER,
    OpenPorchSF INTEGER,
    EnclosedPorch INTEGER,
    ThreeSsnPorch INTEGER,
    ScreenPorch INTEGER,
    PoolArea INTEGER,
    PoolQC VARCHAR(255),
    Fence VARCHAR(255),
    MiscFeature VARCHAR(255),
    MiscVal INTEGER,
    MoSold INTEGER,
    YrSold INTEGER,
    SaleType VARCHAR(255),
    SaleCondition VARCHAR(255),
    SalePrice FLOAT
)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' LOCATION '{s3_path}'
TBLPROPERTIES ('skip.header.line.count'='1')
"""
cursor.execute(statement);  # trailing ';' keeps the cursor repr out of the cell output

In [8]:
# List the tables registered under the database to confirm the new table exists
tables = wr.catalog.tables(database=database_name)
df_show = pd.DataFrame(data=tables)
df_show.head(n=5)

Unnamed: 0,Database,Table,Description,TableType,Columns,Partitions
0,housing,data,,EXTERNAL_TABLE,"id, mssubclass, mszoning, lotfrontage, lotarea...",


In [9]:
# Pull the full table into a DataFrame and preview the first rows.
# s3_output is set explicitly so Athena writes its query results under the same
# staging prefix the pyathena connection uses, instead of falling back to an
# implicit default location.
statement = f"SELECT * FROM {database_name}.{table_name}"
df = wr.athena.read_sql_query(
    statement,
    database=database_name,
    s3_output=s3_staging_dir,
)
df.head(10)

2024-06-29 05:05:37,946	INFO worker.py:1553 -- Started a local Ray instance.


Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000.0
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000.0
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000.0
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900.0
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000.0


In [10]:
# Sanity check: the saleprice column is present in the DataFrame read from Athena
df["saleprice"].head(n=2)

0    208500.0
1    181500.0
Name: saleprice, dtype: float32

## Shut down notebook resources

In [11]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}

<IPython.core.display.Javascript object>

In [12]:
%%html

<!-- Message shown to the reader, plus a hidden button carrying the
     "kernelmenu:shutdown" command; the script below clicks it programmatically. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Best-effort: click the first "sm-command-button" element on the page.
// `buttons` is declared with block scope so re-running the cell neither leaks
// nor redeclares a global; the length check avoids relying on the catch to
// absorb the case where no such element exists.
try {
    const buttons = document.getElementsByClassName("sm-command-button");
    if (buttons.length > 0) {
        buttons[0].click();
    }
}
catch(err) {
    // NoOp
}
</script>