# Preprocess Data

## Import libraries

In [1]:
import awswrangler as wr
import boto3
import numpy as np
import json
import os
import pandas as pd
import re
import requests
import sagemaker
import subprocess
import warnings

# suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
# check stored variables
# A bare %store lists every variable persisted in IPython's store; this notebook
# later restores `bucket_name` from it (presumably saved by an earlier set-up
# notebook — confirm), so the listing doubles as a prerequisite check.
%store

Stored variables and their in-db values:
bucket_name                            -> 'housing-dataset-2111'
set_up_dependencies_passed             -> True
set_up_s3_bucket_passed                -> True


## Load Data from S3

In [3]:
# save Amazon information
# NOTE(review): none of these values are referenced by the later cells shown in
# this notebook (the data path is built from the %store'd `bucket_name`
# instead) — confirm whether they are still needed.
sess = sagemaker.Session()  # SageMaker session handle
bucket = sess.default_bucket()  # the session's default bucket, not the %store'd bucket_name
role = sagemaker.get_execution_role()  # execution role of the current environment
region = boto3.Session().region_name  # region configured for the default boto3 session
account_id = boto3.client("sts").get_caller_identity().get("Account")  # "Account" field of the STS caller identity

In [4]:
# get s3 path to data
%store -r bucket_name
s3_path = 's3://{}/data'.format(bucket_name)
print(s3_path)

s3://housing-dataset-2111/data


In [5]:
# Load the raw train and test CSV files from the S3 data prefix into DataFrames.
# f-strings used for consistency with the path construction in later cells.
df_test = wr.s3.read_csv(path=f'{s3_path}/test.csv')
df_train = wr.s3.read_csv(path=f'{s3_path}/train.csv')

In [6]:
# Report the training set's dimensions, then preview its first rows.
shape = df_train.shape
print("Shape of the dataframe (row, col):", shape, "\r\n")
df_train.head()  # head() defaults to the first 5 rows

Shape of the dataframe (row, col): (1460, 81) 



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# Report the test set's dimensions, then preview its first rows.
shape = df_test.shape
print("Shape of the dataframe (row, col):", shape, "\r\n")
df_test.head()  # head() defaults to the first 5 rows

Shape of the dataframe (row, col): (1459, 80) 



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


## Combine datasets

In [8]:
# merge datasets
# Stack the train rows on top of the test rows. The test set has no SalePrice
# column, so those rows get NaN there and the column becomes float.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the train and test row labels (both starting at 0) would be duplicated and
# label-based lookups such as df.loc[0] would return two rows.
df = pd.concat([df_train, df_test], ignore_index=True)
shape = df.shape
print("Shape of the dataframe (row, col):", shape, "\r\n")
df.head(5)

Shape of the dataframe (row, col): (2919, 81) 



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


In [9]:
# Preview the first two values of the prediction target in the combined frame.
df.loc[:, 'SalePrice'].head(2)

0    208500.0
1    181500.0
Name: SalePrice, dtype: float64

In [10]:
# save path to local dataset
# The 'data' folder is a sibling of the directory the notebook runs from.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
local_folder = os.path.join(parent_directory, 'data')
local_path = os.path.join(local_folder, 'merged_data.csv')
print(local_path)

# Create the folder when it is missing; to_csv does not create parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(local_folder, exist_ok=True)

# Export merged dataset to a CSV file located in the 'data' folder
# (index=False: the row index is not written to the file)
df.to_csv(local_path, index=False, sep=',')

/home/sagemaker-user/AAI_540_SU_04/data/merged_data.csv


In [11]:
# copy data from local to s3
s3_data_path = f'{s3_path}/processed'
!aws s3 cp "$local_path" $s3_data_path/

upload: ../data/merged_data.csv to s3://housing-dataset-2111/data/processed/merged_data.csv


## Shut down notebook resources

In [12]:
%%javascript

// Best-effort shutdown through the `Jupyter` front-end global: save a
// checkpoint of the notebook, then delete its session.
// NOTE(review): presumably `Jupyter` only exists in the classic Notebook UI;
// elsewhere the reference throws and is swallowed below — confirm.
try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp: any failure is deliberately ignored so the cell never errors out
}

<IPython.core.display.Javascript object>

In [13]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>

<script>
// Best-effort: programmatically click the hidden button above, which carries
// the "kernelmenu:shutdown" command (presumably dispatched by the front end's
// command linker — confirm). Any failure is swallowed on purpose.
try {
    // Block-scoped declaration: the original assigned to an undeclared `els`,
    // which leaked an implicit global onto `window`.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: e.g. no matching button exists in this front end
}
</script>