# Import Packages

In [1]:
import os
import sys
import pandas as pd

# Put the parent of the current working directory on the import path so the
# project-local packages imported below (`utils`, `data`) can be resolved when
# the notebook is run from its own sub-folder. The membership check keeps the
# cell safe to re-run without stacking duplicate entries on sys.path.
project_root = os.path.abspath(os.pardir)
if project_root not in sys.path:
    sys.path.append(project_root)

from utils.helpers import load_data
from data.feature_engineering import handle_rare_categories, encode_categories, normalize_numerics

# Loading Data

In [2]:
# Load "train_cleaned.csv" through the project's load_data helper.
# NOTE(review): how the bare file name is resolved to a path is decided inside
# utils.helpers.load_data — confirm the expected data directory there.
data = load_data("train_cleaned.csv")

# Handle Rare Categorical Variables
1. **MSSubClass:** Categories with fewer than 20 observations are grouped into a single category named 'Other'.

In [3]:
# Run the project's rare-category handling and rebind `data` to the result.
# NOTE(review): because `data` is overwritten, re-running this cell feeds the
# already-processed frame back into the helper — confirm it is safe to apply
# twice, or use Restart Kernel -> Run All when re-executing.
data = handle_rare_categories(data)

# Encode Categorical Variables
1. **MSSubClass:** We will use frequency encoding, which reduces sparsity and accounts for the rarity of the other categories.
2. **MSZoning:** We will use one-hot encoding.

In [4]:
# Encode the categorical columns with the project's helper. In the preview
# shown further down, the result carries a new MSSubClass_Encoded column and
# MSZoning_* indicator columns.
# NOTE(review): the original MSSubClass text column is still present in that
# preview — confirm whether it should be dropped before modelling.
data = encode_categories(data)

# Normalize Numerical Features

In [5]:
# Run the project's numeric normalization helper on the encoded frame.
# NOTE(review): in the preview below LotFrontage shows fractional values while
# LotArea and SalePrice keep their raw magnitudes — confirm which columns
# normalize_numerics is intended to cover.
data = normalize_numerics(data)

In [6]:
# Preview the first rows of the transformed frame; leaving the expression bare
# as the last line lets the notebook render it as a table.
data.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,MSSubClass_Encoded,MSZoning_Floating Village Residential,MSZoning_Residential High Density,MSZoning_Residential Low Density,MSZoning_Residential Medium Density
0,1,2-STORY 1946 & NEWER,0.377974,8450,Paved,,Regular,Near Flat/Level,All Public Utilities,Inside Lot,...,2,2008,Warranty Deed - Conventional,Normal Sale,208500,299,0.0,0.0,1.0,0.0
1,2,1-STORY 1946 & NEWER ALL STYLES,0.506829,9600,Paved,,Regular,Near Flat/Level,All Public Utilities,Frontage on 2 Sides,...,5,2007,Warranty Deed - Conventional,Normal Sale,181500,536,0.0,0.0,1.0,0.0
2,3,2-STORY 1946 & NEWER,0.403745,11250,Paved,,Slightly Irregular,Near Flat/Level,All Public Utilities,Inside Lot,...,9,2008,Warranty Deed - Conventional,Normal Sale,223500,299,0.0,0.0,1.0,0.0
3,4,2-STORY 1945 & OLDER,0.335023,9550,Paved,,Slightly Irregular,Near Flat/Level,All Public Utilities,Corner Lot,...,2,2006,Warranty Deed - Conventional,Abnormal Sale,140000,60,0.0,0.0,1.0,0.0
4,5,2-STORY 1946 & NEWER,0.541191,14260,Paved,,Slightly Irregular,Near Flat/Level,All Public Utilities,Frontage on 2 Sides,...,12,2008,Warranty Deed - Conventional,Normal Sale,250000,299,0.0,0.0,1.0,0.0


In [8]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# first (imports) cell so all dependencies are visible up front.
from utils.plotting import DataVisualizer

# Hand the transformed frame to the project's plotting helper; the plotting
# cells below go through this object.
visualizer = DataVisualizer(data)

In [None]:
# Plot the LotFrontage column on its own.
# NOTE(review): presumably a univariate distribution plot — confirm against
# DataVisualizer.plot in utils.plotting.
visualizer.plot('LotFrontage')

In [None]:
# Plot LotFrontage together with SalePrice.
# NOTE(review): presumably the first column is the x-axis and the second the
# y-axis (feature vs. target) — confirm the argument order in
# DataVisualizer.plot.
visualizer.plot('LotFrontage', 'SalePrice')