In [1]:
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

warnings.filterwarnings('ignore')

### Preview Data

In [2]:
# Load the training CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer='Housing_Prices/train.csv')

# Preview the first five rows (rendered as the cell's output)
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Create Train and Test Sets

In [3]:
# Split the raw frame into the target and the predictors.
# y stays a one-column DataFrame; 'Id' is dropped together with the target.
y = data[['SalePrice']]
X = data.drop(columns=['SalePrice', 'Id'])

# Object-dtype columns are treated as categorical, every other column as continuous
cat_features = [name for name, dtype in X.dtypes.items() if dtype == object]
cont_features = [name for name, dtype in X.dtypes.items() if dtype != object]

# Sanity check: size of each group followed by its members in alphabetical order
print(f"Categorical Features: {len(cat_features)}")
print(sorted(cat_features))
print()
print(f"Continuous Features: {len(cont_features)}")
print(sorted(cont_features))

Categorical Features: 43
['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities']

Continuous Features: 36
['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'MiscVal', 'MoSold', 'OpenPorchSF', 'OverallCo

### Isolate Validation and Test Sets

In [4]:
# Separate the target (as a one-column DataFrame) from the predictors
y = data[['SalePrice']]
X = data.drop(columns=['SalePrice', 'Id'])

# First hold out 10% of all rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
# ... then split 20% of the remaining rows off as the validation set.
# Both calls use the same fixed random_state so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

In [5]:
# Make sure the output directory exists before the first write into it:
# DataFrame.to_csv does not create missing parent directories, so on a fresh
# checkout this cell (and the later save cells) would otherwise fail.
Path('data').mkdir(parents=True, exist_ok=True)

# Save the target of each split; index=False keeps the row index out of the file
y_train.to_csv('data/y_train.csv', index=False)
y_val.to_csv('data/y_val.csv', index=False)
y_test.to_csv('data/y_test.csv', index=False)

#### Numeric Features

In [6]:
# Continuous features are every column of X whose dtype is not object
cont_features = [name for name, dtype in X.dtypes.items() if dtype != object]

# Keep only those columns in the training, validation and test splits
X_train_cont = X_train[cont_features]
X_val_cont = X_val[cont_features]
X_test_cont = X_test[cont_features]

# Median imputation: the medians are learned from the training split only and
# then applied unchanged to the validation and test splits.
impute = SimpleImputer(strategy='median')
impute.fit(X_train_cont)

# Wrap the imputed values back into DataFrames with the original column names
X_train_numeric = pd.DataFrame(impute.transform(X_train_cont), columns=cont_features)
X_val_numeric = pd.DataFrame(impute.transform(X_val_cont), columns=cont_features)
X_test_numeric = pd.DataFrame(impute.transform(X_test_cont), columns=cont_features)

In [7]:
# Write each imputed numeric split to disk without the index column
for _frame, _path in (
    (X_train_numeric, 'data/X_train_numeric.csv'),
    (X_val_numeric, 'data/X_val_numeric.csv'),
    (X_test_numeric, 'data/X_test_numeric.csv'),
):
    _frame.to_csv(_path, index=False)

#### Categorical Features

In [8]:
# Categorical features are the object-dtype columns of X
cat_features = [col for col in X.columns if X[col].dtype in [object]]

# Select the categorical columns of each split and replace missing values with
# the literal string 'missing', so "no value" is treated as its own category.
# fillna is used in its non-inplace form: it returns a new frame, so nothing is
# mutated in place on a selection taken from X_train / X_val / X_test.
X_train_cat = X_train.loc[:, cat_features].fillna(value='missing')
X_val_cat = X_val.loc[:, cat_features].fillna(value='missing')
X_test_cat = X_test.loc[:, cat_features].fillna(value='missing')

In [9]:
# One-hot encode the categorical variables.
#   drop='if_binary'                      -> two-category features get a single column
#   sparse_output=False                   -> return a dense array
#   handle_unknown='infrequent_if_exist'  -> categories unseen during fit go to the
#                                            infrequent bucket when one exists
#   min_frequency=0.01                    -> categories in under 1% of rows are infrequent
#   max_categories=8                      -> cap on output columns per input feature
ohe = OneHotEncoder(drop='if_binary',
                    sparse_output=False,
                    handle_unknown='infrequent_if_exist',
                    min_frequency=0.01,
                    max_categories=8)

# Fit on the training split only, then apply the same encoding to validation and test
X_train_ohe = ohe.fit_transform(X_train_cat)
X_val_ohe = ohe.transform(X_val_cat)
X_test_ohe = ohe.transform(X_test_cat)

# Wrap the encoded arrays in DataFrames labelled with the encoder's feature
# names, so the saved files carry readable column headers.
# NOTE: this overwrites X_train_cat / X_val_cat / X_test_cat, so re-run the
# previous cell before re-running this one.
cat_columns = ohe.get_feature_names_out(input_features=X_train_cat.columns)
X_train_cat = pd.DataFrame(X_train_ohe, columns=cat_columns)
X_val_cat = pd.DataFrame(X_val_ohe, columns=cat_columns)
X_test_cat = pd.DataFrame(X_test_ohe, columns=cat_columns)

# Row counts of the validation, training and test splits
print(len(X_val_cat))
print(len(X_train_cat))
print(len(X_test_cat))

In [10]:
# Write each one-hot encoded split to disk without the index column
for _frame, _path in (
    (X_val_cat, 'data/X_val_cat.csv'),
    (X_train_cat, 'data/X_train_cat.csv'),
    (X_test_cat, 'data/X_test_cat.csv'),
):
    _frame.to_csv(_path, index=False)

263
1051
146
