# Load data

In [1]:
import pandas as pd
# Directory prefix for every CSV this notebook reads or writes. The trailing
# slash matters: file names are appended with plain string concatenation.
path = "data_kaggle_house_prices/"

In [2]:
# Read the cleaned train and test splits from the data directory.
train, test = (
    pd.read_csv(path + file_name)
    for file_name in ('train_clean.csv', 'test_clean.csv')
)

# Partition the train columns by dtype: numeric columns vs. everything else.
# Both lists are derived from train only.
num_f = list(train.select_dtypes(include='number').columns)
cat_f = list(train.select_dtypes(exclude='number').columns)

train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Age
0,1,60,RL,65.0,8450,1,,1,Lvl,1,...,0.0,0.0,,0,2,2008,WD,Normal,208500,19
1,2,20,RL,80.0,9600,1,,1,Lvl,1,...,0.0,0.0,,0,5,2007,WD,Normal,181500,46
2,3,60,RL,68.0,11250,1,,0,Lvl,1,...,0.0,0.0,,0,9,2008,WD,Normal,223500,21
3,4,70,RL,60.0,9550,1,,0,Lvl,1,...,0.0,0.0,,0,2,2006,WD,Abnorml,140000,107
4,5,60,RL,84.0,14260,1,,0,Lvl,1,...,0.0,0.0,,0,12,2008,WD,Normal,250000,22


# Create Pipeline

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [17]:
# Numeric columns: missing values are replaced by the constant 0.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
])

# Categorical columns: missing values are replaced by the literal string
# 'None', then each column is one-hot encoded into a dense array.
# NOTE(review): the `sparse` argument was renamed `sparse_output` in newer
# scikit-learn releases - confirm against the installed version.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value='None')),
    ("ohe", OneHotEncoder(sparse=False)),
])

# Send each column group through its own pipeline; the output matrix holds
# the "num" columns first, followed by the "cat" one-hot columns.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_f),
    ("cat", categorical_transformer, cat_f),
])

In [8]:
# Fit the preprocessor on the train frame and transform it in one step.
# Despite its name, train_preprocessor holds the transformed feature matrix
# (it is wrapped in a DataFrame in the next cell), not a transformer object.
train_preprocessor = preprocessor.fit_transform(train)

In [10]:
# Wrap the transformed matrix in a labelled DataFrame. Column labels follow the
# ColumnTransformer output order: numeric columns first, then the one-hot
# columns generated by the fitted encoder.
train_ohe = preprocessor.named_transformers_['cat']['ohe']

# OneHotEncoder.get_feature_names() was removed in scikit-learn 1.2 in favour
# of get_feature_names_out(); prefer the new method and fall back to the old
# one so the cell runs on either side of that change. The encoder is fitted on
# an unnamed array, so both produce the same "x<i>_<category>" labels.
if hasattr(train_ohe, 'get_feature_names_out'):
    train_ohe_columns = train_ohe.get_feature_names_out().tolist()
else:
    train_ohe_columns = train_ohe.get_feature_names().tolist()

train_preprocessed = pd.DataFrame(train_preprocessor,
                                  columns=num_f + train_ohe_columns)
train_preprocessed.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,Street,Alley,LotShape,Utilities,OverallQual,OverallCond,...,x21_ConLw,x21_New,x21_Oth,x21_WD,x22_Abnorml,x22_AdjLand,x22_Alloca,x22_Family,x22_Normal,x22_Partial
0,1.0,60.0,65.0,8450.0,1.0,0.0,1.0,1.0,7.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2.0,20.0,80.0,9600.0,1.0,0.0,1.0,1.0,6.0,8.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3.0,60.0,68.0,11250.0,1.0,0.0,0.0,1.0,7.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4.0,70.0,60.0,9550.0,1.0,0.0,0.0,1.0,7.0,5.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5.0,60.0,84.0,14260.0,1.0,0.0,0.0,1.0,8.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [11]:
# Save the preprocessed train frame into the data directory. The index is
# written as well (to_csv default), so it comes back as an extra unnamed
# column unless the file is read with index_col=0.
train_preprocessed.to_csv(path+'train_preprocessed.csv')

In [23]:
preprocessor_test = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, list(set(num_f)-set(['SalePrice']))),
        ("cat", categorical_transformer, cat_f),
    ]
)

test_preprocessor = preprocessor_test.fit_transform(test)

In [25]:
test_preprocessed = pd.DataFrame(test_preprocessor, 
                                 columns=list(set(num_f)-set(['SalePrice'])) +\
                                 preprocessor_test.named_transformers_['cat']['ohe'].get_feature_names() \
                                 .tolist())

In [27]:
# Save the preprocessed test frame into the data directory. The index is
# written as well (to_csv default), so it comes back as an extra unnamed
# column unless the file is read with index_col=0.
test_preprocessed.to_csv(path+'test_preprocessed.csv')