In [1]:
## Feature engineering using scikit-learn pipelines

In [2]:
## Import libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer ## to fill numerical nan values
from sklearn.base import BaseEstimator,TransformerMixin # for creating Transformers
from sklearn.pipeline import Pipeline #for creating Pipelines
from sklearn.compose import ColumnTransformer # for creating ColumnTransformer

In [3]:
# Read the training data; the relative path is resolved against the working directory.
train_set = pd.read_csv('train.csv')

In [4]:
# Baseline: apply a plain SimpleImputer to the non-object columns and inspect
# the type of what it returns.
a = train_set[
    [column for column in train_set.columns if train_set[column].dtype != 'O']
]
type(SimpleImputer(strategy='median').fit_transform(a))

numpy.ndarray

In [5]:
class CustomSimpleImputer(BaseEstimator, TransformerMixin):
    """Median-impute a DataFrame and flag which values were missing.

    ``fit`` records the columns that contain missing values and fits a
    median ``SimpleImputer``. ``transform`` appends a 0/1 ``<column>_nan``
    indicator for each recorded column, imputes every column with the fitted
    medians, and drops the ``Id`` column if it is present.

    Learning the state in ``fit`` (instead of inside ``transform``) means the
    same indicator columns and the same medians are applied to any data
    transformed later, so different inputs yield the same column layout.
    """

    def fit(self, X: pd.DataFrame, y=None):
        """Record the columns containing NaN and fit the median imputer.

        ``y`` is accepted and ignored so the transformer can be used in
        pipelines that pass a target.
        """
        self.nan_features_ = [
            feature for feature in X.columns if X[feature].isnull().any()
        ]
        self.imputer_ = SimpleImputer(strategy='median').fit(self._with_nan_flags(X))
        return self

    def _with_nan_flags(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``X`` with a 0/1 ``_nan`` column per recorded feature."""
        X = X.copy()
        for feature in self.nan_features_:
            X[feature + "_nan"] = np.where(X[feature].isnull(), 1, 0)
        return X

    def transform(self, X: pd.DataFrame):
        """Return the flagged, median-imputed frame without the ``Id`` column.

        Requires a prior call to ``fit`` (or use ``fit_transform``).
        """
        X = self._with_nan_flags(X)
        # SimpleImputer returns a bare array, so the labels and the original
        # index are re-attached here.
        dataset = pd.DataFrame(
            self.imputer_.transform(X), columns=X.columns, index=X.index
        )
        # errors='ignore' keeps this usable on frames that carry no 'Id' column.
        return dataset.drop(columns='Id', errors='ignore')
        

In [6]:
## Add features holding the difference between YrSold and each other year feature.
class YearTransformer(BaseEstimator, TransformerMixin):
    """Append ``<column>Diff = YrSold - <column>`` for every year-like column.

    A column is treated as year-like when its name contains ``'Year'`` or
    ``'Yr'`` and does not contain ``'nan'``; ``YrSold`` itself is skipped.
    After ``transform`` the output column labels are available in
    ``self.columns``.
    """

    def __init__(self):
        # Output column labels; filled in by transform().
        self.columns = []

    def fit(self, X, y=None):
        """Nothing is learned; ``y`` is accepted and ignored."""
        return self

    def transform(self, X: pd.DataFrame):
        """Return a copy of ``X`` with the year-difference columns appended."""
        # Work on a copy so the caller's frame is left untouched.
        X = X.copy()
        year_features = [
            feature for feature in X.columns
            if ('Year' in feature or 'Yr' in feature) and ('nan' not in feature)
        ]
        for feature in year_features:
            if feature != 'YrSold':
                X[feature + 'Diff'] = X['YrSold'] - X[feature]
        self.columns = X.columns
        return X
    
        
        

In [7]:
# List the train_set columns whose name contains 'Year' or 'Yr' but not 'nan'.
print([
    name for name in train_set.columns
    if ('Year' in name or 'Yr' in name) and ('nan' not in name)
])
       

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']


In [8]:
## Numeric pipeline: CustomSimpleImputer first, then YearTransformer.
num_pipeline = Pipeline(steps=[
    ('imputer', CustomSimpleImputer()),
    ('year', YearTransformer()),
])

In [9]:
# Fit the numeric pipeline on the non-object columns in `a` and display the result.
num_pipeline.fit_transform(a)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,MiscVal,MoSold,YrSold,SalePrice,LotFrontage_nan,MasVnrArea_nan,GarageYrBlt_nan,YearBuiltDiff,YearRemodAddDiff,GarageYrBltDiff
0,60.0,65.0,8450.0,7.0,5.0,2003.0,2003.0,196.0,706.0,0.0,...,0.0,2.0,2008.0,208500.0,0.0,0.0,0.0,5.0,5.0,5.0
1,20.0,80.0,9600.0,6.0,8.0,1976.0,1976.0,0.0,978.0,0.0,...,0.0,5.0,2007.0,181500.0,0.0,0.0,0.0,31.0,31.0,31.0
2,60.0,68.0,11250.0,7.0,5.0,2001.0,2002.0,162.0,486.0,0.0,...,0.0,9.0,2008.0,223500.0,0.0,0.0,0.0,7.0,6.0,7.0
3,70.0,60.0,9550.0,7.0,5.0,1915.0,1970.0,0.0,216.0,0.0,...,0.0,2.0,2006.0,140000.0,0.0,0.0,0.0,91.0,36.0,8.0
4,60.0,84.0,14260.0,8.0,5.0,2000.0,2000.0,350.0,655.0,0.0,...,0.0,12.0,2008.0,250000.0,0.0,0.0,0.0,8.0,8.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60.0,62.0,7917.0,6.0,5.0,1999.0,2000.0,0.0,0.0,0.0,...,0.0,8.0,2007.0,175000.0,0.0,0.0,0.0,8.0,7.0,8.0
1456,20.0,85.0,13175.0,6.0,6.0,1978.0,1988.0,119.0,790.0,163.0,...,0.0,2.0,2010.0,210000.0,0.0,0.0,0.0,32.0,22.0,32.0
1457,70.0,66.0,9042.0,7.0,9.0,1941.0,2006.0,0.0,275.0,0.0,...,2500.0,5.0,2010.0,266500.0,0.0,0.0,0.0,69.0,4.0,69.0
1458,20.0,68.0,9717.0,5.0,6.0,1950.0,1996.0,0.0,49.0,1029.0,...,0.0,4.0,2010.0,142125.0,0.0,0.0,0.0,60.0,14.0,60.0


In [10]:
# Categorical null transformer
class CatNullTransformer(BaseEstimator,TransformerMixin):
    """Flag and fill missing values in object-dtype columns.

    ``fit`` records which object-dtype columns contain missing values.
    ``transform`` appends a 0/1 ``<column>_nan`` indicator for each recorded
    column and replaces missing values in object-dtype columns with the
    string ``'Missing'``. Columns of other dtypes are passed through.
    """

    def __init__(self):
        pass

    def fit(self, X: pd.DataFrame, y=None):
        """Record the object-dtype columns with missing values; ``y`` is ignored."""
        self.nan_features_ = [
            feature for feature in X.columns
            if X[feature].dtype == 'O' and X[feature].isnull().any()
        ]
        return self

    def transform(self, X: pd.DataFrame):
        """Return a copy of ``X`` with indicators added and missing values filled.

        Requires a prior call to ``fit`` (or use ``fit_transform``).
        """
        X = X.copy()
        # Indicators come from the columns learned in fit(), so the output
        # layout does not depend on which columns happen to hold NaN here.
        for feature in self.nan_features_:
            X[feature + '_nan'] = np.where(X[feature].isnull(), 1, 0)
        columns_to_fill = [
            feature for feature in X.columns
            if X[feature].dtype == 'O' and X[feature].isnull().any()
        ]
        for feature in columns_to_fill:
            # Assign the result back: an in-place fillna on X[feature] acts on
            # a temporary object and is not guaranteed to update X.
            X[feature] = X[feature].fillna('Missing')
        return X
        

In [11]:
# Try CatNullTransformer on the object-dtype columns of train_set only.
CatNullTransformer().fit_transform(
    train_set[[column for column in train_set.columns if train_set[column].dtype == 'O']]
)

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,BsmtFinType2_nan,Electrical_nan,FireplaceQu_nan,GarageType_nan,GarageFinish_nan,GarageQual_nan,GarageCond_nan,PoolQC_nan,Fence_nan,MiscFeature_nan
0,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0,0,1,0,0,0,0,1,1,1
1,RL,Pave,Missing,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,0,0,0,0,0,0,0,1,1,1
2,RL,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,0,0,0,0,0,0,0,1,1,1
3,RL,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,0,0,0,0,0,0,0,1,1,1
4,RL,Pave,Missing,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,0,0,0,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,...,0,0,0,0,0,0,0,1,1,1
1456,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,...,0,0,0,0,0,0,0,1,0,1
1457,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,...,0,0,0,0,0,0,0,1,0,0
1458,RL,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,...,0,0,1,0,0,0,0,1,1,1


In [12]:
# Categorical encoder
class CatEncodeTransformer(BaseEstimator,TransformerMixin):
    """One-hot encode the object-dtype columns with ``pd.get_dummies``.

    ``fit`` records the encoded column layout; ``transform`` encodes its
    input and aligns it to that layout. Dummy columns that do not occur in
    the transformed data are added filled with 0, and columns that were not
    seen during ``fit`` are dropped. The output column labels are available
    in ``self.columns``.
    """

    def __init__(self):
        # Output column labels; set by fit() and refreshed by transform().
        self.columns = []

    def _encode(self, X: pd.DataFrame) -> pd.DataFrame:
        """One-hot encode every object-dtype column of ``X``."""
        cat_features = [feature for feature in X.columns if X[feature].dtype == 'O']
        return pd.get_dummies(X, prefix=cat_features, columns=cat_features)

    def fit(self, X: pd.DataFrame, y=None):
        """Record the encoded column layout of ``X``; ``y`` is ignored."""
        self.columns_ = self._encode(X).columns
        self.columns = self.columns_
        return self

    def transform(self, X: pd.DataFrame):
        """Return ``X`` one-hot encoded and aligned to the layout seen in ``fit``.

        Requires a prior call to ``fit`` (or use ``fit_transform``).
        """
        # get_dummies only creates columns for categories present in X, so the
        # result is reindexed to the layout recorded during fit().
        dataset = self._encode(X).reindex(columns=self.columns_, fill_value=0)
        self.columns = dataset.columns
        return dataset
        

In [13]:
# Try CatEncodeTransformer on the whole training frame and display the result.
CatEncodeTransformer().fit_transform(train_set)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,0,0,1,0,0,0,0,1,0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,0,0,0,1,0,0,0,0,1,0
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,0,0,1,0,0,0,0,1,0
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,0,0,0,1,0,0,0,0,1,0


In [14]:
# Categorical pipeline: CatNullTransformer first, then CatEncodeTransformer.
cat_pipeline = Pipeline(steps=[
    ('cat_imputer', CatNullTransformer()),
    ('cat_encoder', CatEncodeTransformer()),
])

In [15]:
# Fit the categorical pipeline on the whole training frame and display the result.
cat_pipeline.fit_transform(train_set)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,62.0,7917,6,5,1999,2000,0.0,0,...,0,0,0,1,0,0,0,0,1,0
1456,1457,20,85.0,13175,6,6,1978,1988,119.0,790,...,0,0,0,1,0,0,0,0,1,0
1457,1458,70,66.0,9042,7,9,1941,2006,0.0,275,...,0,0,0,1,0,0,0,0,1,0
1458,1459,20,68.0,9717,5,6,1950,1996,0.0,49,...,0,0,0,1,0,0,0,0,1,0


In [16]:
# Split the training columns by dtype: every column that is not object dtype
# goes to num_features, all remaining columns go to cat_features.
num_features = [
    column for column in train_set.columns if train_set[column].dtype != 'O'
]
cat_features = [
    column for column in train_set.columns if column not in num_features
]

In [17]:
# Route the numeric columns through num_pipeline and the rest through cat_pipeline.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, num_features),
    ("cat", cat_pipeline, cat_features),
])

In [18]:
# Fit the combined transformer on the training data and keep its output.
transformed_train_set = full_pipeline.fit_transform(train_set)

In [19]:
# Show the container type of the ColumnTransformer output, then the values.
print(type(transformed_train_set))
transformed_train_set

<class 'numpy.ndarray'>


array([[6.000e+01, 6.500e+01, 8.450e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.000e+01, 8.000e+01, 9.600e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [6.000e+01, 6.800e+01, 1.125e+04, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [7.000e+01, 6.600e+01, 9.042e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.000e+01, 6.800e+01, 9.717e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.000e+01, 7.500e+01, 9.937e+03, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [20]:
# Collect the column labels recorded by the last step of each fitted
# sub-pipeline and join them in the ColumnTransformer order: "num", then "cat".
num_features_new = full_pipeline.named_transformers_['num']['year'].columns
cat_features_new = full_pipeline.named_transformers_['cat']['cat_encoder'].columns
new_columns = np.append(num_features_new, cat_features_new)

In [21]:
# Display the combined column labels.
new_columns

array(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
       'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice', 'LotFrontage_nan',
       'MasVnrArea_nan', 'GarageYrBlt_nan', 'YearBuiltDiff',
       'YearRemodAddDiff', 'GarageYrBltDiff', 'Alley_nan',
       'MasVnrType_nan', 'BsmtQual_nan', 'BsmtCond_nan',
       'BsmtExposure_nan', 'BsmtFinType1_nan', 'BsmtFinType2_nan',
       'Electrical_nan', 'FireplaceQu_nan', 'GarageType_nan',
       'GarageFinish_nan', 'GarageQual_nan', 'GarageCond_nan',
       'PoolQC_nan', 'Fence_nan', 'M

In [22]:
# Re-attach the recovered column labels to the transformed array.
final_train_set = pd.DataFrame(data=transformed_train_set, columns=new_columns)


In [23]:
# Write the engineered training set to disk without the index column.
final_train_set.to_csv('final_train_set.csv', index=False)

In [24]:
# Read the test data; the relative path is resolved against the working directory.
test_set = pd.read_csv('test.csv')

In [25]:
# Split the test columns by dtype, the same way the training columns are split.
num_test_features = [feature for feature in test_set.columns if test_set[feature].dtype != 'O']
# Filter against the test set's own numeric list. Filtering against the
# training list (num_features) would drop or duplicate any column whose dtype
# differs between the two files.
cat_test_features = [feature for feature in test_set.columns if feature not in num_test_features]

In [26]:
# Build a second ColumnTransformer for the test set, reusing the num_pipeline
# and cat_pipeline objects with the test column lists.
# NOTE(review): this transformer is fitted on the test data separately from
# full_pipeline, so its output columns need not match the training output.
full_test_pipeline = ColumnTransformer(transformers=[
    ("num_test", num_pipeline, num_test_features),
    ("cat_test", cat_pipeline, cat_test_features),
])

In [27]:
# Fit the test transformer on the test data and display the transformed array.
full_test_pipeline.fit_transform(test_set)

array([[2.0000e+01, 8.0000e+01, 1.1622e+04, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [2.0000e+01, 8.1000e+01, 1.4267e+04, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [6.0000e+01, 7.4000e+01, 1.3830e+04, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       ...,
       [2.0000e+01, 1.6000e+02, 2.0000e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [8.5000e+01, 6.2000e+01, 1.0441e+04, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00],
       [6.0000e+01, 7.4000e+01, 9.6270e+03, ..., 0.0000e+00, 1.0000e+00,
        0.0000e+00]])

In [28]:
# Collect the column labels recorded by the last step of each fitted test
# sub-pipeline and join them in the order "num_test", then "cat_test".
num_features_test_new = full_test_pipeline.named_transformers_['num_test']['year'].columns
cat_features_test_new = full_test_pipeline.named_transformers_['cat_test']['cat_encoder'].columns
new_test_columns = np.append(num_features_test_new, cat_features_test_new)

In [29]:
# Fit and transform the test data again, then attach the collected column labels.
test_set_final = pd.DataFrame(
    data=full_test_pipeline.fit_transform(test_set),
    columns=new_test_columns,
)

In [30]:
# Display the labelled test frame.
test_set_final

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Missing,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20.0,80.0,11622.0,5.0,6.0,1961.0,1961.0,0.0,468.0,144.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,20.0,81.0,14267.0,6.0,6.0,1958.0,1958.0,108.0,923.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,60.0,74.0,13830.0,5.0,5.0,1997.0,1998.0,0.0,791.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,60.0,78.0,9978.0,6.0,6.0,1998.0,1998.0,20.0,602.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,120.0,43.0,5005.0,8.0,5.0,1992.0,1992.0,0.0,263.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160.0,21.0,1936.0,4.0,7.0,1970.0,1970.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1455,160.0,21.0,1894.0,4.0,5.0,1970.0,1970.0,0.0,252.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1456,20.0,160.0,20000.0,5.0,7.0,1960.0,1996.0,0.0,1224.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
1457,85.0,62.0,10441.0,5.0,5.0,1992.0,1992.0,0.0,337.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# Write the engineered test set to disk; index=False leaves out the extra
# column that would otherwise hold the row index.
test_set_final.to_csv('final_test_set.csv', index=False)