# Feature Engineering

In [9]:
import pandas as pd
import copy
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


pd.set_option('display.max_columns', None)

In [10]:
df = pd.read_csv('../data/raw/train.csv')
test = pd.read_csv('../data/raw/test.csv')
print(df.shape)
print(test.shape)

(1460, 81)
(1459, 80)


In [11]:
# Print missing values in test BsmtFinSF1
print(test[test['BsmtFinSF1'].isnull()])

       Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
660  2121          20       RM         99.0     5940   Pave   NaN      IR1   

    LandContour Utilities LotConfig LandSlope Neighborhood Condition1  \
660         Lvl    AllPub       FR3       Gtl      BrkSide      Feedr   

    Condition2 BldgType HouseStyle  OverallQual  OverallCond  YearBuilt  \
660       Norm     1Fam     1Story            4            7       1946   

     YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType  \
660          1950     Gable  CompShg     MetalSd      CBlock        NaN   

     MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure  \
660         0.0        TA        TA      PConc      NaN      NaN          NaN   

    BsmtFinType1  BsmtFinSF1 BsmtFinType2  BsmtFinSF2  BsmtUnfSF  TotalBsmtSF  \
660          NaN         NaN          NaN         NaN        NaN          NaN   

    Heating HeatingQC CentralAir Electrical  1stFlrSF  2ndFlrSF  Low

In [12]:
# Print column names that don't match
print(set(df.columns) - set(test.columns))

{'SalePrice'}


In [13]:
# See if the value 'TenC' appears in the MiscFeature column of either df
print('TenC' in df['MiscFeature'].values)
print('TenC' in test['MiscFeature'].values)

True
False


In [14]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [15]:
def clean(df, enc=None, fit_enc=False):
    # One-hot encoding for specified categorical columns
    one_hot_columns = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
                    'LotConfig', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation',
                    'BsmtFinType1', 'BsmtFinType2', 'GarageFinish', 'Exterior1st', 'Exterior2nd', 'MiscFeature',
                    'Heating', 'Electrical', 'GarageType', 'SaleType', 'SaleCondition', 'Neighborhood']
    one_hot_df = df[one_hot_columns]
    df = df.drop(columns=one_hot_columns)

    # Only fit the encoder on the training set to preseve columns in the test set
    if fit_enc:
        enc.fit(one_hot_df)
    one_hot_encoded = enc.transform(one_hot_df)
    one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=enc.get_feature_names_out(one_hot_columns))
    # Reset index on df to avoid issues when concatenating
    df.reset_index(drop=True, inplace=True)
    one_hot_encoded_df.reset_index(drop=True, inplace=True)
    df = pd.concat([df, one_hot_encoded_df], axis=1)

    # Define mappings for each feature
    land_slope_mapping = {'Gtl': 1, 'Mod': 2, 'Sev': 3}
    heating_qc_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    exter_cond_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    exter_qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    kitchen_qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
    fireplace_qu_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
    garage_qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
    garage_cond_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
    paved_drive_mapping = {'Y': 2, 'P': 1, 'N': 0}
    pool_qc_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'NA': 0}
    fence_mapping = {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0}
    functional_mapping = {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0}
    bsmt_qual_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
    bsmt_exposure_mapping = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0}
    bsmt_cond_mapping = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}

    # Apply mappings to the dataframe
    df['LandSlope'] = df['LandSlope'].map(land_slope_mapping)
    df['HeatingQC'] = df['HeatingQC'].map(heating_qc_mapping)
    df['ExterCond'] = df['ExterCond'].map(exter_cond_mapping)
    df['ExterQual'] = df['ExterQual'].map(exter_qual_mapping)
    df['KitchenQual'] = df['KitchenQual'].map(kitchen_qual_mapping)
    df['FireplaceQu'] = df['FireplaceQu'].map(fireplace_qu_mapping).fillna(0)  # Assuming NA means no fireplace
    df['GarageQual'] = df['GarageQual'].map(garage_qual_mapping).fillna(0)    # Assuming NA means no garage
    df['GarageCond'] = df['GarageCond'].map(garage_cond_mapping).fillna(0)    # Assuming NA means no garage
    df['PavedDrive'] = df['PavedDrive'].map(paved_drive_mapping)
    df['PoolQC'] = df['PoolQC'].map(pool_qc_mapping).fillna(0)                # Assuming NA means no pool
    df['Fence'] = df['Fence'].map(fence_mapping).fillna(0)                    # Assuming NA means no fence
    df['Functional'] = df['Functional'].map(functional_mapping)
    df['BsmtQual'] = df['BsmtQual'].map(bsmt_qual_mapping).fillna(0)  # Assuming NA means no basement
    df['BsmtExposure'] = df['BsmtExposure'].map(bsmt_exposure_mapping).fillna(0)  # Assuming NA means no basement
    df['BsmtCond'] = df['BsmtCond'].map(bsmt_cond_mapping).fillna(0)  # Assuming NA means no basement

    # Combining Condition1 and Condition2 into a combined one-hot
    condition1_dummies = pd.get_dummies(df['Condition1'], prefix='Condition')
    condition2_dummies = pd.get_dummies(df['Condition2'], prefix='Condition')

    # Since we're interested in whether each condition is present regardless of being in Condition1 or Condition2,
    # we add the dummies together and clip the values to 1 to ensure binary representation
    combined_conditions = condition1_dummies.add(condition2_dummies, fill_value=0).clip(upper=1)
    df = df.drop(['Condition1', 'Condition2'], axis=1)
    df = df.join(combined_conditions)

    # FullBath and BsmtFullBath combined
    df['TotalFullBath'] = df['FullBath'] + df['BsmtFullBath']
    df.drop(['FullBath', 'BsmtFullBath'], axis=1, inplace=True)

    # BsmtHalfBath and HalfBath combined
    df['TotalHalfBath'] = df['HalfBath'] + df['BsmtHalfBath']
    df.drop(['HalfBath', 'BsmtHalfBath'], axis=1, inplace=True)

    # Adding binary features for certain areas
    for column in ['WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea']:
        df[f'{column}_Present'] = df[column].apply(lambda x: 1 if x > 0 else 0)

    # GarageYrBlt set missing values to 0
    df['GarageYrBlt'].fillna(0, inplace=True)

    # Summing square footage features
    df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'] + df['GrLivArea']
    df.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea'], axis=1, inplace=True)

    # Impute LotFrontage with median
    imputer = SimpleImputer(strategy='median')
    df['LotFrontage'] = imputer.fit_transform(df[['LotFrontage']])

    # MoSold and YrSold to be combined into a single feature
    df['SaleDate'] = df['YrSold'] + df['MoSold'] / 12
    df.drop(['MoSold', 'YrSold'], axis=1, inplace=True)

    # CentralAir is binary
    df['CentralAir'] = df['CentralAir'].map({'Y': 1, 'N': 0})

    # Convert MasVnrArea null values to 0
    df['MasVnrArea'].fillna(0, inplace=True)

    return df


In [16]:
encoder = OneHotEncoder(handle_unknown='ignore')
df = clean(df, enc=encoder, fit_enc=True)
df.head()

ValueError: Shape of passed values is (1460, 1), indices imply (1460, 190)

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

In [None]:
# Check for non-numerical columns
non_numerical = df.select_dtypes(exclude=['number', 'bool']).columns
print(non_numerical)

In [None]:
test = clean(test, encoder, False)
col_means = test.mean()
test.fillna(col_means, inplace=True)
test.to_csv('../data/processed/test_cleaned.csv', index=False)
missing_values = test.isnull().sum()
missing_values = missing_values[missing_values > 0]
print(missing_values)

In [None]:
# Print the row number of each of the missing values
for col in test.columns:
    # Find rows where the column is null/NaN
    missing_rows = test.index[test[col].isnull()].tolist()
    
    # Print row numbers (indexes) for missing values, if any
    if missing_rows:
        print(f"Missing values in column '{col}': {missing_rows}")

In [None]:
print(df.shape)
print(test.shape)

In [None]:
# Print column names that don't match
print(set(df.columns) - set(test.columns))

In [None]:
# Save the cleaned dataframe
df.to_csv('../data/processed/train_cleaned.csv', index=False)
