In [1]:
# Library imports, including MinMaxScaler from SKLearn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

In [2]:
# Optional display settings: uncomment the two lines below to show every column and row
# when inspecting wide dataframes (left disabled, so pandas' default truncation applies)

# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [3]:
# Load the cleaned train and test CSVs, each into its own dataframe

df_train_cleaned, df_test_cleaned = map(
    pd.read_csv,
    ('../data/train_cleaned.csv', '../data/test_cleaned.csv'),
)

In [4]:
# Preview the first five rows of the training dataframe to get a feel for its columns and values

df_train_cleaned.head(n=5)

Unnamed: 0,Id,lot_area,street,land_cont,lot_config,neighborhood,cond_1,cond_2,bldg_type,style,...,porch_5,misc_val,year_sold,ms_subclass,lot_shape,sale_price,has_pool,functional_mapped,bsmt_baths,porch_sf
0,109,13517,Pave,Lvl,CulDSac,Sawyer,RRA,Norm,1Fam,2Story,...,0,0,2010,60,IR1,130500,0,6,0.0,44
1,544,11492,Pave,Lvl,CulDSac,SawyerW,Norm,Norm,1Fam,2Story,...,0,0,2009,60,IR1,220000,0,6,1.0,74
2,153,7922,Pave,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,...,0,0,2010,20,Reg,109000,0,6,1.0,52
3,318,9802,Pave,Lvl,Inside,Timber,Norm,Norm,1Fam,2Story,...,0,0,2010,60,Reg,174000,0,6,0.0,100
4,255,14235,Pave,Lvl,Inside,SawyerW,Norm,Norm,1Fam,Fin,...,0,0,2010,50,IR1,138500,0,6,0.0,59


After importing the libraries and reading in the data, I ran descriptive statistics again (below) for all numerical columns. Because I planned to take the log of several features, I first needed to rescale any features that contained 0 values, since log(0) is undefined. Using the .describe() output below, I could easily identify which features needed scaling. I chose the MinMaxScaler over the StandardScaler because the StandardScaler works best when the data is roughly normally distributed, which not all of my data was; it also centers values around zero, producing negative values that cannot be logged.

In [5]:
# Summarize the numeric training columns to spot which features need
# scaling before they can be meaningfully transformed or logged

train_summary = df_train_cleaned.describe()
train_summary

Unnamed: 0,Id,lot_area,overall_qual,overall_cond,yr_built,yr_remodeled,bsmt_fin_1_sf,bsmt_fin_2_sf,bsmt_sf,gr_liv_area,...,porch_4,porch_5,misc_val,year_sold,ms_subclass,sale_price,has_pool,functional_mapped,bsmt_baths,porch_sf
count,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,...,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0,2049.0
mean,1473.676428,10024.677892,6.116642,5.569546,1971.673499,1984.166423,437.804783,47.98243,1053.035627,1495.553441,...,2.593948,16.527574,43.327965,2007.775988,57.059541,181479.018058,0.003904,5.845778,0.45754,182.494388
std,844.248064,6608.511749,1.399135,1.083961,30.171293,21.032785,439.842668,165.037776,426.81442,485.783577,...,25.241801,57.39989,434.481342,1.312533,42.868683,79295.913255,0.062378,0.658948,0.517896,158.936345
min,1.0,1300.0,3.0,3.0,1872.0,1950.0,0.0,0.0,0.0,334.0,...,0.0,0.0,0.0,2006.0,20.0,12789.0,0.0,1.0,0.0,0.0
25%,753.0,7500.0,5.0,5.0,1953.0,1964.0,0.0,0.0,793.0,1128.0,...,0.0,0.0,0.0,2007.0,20.0,129800.0,0.0,6.0,0.0,49.0
50%,1481.0,9405.0,6.0,5.0,1974.0,1993.0,368.0,0.0,994.0,1444.0,...,0.0,0.0,0.0,2008.0,50.0,162500.0,0.0,6.0,0.0,164.0
75%,2199.0,11500.0,7.0,6.0,2001.0,2004.0,733.0,0.0,1317.0,1728.0,...,0.0,0.0,0.0,2009.0,70.0,214000.0,0.0,6.0,1.0,267.0
max,2930.0,159000.0,10.0,9.0,2010.0,2010.0,2188.0,1474.0,3206.0,3820.0,...,508.0,490.0,12500.0,2010.0,190.0,611657.0,1.0,6.0,3.0,1424.0


In the following cell I performed all of my feature engineering operations. I started by applying a linear shift (+1) to certain columns so that I could either take their log or use them in an interaction term. I used a simple linear shift for these columns because they are all small counts (values less than ten) of each feature. I then used the MinMaxScaler on features that I wanted to log or interact with but that have much wider ranges or possible outliers. Next, I created interaction features by multiplying (or, in one case, adding) features that I thought might work well together. Lastly, I logged features with a large spread or possible outliers so I could examine them when testing my model in the next notebook.

In [6]:
# Feature engineering function
# https://www.analyticsvidhya.com/blog/2020/07/types-of-feature-transformation-and-scaling/
# https://stackoverflow.com/questions/45554008/error-in-python-script-expected-2d-array-got-1d-array-instead

def feat_eng(df, fit_df=None):
    """Add shifted, min-max scaled, combined and logged feature columns to ``df``.

    The new columns are written onto ``df`` itself (the frame is modified in
    place) and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to engineer. It must contain the columns ``full_bath``,
        ``half_bath``, ``bedrooms_gr``, ``fireplaces``, ``garage_car_size``,
        ``lot_area``, ``bsmt_fin_1_sf``, ``bsmt_fin_2_sf``, ``bsmt_sf``,
        ``gr_liv_area``, ``tot_rooms_gr``, ``overall_qual``, ``yr_built`` and
        ``functional_mapped``.
    fit_df : pandas.DataFrame, optional
        Frame whose column minima and maxima the MinMaxScaler is fitted on.
        Defaults to ``df`` itself, which reproduces the original behaviour.
        Pass the training frame when engineering test data so both sets share
        the same scaling. Values of ``df`` outside the fitted range are mapped
        outside [1, 2].

    Returns
    -------
    pandas.DataFrame
        ``df`` with the engineered columns appended.
    """
    # Frame that supplies the min/max statistics for scaling
    reference = df if fit_df is None else fit_df

    # Linear shift (+1) to get rid of 0s in the count features
    for col in ('full_bath', 'half_bath', 'bedrooms_gr', 'fireplaces', 'garage_car_size'):
        df[col + '_shift'] = df[col] + 1

    # MinMaxScale onto (1, 2) so the scaled values are strictly positive and can be logged
    for col in ('lot_area', 'bsmt_fin_1_sf', 'bsmt_fin_2_sf', 'bsmt_sf', 'gr_liv_area'):
        scaler = MinMaxScaler(feature_range=(1, 2)).fit(reference[[col]])
        # The scaler takes/returns 2D data; flatten back to one value per row
        df[col + '_scaled_mm'] = np.ravel(scaler.transform(df[[col]]))

    # Combined features (products, plus one sum)
    df['tot_rooms_gr_gr_liv_area'] = df['tot_rooms_gr'] * df['gr_liv_area']
    df['tot_rooms_gr_bedroom_gr'] = df['tot_rooms_gr'] * df['bedrooms_gr_shift']
    df['full_bath_gr_liv_area'] = df['full_bath_shift'] * df['gr_liv_area']
    df['gar_car_size_overall_qual'] = df['garage_car_size_shift'] * df['overall_qual']
    df['yr_built_overall_qual'] = df['yr_built'] * df['overall_qual']
    # This one adds rather than multiplies: basement plus above-ground area
    df['bsmt_sf_gr_liv_area'] = df['bsmt_sf'] + df['gr_liv_area']
    df['functional_mapped_overall_qual'] = df['functional_mapped'] * df['overall_qual']

    # Natural log of selected features (order kept so the output column order is unchanged)
    log_cols = (
        'full_bath_gr_liv_area',
        'tot_rooms_gr_bedroom_gr',
        'lot_area_scaled_mm',
        'gr_liv_area_scaled_mm',
        'bsmt_fin_1_sf_scaled_mm',
        'bsmt_fin_2_sf_scaled_mm',
        'bsmt_sf_scaled_mm',
        'bedrooms_gr_shift',
        'tot_rooms_gr',
        'tot_rooms_gr_gr_liv_area',
        'lot_area',
        'gr_liv_area',
    )
    for col in log_cols:
        df[col + '_log'] = np.log(df[col])

    return df

In [7]:
# Engineer features on the training data with feat_eng(), then preview the first five rows

df_train_cleaned_eng = feat_eng(df=df_train_cleaned)
df_train_cleaned_eng.head(n=5)

Unnamed: 0,Id,lot_area,street,land_cont,lot_config,neighborhood,cond_1,cond_2,bldg_type,style,...,lot_area_scaled_mm_log,gr_liv_area_scaled_mm_log,bsmt_fin_1_sf_scaled_mm_log,bsmt_fin_2_sf_scaled_mm_log,bsmt_sf_scaled_mm_log,bedrooms_gr_shift_log,tot_rooms_gr_log,tot_rooms_gr_gr_liv_area_log,lot_area_log,gr_liv_area_log
0,109,13517,Pave,Lvl,CulDSac,Sawyer,RRA,Norm,1Fam,2Story,...,0.074616,0.284018,0.218012,0.0,0.20387,1.386294,1.791759,9.090881,9.511703,7.299121
1,544,11492,Pave,Lvl,CulDSac,SawyerW,Norm,Norm,1Fam,2Story,...,0.062626,0.414034,0.25552,0.0,0.250586,1.609438,2.079442,9.739556,9.349406,7.660114
2,153,7922,Pave,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,...,0.041133,0.18847,0.288253,0.0,0.284949,1.386294,1.609438,8.572628,8.977399,6.96319
3,318,9802,Pave,Lvl,Inside,Timber,Norm,Norm,1Fam,2Story,...,0.052509,0.276431,0.0,0.0,0.113128,1.386294,1.94591,9.221082,9.190342,7.275172
4,255,14235,Pave,Lvl,Inside,SawyerW,Norm,Norm,1Fam,Fin,...,0.078832,0.276649,0.0,0.0,0.191326,1.386294,1.791759,9.067624,9.563459,7.275865


In [8]:
# Engineer features on the test data with feat_eng(), then preview the first five rows

df_test_cleaned_eng = feat_eng(df=df_test_cleaned)
df_test_cleaned_eng.head(n=5)

Unnamed: 0,Id,lot_area,street,land_cont,lot_config,neighborhood,cond_1,cond_2,bldg_type,style,...,lot_area_scaled_mm_log,gr_liv_area_scaled_mm_log,bsmt_fin_1_sf_scaled_mm_log,bsmt_fin_2_sf_scaled_mm_log,bsmt_sf_scaled_mm_log,bedrooms_gr_shift_log,tot_rooms_gr_log,tot_rooms_gr_gr_liv_area_log,lot_area_log,gr_liv_area_log
0,2658,9142,Pave,Lvl,Inside,OldTown,Norm,Norm,2fmCon,2Story,...,0.035229,0.317582,0.0,0.0,0.327743,1.609438,2.197225,9.761463,9.120634,7.564238
1,2718,9662,Pave,Lvl,Inside,Sawyer,Norm,Norm,Duplex,1Story,...,0.037574,0.324535,0.0,0.0,0.55842,1.94591,2.302585,9.88685,9.175956,7.584265
2,2414,17104,Pave,Lvl,Inside,Gilbert,Norm,Norm,1Fam,2Story,...,0.070554,0.237152,0.21683,0.0,0.222078,1.386294,1.94591,9.25646,9.747068,7.31055
3,1989,8520,Pave,Lvl,Inside,OldTown,Norm,Norm,1Fam,1Story,...,0.032416,0.12916,0.0,0.0,0.313394,1.098612,1.609438,8.48467,9.050172,6.875232
4,625,9500,Pave,Lvl,Inside,NAmes,Norm,Norm,1Fam,1Story,...,0.036844,0.217178,0.235998,0.0,0.425293,1.386294,1.791759,9.031692,9.159047,7.239933


After running my training and testing data through the feat_eng() function, I saved the output to new CSV files.

In [9]:
# Export the engineered dataframes to CSVs for model selection and tuning.
# Write the frames returned by feat_eng() explicitly rather than relying on
# feat_eng() having modified the cleaned frames in place.

df_test_cleaned_eng.to_csv('../data/test_cleaned_eng.csv', index=False)
df_train_cleaned_eng.to_csv('../data/train_cleaned_eng.csv', index=False)