### The purpose of this workbook is to prepare the pandas DataFrames for a preprocessing pipeline
Conclusion: surprisingly, removing outliers improved the leaderboard (LB) score. Of all the added features, only the polynomial features seemed to matter.

In [1]:
import os
import pickle

import numpy as np
import pandas as pd

## Remove outliers

In [2]:
from sklearn.neighbors import LocalOutlierFactor
def remove_outliers(train, y_col):
    """Drop the rows of ``train`` that LocalOutlierFactor labels as outliers.

    The detector is fitted on the numeric columns of ``train`` other than
    ``y_col``. Missing values are replaced by the column median for the fit
    only; the returned frame keeps the original values of the surviving rows.

    Args:
        train: DataFrame holding both the features and the target column.
        y_col: Name of the (numeric) target column; it is excluded from the
            detector's input.

    Returns:
        The rows of ``train`` for which ``fit_predict`` returned 1 (inlier).
    """
    detector = LocalOutlierFactor()
    # Select numeric columns explicitly: DataFrame.corr() raises on string
    # columns in pandas >= 2.0, where numeric_only defaults to False.
    numeric = train.select_dtypes(include="number")
    # The correlation ordering is kept so the detector sees the feature
    # columns in the same order as before.
    corr = numeric.corr()[y_col].sort_values(ascending=False)
    # Remove the target by name instead of assuming it sorts first.
    num_x = [col for col in corr.index if col != y_col]
    lof_df = train[num_x].fillna(train[num_x].median())
    pred = detector.fit_predict(lof_df)
    return train[pred == 1]

## Get dataset

In [3]:
# Load the train and test CSVs, drop outlier rows from train, and split train
# into features (train_x) and label (train_y).
TRAIN_CSV_URL = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/train.csv"
TEST_CSV_URL = "https://raw.githubusercontent.com/jvanelteren/housing/master/datasets/test.csv"


def get_dataset(path_train=TRAIN_CSV_URL, path_test=TEST_CSV_URL):
    """Read the datasets and return ``(train_x, train_y, final_test)``.

    The label column is the single column that is present in the train CSV
    but absent from the test CSV. Training rows flagged by
    ``remove_outliers`` are dropped first, then rows whose ``GrLivArea`` is
    4000 or more.

    Args:
        path_train: Path or URL of the training CSV.
        path_test: Path or URL of the test CSV (no label column).

    Returns:
        train_x: Training features (label column removed).
        train_y: Training label as a Series.
        final_test: The test frame exactly as read.

    Raises:
        ValueError: If train does not have exactly one column missing from test.
    """
    train = pd.read_csv(path_train)
    final_test = pd.read_csv(path_test)
    print(f'train shape{train.shape}\ntest shape {final_test.shape}\n')

    # Exactly one column may differ; picking from a set with several
    # candidates would silently choose an arbitrary one.
    test_columns = set(final_test.columns)
    label_cols = [col for col in train.columns if col not in test_columns]
    if len(label_cols) != 1:
        raise ValueError(
            f'expected exactly one label column in train, found {label_cols}'
        )
    y_col = label_cols[0]

    train = remove_outliers(train, y_col)
    train = train[train.GrLivArea < 4000]
    train_x = train.drop([y_col], axis=1)
    train_y = train.loc[:, y_col]
    print(f'train_x shape{train_x.shape}\ntrain_y shape {train_y.shape}')
    return train_x, train_y, final_test


train_x, train_y, final_test = get_dataset()

train shape(1460, 81)
test shape (1459, 80)

train_x shape(1414, 80)
train_y shape (1414,)


## Converting Y to Log

In [4]:
# Replace the target by log(1 + y). The cell reassigns train_y from itself,
# so running it a second time applies the transform again.
train_y = np.log1p(train_y)

## Remove outliers

## Order categoricals

In [280]:

# Ordinal encodings: for each listed column, map its string levels to integers.
# NOTE(review): the "No" keys only take effect where a column actually contains
# the string "No". Nothing visible in this notebook fills missing values with
# "No", so NaN entries stay NaN after encoding - confirm this is intended.
quality_with_none = {"No": 0, "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
quality = {"Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5}
basement_finish = {"No": 0, "Unf": 1, "LwQ": 2, "Rec": 3, "BLQ": 4,
                   "ALQ": 5, "GLQ": 6}

ordinal_maps = {
    "Alley": {"Grvl": 1, "Pave": 2},
    "BsmtCond": quality_with_none,
    "BsmtExposure": {"No": 0, "Mn": 1, "Av": 2, "Gd": 3},
    "BsmtFinType1": basement_finish,
    "BsmtFinType2": basement_finish,
    "BsmtQual": quality_with_none,
    "ExterCond": quality,
    "ExterQual": quality,
    "FireplaceQu": quality_with_none,
    "Functional": {"Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4, "Mod": 5,
                   "Min2": 6, "Min1": 7, "Typ": 8},
    "GarageCond": quality_with_none,
    "GarageQual": quality_with_none,
    "HeatingQC": quality,
    "KitchenQual": quality,
    "LandSlope": {"Sev": 1, "Mod": 2, "Gtl": 3},
    "LotShape": {"IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4},
    "PavedDrive": {"N": 0, "P": 1, "Y": 2},
    "PoolQC": {"No": 0, "Fa": 1, "TA": 2, "Gd": 3, "Ex": 4},
    "Street": {"Grvl": 1, "Pave": 2},
    "Utilities": {"ELO": 1, "NoSeWa": 2, "NoSewr": 3, "AllPub": 4},
}

for df in (train_x, final_test):
    for col, mapping in ordinal_maps.items():
        if col not in df.columns:
            # Same as DataFrame.replace with a nested dict, which ignores
            # entries for columns the frame does not have.
            continue
        # infer_objects() makes the object -> numeric conversion explicit;
        # relying on replace() to downcast silently is deprecated in pandas.
        df[col] = df[col].replace(mapping).infer_objects()

## Add simplification features

In [281]:
# Coarser versions of the ordinal scales: key = original grade, value = merged grade.
ten_to_three = {
    1: 1, 2: 1, 3: 1,         # bad
    4: 2, 5: 2, 6: 2,         # average
    7: 3, 8: 3, 9: 3, 10: 3,  # good
}

# NOTE(review): despite the name this yields only two levels - grades 1-3
# collapse to 1 and grades 4-5 to 2. Values are left as they were so the
# generated features do not change; confirm whether three levels were intended.
five_to_three = {
    1: 1, 2: 1, 3: 1,  # lower group
    4: 2, 5: 2,        # upper group
}

eight_to_four = {
    1: 1, 2: 1,        # bad
    3: 2, 4: 2,        # major
    5: 3, 6: 3, 7: 3,  # minor
    8: 4,              # typical
}

# NOTE(review): like five_to_three, this produces two levels, not three.
six_to_three = {
    1: 1, 2: 1, 3: 1,  # lower group
    4: 2, 5: 2, 6: 2,  # upper group
}

# (new column, source column, mapping), listed in the order the new columns
# are appended to the frames.
simplified_features = [
    ("SimplOverallQual", "OverallQual", ten_to_three),
    ("SimplOverallCond", "OverallCond", ten_to_three),
    ("SimplPoolQC", "PoolQC", five_to_three),
    ("SimplGarageCond", "GarageCond", five_to_three),
    ("SimplGarageQual", "GarageQual", five_to_three),
    ("SimplFireplaceQu", "FireplaceQu", five_to_three),
    ("SimplFunctional", "Functional", eight_to_four),
    ("SimplKitchenQual", "KitchenQual", five_to_three),
    ("SimplHeatingQC", "HeatingQC", five_to_three),
    ("SimplBsmtFinType1", "BsmtFinType1", six_to_three),
    ("SimplBsmtFinType2", "BsmtFinType2", six_to_three),
    ("SimplBsmtCond", "BsmtCond", five_to_three),
    ("SimplBsmtQual", "BsmtQual", five_to_three),
    ("SimplExterCond", "ExterCond", five_to_three),
    ("SimplExterQual", "ExterQual", five_to_three),
]

for df in (train_x, final_test):
    for new_col, src_col, mapping in simplified_features:
        # Grades that are not keys of the mapping (e.g. 0 or NaN) pass through unchanged.
        df[new_col] = df[src_col].replace(mapping)



## Add Totalizer features

In [282]:
# Combined "totalizer" features: products of related grade/size columns, a few
# sums, and two 0/1 indicator columns derived from string columns.
for df in (train_x, final_test):
    # Overall quality of the house
    df["OverallGrade"] = df["OverallQual"] * df["OverallCond"]
    # Overall quality of the garage
    df["GarageGrade"] = df["GarageQual"] * df["GarageCond"]
    # Overall quality of the exterior
    df["ExterGrade"] = df["ExterQual"] * df["ExterCond"]
    # Overall kitchen score
    df["KitchenScore"] = df["KitchenAbvGr"] * df["KitchenQual"]
    # Overall fireplace score
    df["FireplaceScore"] = df["Fireplaces"] * df["FireplaceQu"]
    # Overall garage score
    df["GarageScore"] = df["GarageArea"] * df["GarageQual"]
    # Overall pool score
    df["PoolScore"] = df["PoolArea"] * df["PoolQC"]
    # Simplified overall quality of the house
    df["SimplOverallGrade"] = df["SimplOverallQual"] * df["SimplOverallCond"]
    # Simplified overall quality of the exterior
    df["SimplExterGrade"] = df["SimplExterQual"] * df["SimplExterCond"]
    # Simplified overall pool score
    df["SimplPoolScore"] = df["PoolArea"] * df["SimplPoolQC"]
    # Simplified overall garage score
    df["SimplGarageScore"] = df["GarageArea"] * df["SimplGarageQual"]
    # Simplified overall fireplace score
    df["SimplFireplaceScore"] = df["Fireplaces"] * df["SimplFireplaceQu"]
    # Simplified overall kitchen score
    df["SimplKitchenScore"] = df["KitchenAbvGr"] * df["SimplKitchenQual"]
    # Total number of bathrooms (half baths count as 0.5)
    df["TotalBath"] = (
        df["BsmtFullBath"] + (0.5 * df["BsmtHalfBath"])
        + df["FullBath"] + (0.5 * df["HalfBath"])
    )
    # Total SF for house (incl. basement)
    df["AllSF"] = df["GrLivArea"] + df["TotalBsmtSF"]
    # Total SF for 1st + 2nd floors
    df["AllFlrsSF"] = df["1stFlrSF"] + df["2ndFlrSF"]
    # Total SF for porch
    df["AllPorchSF"] = (
        df["OpenPorchSF"] + df["EnclosedPorch"]
        + df["3SsnPorch"] + df["ScreenPorch"]
    )
    # Has masonry veneer or not.
    # infer_objects() makes the object -> numeric conversion explicit; relying
    # on replace() to downcast silently is deprecated in pandas.
    # NOTE(review): recent pandas versions appear to parse the literal string
    # "None" as a missing value in read_csv by default, in which case the
    # "None" -> 0 entry never matches and those rows stay NaN - confirm.
    df["HasMasVnr"] = df["MasVnrType"].replace(
        {"BrkCmn": 1, "BrkFace": 1, "CBlock": 1, "Stone": 1, "None": 0}
    ).infer_objects()
    # House completed before sale or not
    df["BoughtOffPlan"] = df["SaleCondition"].replace(
        {"Abnorml": 0, "Alloca": 0, "AdjLand": 0,
         "Family": 0, "Normal": 0, "Partial": 1}
    ).infer_objects()

## Add Polynomial features

In [283]:
# For each base column add its square, cube and square root.
# Each entry is (base column, suffix of the squared column, suffix of the
# cubed column); the square-root column always uses the "-Sq" suffix.
poly_specs = [
    ("OverallQual", "-s2", "-s3"),
    ("AllSF", "-2", "-3"),
    ("AllFlrsSF", "-2", "-3"),
    ("GrLivArea", "-2", "-3"),
    ("SimplOverallQual", "-s2", "-s3"),
    ("ExterQual", "-2", "-3"),
    ("GarageCars", "-2", "-3"),
    ("TotalBath", "-2", "-3"),
    ("KitchenQual", "-2", "-3"),
    ("GarageScore", "-2", "-3"),
]

for df in (train_x, final_test):
    for base, square_suffix, cube_suffix in poly_specs:
        values = df[base]
        df[base + square_suffix] = values ** 2
        df[base + cube_suffix] = values ** 3
        df[base + "-Sq"] = np.sqrt(values)

## Categorify

In [284]:
# Split the feature names into numeric and categorical (object dtype) columns.
# select_dtypes replaces the former train_x.corr() call, which was only used
# to list the numeric columns and raises on string columns in pandas >= 2.0.
num_x = list(train_x.select_dtypes(include=["number", "bool"]).columns)
cat_x = list(train_x.select_dtypes(include="object").columns)

# Every column must land in exactly one of the two groups.
assert len(train_x.columns) == len(num_x) + len(cat_x)

In [285]:
# Indices of the categorical features as the pipeline sees them: per the
# original note, categorical columns come after the numeric ones there, so
# these are the trailing positions rather than the positions in train_x.
# (A first assignment holding the train_x positions was dead code: it was
# overwritten immediately.)
cat_x_ind = list(range(len(num_x), len(train_x.columns)))
assert len(cat_x_ind) == len(cat_x)

In [286]:
# Cast the categorical columns of both frames to pandas' category dtype.
# Values are not converted to str first, so missing values remain NaN instead
# of turning into a "nan" category. The earlier `df[cat_x].apply(str)` call
# was removed: its result was discarded, so it had no effect.
for df in (train_x, final_test):
    df[cat_x] = df[cat_x].astype('category')

## Skewness correction (disabled: it doesn't seem to help)

In [287]:
# from scipy.stats import skew
# # Log transform of the skewed numerical features to lessen impact of outliers
# # Inspired by Alexandru Papiu's script : https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
# # As a general rule of thumb, a skewness with an absolute value > 0.5 is considered at least moderately skewed
# skewness = train_x[num_x].apply(lambda x: skew(x))
# skewness = skewness[abs(skewness) > 0.5]
# print(str(skewness.shape[0]) + " skewed numerical features to log transform")
# skewed_features = skewness.index
# train_x[num_x][skewed_features] = np.log1p(train_x[num_x][skewed_features])

In [289]:
# Number of feature columns in train_x after all engineered features were added.
len(train_x.columns)

144

## Saving

In [290]:
def save(train_info, filename='engineered_datasets.pickle', output_dir='output'):
    """Pickle ``train_info`` to ``<output_dir>/<filename>``.

    Args:
        train_info: Any picklable object (here: the tuple of engineered
            datasets and column lists).
        filename: Name of the pickle file inside ``output_dir``.
        output_dir: Directory to write to; created if it does not exist, so a
            fresh checkout no longer fails with FileNotFoundError.
    """
    os.makedirs(output_dir, exist_ok=True)
    with open(os.path.join(output_dir, filename), 'wb') as handle:
        pickle.dump(train_info, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Bundle everything the next stage needs into one tuple and pickle it:
# the engineered frames, the label, the numeric/categorical column names and
# the categorical column indices.
train_info = (train_x, train_y, final_test, num_x, cat_x, cat_x_ind)
save(train_info)