In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import Pipeline, FeatureUnion

import functions as f
import classes as c

In [2]:
# Load the training data and use the "Id" column as the row index.
train_df = pd.read_csv("datasets/train.csv").set_index("Id")
# Re-assign NaN to every null cell so missing values share one marker.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
train_df[pd.isnull(train_df)] = np.nan

# Separate the features from the target column.
X = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"].copy()

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
# Column groups used to route each feature to the right preprocessing branch.
# The order inside each list is significant: it fixes the column order the
# selectors hand to the downstream transformers.
drop_col = []

# Unordered categorical features.
nominal_col = [
    "MSSubClass", "MSZoning", "LandContour", "LotConfig", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle",
    "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation",
    "Heating", "Electrical", "GarageType", "Fence", "MiscFeature",
    "SaleType", "SaleCondition",
]

# Two-valued categorical features.
binary_col = ["Street", "CentralAir"]

# Categorical features with an inherent ranking.
ordinal_col = [
    "Alley", "LotShape", "Utilities", "LandSlope", "OverallQual",
    "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
    "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "KitchenQual",
    "Functional", "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond",
    "PavedDrive", "PoolQC",
]

# Integer counts (rooms, baths, ...).
count_col = [
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
]

# Continuous measurements (areas, lengths, monetary value).
interval_col = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

# Year / month features.
time_col = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "MoSold", "YrSold"]

# Columns singled out as having missing values.
miss_vall_col = ["LotFrontage", "MasVnrArea", "MasVnrType", "Electrical"]

# Category listings for the categorical groups, taken from the full training
# frame; they are passed to OneHotEncoder(categories=...) further down.
# NOTE(review): the exact return shape of f.unique_values is not visible here;
# presumably one list of values per column -- confirm against functions.py.
nominal_col_cats, binary_col_cats, ordinal_col_cats = (
    f.unique_values(train_df[group])
    for group in (nominal_col, binary_col, ordinal_col)
)



In [None]:
# Sanity check: the feature groups plus the target are compared against the
# columns of train_df, so that no column is left unassigned.
# NOTE(review): f.lists_eq is defined in functions.py; presumably it reports
# whether both collections hold the same items -- confirm.
f.lists_eq(
    train_df.columns,
    nominal_col + binary_col + ordinal_col + count_col + interval_col + time_col + ["SalePrice"],
)

In [7]:
def _dense_one_hot_encoder(categories):
    """Build a OneHotEncoder that emits a dense array on any scikit-learn version.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old spelling, so the new keyword is tried first and the
    legacy one is used only when the installed version rejects it.

    Parameters
    ----------
    categories : list
        Per-column category listings forwarded to ``OneHotEncoder(categories=...)``.

    Returns
    -------
    OneHotEncoder
        An unfitted encoder configured for dense output.
    """
    try:
        return OneHotEncoder(categories=categories, sparse_output=False)
    except TypeError:
        # Older scikit-learn (< 1.2) only understands the ``sparse`` keyword.
        return OneHotEncoder(categories=categories, sparse=False)


# Numeric branch: select count/interval/time columns and standardise them.
count_interval_time_pip = Pipeline([
    ("selector", c.DataFrameSelector(columns=interval_col + count_col + time_col)),
    ("standard", StandardScaler()),
])

# Nominal branch: one-hot encode with the category lists computed beforehand.
nominal_pip = Pipeline([
    ("selector", c.DataFrameSelector(columns=nominal_col)),
    ("one_hot_encoder", _dense_one_hot_encoder(nominal_col_cats)),
])

# Binary + ordinal branch: also one-hot encoded.
# NOTE(review): one-hot encoding discards the ranking of the ordinal columns --
# confirm this is intended rather than an ordinal encoding.
binary_ordinal_pip = Pipeline([
    ("selector", c.DataFrameSelector(columns=binary_col + ordinal_col)),
    ("one_hot_encoder", _dense_one_hot_encoder(binary_col_cats + ordinal_col_cats)),
])

# Imputation stage: runs on every feature column before the branches above.
# The imputers are project classes from classes.py; their behaviour is not
# visible in this notebook.
missing_vallue_pip = Pipeline([
    ("selector", c.DataFrameSelector(columns=X_train.columns)),
    ("mas_vnr_imputer", c.MasVnrImputer()),
    ("mean_imputer", c.DataFrameImputer(strategy="mean", columns=["LotFrontage"])),
    ("most_frequent_imputer", c.DataFrameImputer(strategy="most_frequent", columns=["Electrical"])),
    ("const_imputer", c.ConstantImputer(columns=nominal_col + binary_col + ordinal_col)),
])


In [8]:
# Complete preprocessing: impute missing values first, then run the three
# column-group branches side by side and concatenate their outputs.
full_pip = Pipeline(
    steps=[
        ("missing_vallue", missing_vallue_pip),
        (
            "feature_union",
            FeatureUnion(
                transformer_list=[
                    ("count_interval_time", count_interval_time_pip),
                    ("nominal", nominal_pip),
                    ("binary_ordinal_pip", binary_ordinal_pip),
                ]
            ),
        ),
    ]
)

In [20]:
# Fit the full preprocessing pipeline on the training split only, then apply
# the already-fitted pipeline to the held-out split.
# NOTE(review): the saved output of this cell is
# "ValueError: Must pass DataFrame with boolean values only"; the failing code
# is not visible in this notebook (likely in one of the project transformers) --
# investigate before relying on X_train_tr / X_test_tr.
X_train_tr = full_pip.fit_transform(X_train)
X_test_tr = full_pip.transform(X_test)

ValueError: Must pass DataFrame with boolean values only

In [11]:
# Run only the imputation stage on the training features to inspect its output.
# Note that this refits missing_vallue_pip, the same object used inside full_pip.
missing_vallue_pip.fit_transform(X_train)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
616,85,RL,80.000000,8800,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,5,2010,WD,Abnorml
614,20,RL,70.000000,8402,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,12,2007,New,Partial
1304,20,RL,73.000000,8688,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Normal
487,20,RL,79.000000,10289,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2007,WD,Normal
562,20,RL,77.000000,10010,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Normal
309,30,RL,70.284277,12342,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,3,2009,WD,Normal
462,70,RL,60.000000,7200,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,4,2009,WD,Normal
1143,60,RL,77.000000,9965,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2007,New,Partial
731,120,RL,39.000000,5389,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,3,2010,WD,Normal
1156,20,RL,90.000000,10768,Pave,,IR1,Lvl,AllPub,Corner,...,180,0,,,,0,7,2007,WD,Normal


In [19]:
# Names of the training-feature columns stored with the object dtype.
X_train.select_dtypes(include=[object]).columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [17]:
# Display the raw training features, for comparison with the imputed frame.
X_train

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
616,85,RL,80.0,8800,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,5,2010,WD,Abnorml
614,20,RL,70.0,8402,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,12,2007,New,Partial
1304,20,RL,73.0,8688,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Normal
487,20,RL,79.0,10289,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2007,WD,Normal
562,20,RL,77.0,10010,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Normal
309,30,RL,,12342,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,3,2009,WD,Normal
462,70,RL,60.0,7200,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,4,2009,WD,Normal
1143,60,RL,77.0,9965,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2007,New,Partial
731,120,RL,39.0,5389,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,3,2010,WD,Normal
1156,20,RL,90.0,10768,Pave,,IR1,Lvl,AllPub,Corner,...,180,0,,,,0,7,2007,WD,Normal
