In [None]:
%load_ext autoreload
%autoreload 2

import copy
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.pipeline import Pipeline, FeatureUnion

import functions as f
import classes as c

In [None]:
# Load the training data and use the "Id" column as the row index.
train_df = pd.read_csv("datasets/train.csv").set_index("Id")
# Normalise every null-like entry to NaN. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
train_df[pd.isnull(train_df)] = np.nan

# Separate the predictors from the target column.
X = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"].copy()

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Column names grouped by the kind of preprocessing applied to them later on.

# Columns to discard (none at the moment).
drop_col = []

# Categorical columns without an order.
nominal_col = [
    "MSSubClass", "MSZoning", "LandContour", "LotConfig",
    "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
    "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
    "Foundation", "Heating", "Electrical", "GarageType", "Fence", "MiscFeature",
    "SaleType", "SaleCondition",
]

# Two-valued categorical columns.
binary_col = ["Street", "CentralAir"]

# Categorical / rating columns with an order.
ordinal_col = [
    "Alley", "LotShape", "Utilities", "LandSlope", "OverallQual", "OverallCond",
    "ExterQual", "ExterCond", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
    "BsmtFinType2", "HeatingQC", "KitchenQual", "Functional", "FireplaceQu",
    "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", "PoolQC",
]

# Integer counts (rooms, baths, ...).
count_col = [
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
    "TotRmsAbvGrd", "Fireplaces", "GarageCars",
]

# Continuous measurements (areas, lengths, monetary values).
interval_col = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch",
    "ScreenPorch", "PoolArea", "MiscVal",
]

# Year / month columns.
time_col = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "MoSold", "YrSold"]

# Columns singled out for dedicated missing-value handling.
miss_vall_col = ["LotFrontage", "MasVnrArea", "MasVnrType", "Electrical"]

In [None]:
# Compare the frame's columns against the concatenation of every column
# group plus the target, using the project helper `f.lists_eq`.
f.lists_eq(
    train_df.columns,
    [
        *nominal_col,
        *binary_col,
        *ordinal_col,
        *count_col,
        *interval_col,
        *time_col,
        "SalePrice",
    ],
)

In [None]:
def make_dense_one_hot_encoder():
    """Build a OneHotEncoder that returns dense arrays and ignores unseen categories.

    ``handle_unknown="ignore"`` lets the fitted encoder transform rows whose
    category did not occur in the fitting data (they encode as all zeros)
    instead of raising, which matters because the pipelines are fitted on a
    train split and rare levels may only appear in the held-out rows.

    The dense-output keyword was renamed from ``sparse`` to ``sparse_output``
    in scikit-learn 1.2 and the old spelling was removed in 1.4, so the new
    name is tried first and the old one is used as a fallback.

    Returns:
        An unfitted ``OneHotEncoder`` instance.
    """
    options = {"handle_unknown": "ignore"}
    try:
        return OneHotEncoder(sparse_output=False, **options)
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        return OneHotEncoder(sparse=False, **options)


# Numeric branch: pick the interval, count and time columns, then standardise them.
count_interval_time_pip = Pipeline([
                ("selector", c.DataFrameSelector(columns=interval_col+count_col+time_col)),
                ("standard", StandardScaler())
            ])
# Nominal branch: pick the nominal columns, run the project imputer with the
# "constant" strategy, then one-hot encode.
nominal_pip = Pipeline([
                ("selector", c.DataFrameSelector(columns=nominal_col)),
                ("imputer", c.DataFrameImputer(strategy="constant", columns=nominal_col)),
                ("one_hot_encoder", make_dense_one_hot_encoder())
            ])
# Binary + ordinal branch: same treatment as the nominal branch, so ordinal
# levels are one-hot encoded rather than mapped to ranks.
binary_ordinal_pip = Pipeline([
                ("selector", c.DataFrameSelector(columns=binary_col+ordinal_col)),
                ("imputer", c.DataFrameImputer(strategy="constant", columns=binary_col+ordinal_col)),
                ("one_hot_encoder", make_dense_one_hot_encoder())
            ])
# Targeted missing-value handling applied to all columns of X_train before the
# branches above run. `c.MasVnrImputer` is a project class; presumably it fills
# the MasVnr* columns -- TODO confirm against classes.py.
missing_vallue_pip = Pipeline([
                ("selector", c.DataFrameSelector(columns=X_train.columns)),
                ("mas_vnr_imputer", c.MasVnrImputer()),
                ("mean_imputer", c.DataFrameImputer(strategy="mean", columns=["LotFrontage"])),
                ("most_frequent_imputer", c.DataFrameImputer(strategy="most_frequent", columns=["Electrical"]))
            ])


In [None]:
# Complete preprocessing: run the missing-value pipeline first, then feed its
# output to the three encoding branches and concatenate their results.
full_pip = Pipeline(steps=[
    ("missing_vallue", missing_vallue_pip),
    ("feature_union", FeatureUnion(transformer_list=[
        ("count_interval_time", count_interval_time_pip),
        ("nominal", nominal_pip),
        ("binary_ordinal_pip", binary_ordinal_pip),
    ])),
])

In [None]:
# Fit the whole preprocessing pipeline on the training split and display the
# transformed feature matrix as the cell output.
full_pip.fit_transform(X_train)

In [None]:
# Inspect the numeric branch on the full training frame.
# `count_interval_time_pip` is the very object embedded in `full_pip`, so
# calling fit_transform on it directly would overwrite the scaler statistics
# that `full_pip` learned from X_train (and leak the held-out rows into them).
# Fitting a deep copy gives the same output while leaving `full_pip` untouched.
copy.deepcopy(count_interval_time_pip).fit_transform(train_df)