In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVR

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.metrics import mean_squared_error

import functions as f
import classes as c

In [2]:
# Load the labelled training table and the test table, both indexed by "Id".
# `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed (AttributeError).
train_df = pd.read_csv("datasets/train.csv", na_values=np.nan).set_index("Id")
test_df = pd.read_csv("datasets/test.csv", na_values=np.nan).set_index("Id")

# Called for its in-place effect on train_df (inplace=True); the return value is discarded.
# NOTE(review): what the cleaner treats as "conflicting" is defined in classes.py — confirm there.
c.ConflictingDataCleaner(inplace=True).fit_transform(train_df)

# Correlate numeric columns only: pandas >= 2.0 no longer drops string columns
# silently in DataFrame.corr() and raises instead. select_dtypes works on old
# and new pandas alike.
corr_matrix = train_df.select_dtypes(include="number").corr()

# Features are every column except the target; copies keep train_df untouched by later steps.
X = train_df.drop("SalePrice", axis=1).copy()
y = train_df["SalePrice"].copy()

# Fixed seed so the 67/33 split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
# Columns flagged by f.columns_with_missing_values in each table, plus the
# names that are flagged in one table but not the other (train-only, test-only).
train_incomplete_col = f.columns_with_missing_values(train_df)
test_incomplete_col = f.columns_with_missing_values(test_df)
incomplete_col_diff = (
    np.setdiff1d(train_incomplete_col, test_incomplete_col),
    np.setdiff1d(test_incomplete_col, train_incomplete_col),
)

drop_col = []

# --- Column groups, named after the kind of variable they hold ---------------
nominal_col = [
    "MSSubClass", "MSZoning", "LandContour", "LotConfig", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle",
    "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation",
    "Heating", "Electrical", "GarageType", "Fence", "MiscFeature",
    "SaleType", "SaleCondition",
]
binary_col = ["Street", "CentralAir"]
ordinal_col = [
    "Alley", "LotShape", "Utilities", "LandSlope", "OverallQual",
    "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
    "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC",
    "KitchenQual", "Functional", "FireplaceQu", "GarageFinish",
    "GarageQual", "GarageCond", "PavedDrive", "PoolQC",
]
count_col = [
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
]
interval_col = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]
time_col = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "MoSold", "YrSold"]

# --- Bookkeeping lists --------------------------------------------------------
cat_num_col = ["MSSubClass"]
miss_vall_col = [
    "LotFrontage", "MasVnrArea", "MasVnrType", "Electrical", "GarageYrBlt",
]
test_miss_val_col = [
    "MSZoning", "LotFrontage", "Utilities", "Exterior1st", "Exterior2nd",
    "MasVnrType", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
    "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "KitchenQual",
    "Functional", "GarageYrBlt", "GarageCars", "GarageArea", "SaleType",
]
# presumably categorical columns where a missing value is itself a category — TODO confirm
na_cat_col = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
    "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish",
    "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature",
]

# Column names that are not part of the groups above; presumably derived
# features added during preprocessing — verify against classes.py.
new_futures = ["LotAreaSqrt", "GarageAreaSqrt", "AgeSold", "RemodAge"]

# Per-group values returned by f.unique_values, taken from the training table.
nominal_col_cats = f.unique_values(train_df[nominal_col])
binary_col_cats = f.unique_values(train_df[binary_col])
ordinal_col_cats = f.unique_values(train_df[ordinal_col])


# Every grouped input column, in group order.
all_initial_futures = [
    *nominal_col,
    *binary_col,
    *ordinal_col,
    *count_col,
    *interval_col,
    *time_col,
]
# one_hot_col = nominal_col + 
# label_enc_col = binary_col + ordinal_col
# Column names kept by the final "selector" step of full_pip.
# Entries of the form "<column>_<value>" (e.g. 'BsmtQual_Ex') are presumably the
# indicator columns produced by c.DataFrameOneHotEncoder — TODO confirm its naming scheme.
# NOTE(review): how this list (and its order) was derived is not visible in this notebook.
important_features = ['GrLivArea', 'BsmtQual_Ex', 'OverallQual_9', 'KitchenQual_Ex',
       'ExterQual_Ex', 'OverallCond_3', '2ndFlrSF',
       'Neighborhood_NoRidge', 'Functional_Typ', 'OverallCond_8',
       'OverallQual_8', 'TotalBsmtSF', 'Neighborhood_Crawfor',
       'Neighborhood_Edwards', '1stFlrSF', 'Neighborhood_NridgHt',
       'OverallCond_9', 'KitchenAbvGr', 'SaleCondition_Abnorml',
       'Neighborhood_StoneBr', 'BsmtFinSF1', 'Neighborhood_Timber',
       'OverallCond_7', 'Neighborhood_Mitchel', 'LotAreaSqrt',
       'Condition1_Artery', 'OverallQual_3', 'OverallCond_4',
       'Exterior1st_BrkFace', 'BsmtQual_TA', 'BsmtExposure_Gd',
       'BsmtQual_Gd', 'BldgType_1Fam', 'Functional_Maj1', 'OverallQual_5',
       'Condition1_Norm', 'GarageArea', 'OverallQual_6', 'OverallCond_5',
       'ExterQual_TA', 'BsmtFinType1_GLQ', 'MSZoning_C (all)',
       'BsmtHalfBath', 'Neighborhood_Somerst', 'OverallQual_10',
       'Neighborhood_Gilbert', 'OverallQual_4', 'BedroomAbvGr',
       'YearBuilt', 'AgeSold', 'KitchenQual_TA', 'Neighborhood_MeadowV',
       'Neighborhood_NPkVill', 'Condition1_RRAe', 'YearRemodAdd',
       'MasVnrArea', 'Neighborhood_CollgCr', 'GarageCars',
       'MasVnrType_BrkCmn', 'MSZoning_RL', 'SaleCondition_Partial',
       'Fireplaces', 'RoofStyle_Gable', 'MasVnrType_Stone',
       'LandContour_Low', 'SaleCondition_Alloca', 'KitchenQual_Fa',
       'ExterQual_Gd', 'BsmtExposure_No', 'GarageCond_Fa',
       'Condition2_Norm', 'FireplaceQu_Ex', 'LotConfig_CulDSac',
       'KitchenQual_Gd', 'LandSlope_Sev', 'MSSubClass_60',
       'BsmtFinType1_Rec', 'SaleType_New', 'PoolArea', 'MSSubClass_70',
       'Foundation_PConc', 'HouseStyle_1.5Unf', 'GarageQual_Ex',
       'GarageType_2Types', 'Neighborhood_Blmngtn', 'Neighborhood_NAmes',
       'Neighborhood_BrkSide', 'RoofStyle_Mansard', 'GarageAreaSqrt',
       'LotConfig_FR2', 'GarageType_BuiltIn', 'Exterior2nd_Brk Cmn',
       'Neighborhood_SWISU', 'Fence_MnPrv', 'SaleType_COD',
       'LandSlope_Gtl', 'BsmtFinType2_BLQ', 'MSZoning_FV',
       'Exterior1st_AsbShng', 'Neighborhood_OldTown',
       'Neighborhood_SawyerW', 'Neighborhood_NWAmes',
       'Exterior1st_Plywood', 'Exterior1st_Stucco', 'GarageQual_Fa',
       'MSSubClass_45', 'LandContour_HLS', 'BldgType_Duplex',
       'MSSubClass_90', 'BsmtCond_Fa', 'GarageCond_TA', 'LandContour_Bnk',
       'Functional_Min2', 'ExterQual_Fa', 'GarageQual_Gd',
       'BsmtFinType1_LwQ', 'BsmtExposure_Av', 'MSSubClass_40',
       'Exterior2nd_AsbShng', 'MSZoning_RM', 'GarageYrBlt',
       'HeatingQC_Ex', 'GarageQual_TA', 'SaleType_WD', 'RemodAge',
       'MSSubClass_160', 'Functional_Mod', 'RoofMatl_CompShg',
       'Exterior2nd_HdBoard', 'MSSubClass_190', 'Neighborhood_ClearCr',
       'SaleType_Con', 'Neighborhood_BrDale', 'Condition2_PosN',
       'OverallQual_1', 'RoofMatl_WdShngl', 'GarageCond_Ex',
       'Functional_Min1', 'MSSubClass_30', 'BldgType_TwnhsE',
       'Exterior2nd_Wd Sdng', 'Foundation_Slab', 'Condition1_PosN',
       'FireplaceQu_Fa', 'FireplaceQu_Po', 'RoofStyle_Hip',
       'Heating_Wall', 'ExterCond_Ex', 'Neighborhood_Veenker',
       'Condition2_Feedr']

In [4]:
# The first three pipelines are stand-alone building blocks; full_pip below does
# not reference them.

# Selects interval_col + new_futures and standard-scales them.
# NOTE(review): despite the name, count_col and time_col are not selected here.
count_interval_time_pip = Pipeline(steps=[
    ("selector", c.DataFrameSelector(columns=interval_col + new_futures)),
    ("standard", StandardScaler()),
])

# Selects the nominal columns and one-hot encodes them against the category
# values collected from the training table.
nominal_pip = Pipeline(steps=[
    ("selector", c.DataFrameSelector(columns=nominal_col)),
    (
        "one_hot_encoder",
        c.DataFrameOneHotEncoder(
            columns=nominal_col,
            handle_unknown="ignore",
            categories=nominal_col_cats,
            sparse=False,
        ),
    ),
])

# Same treatment for the binary and ordinal columns.
binary_ordinal_pip = Pipeline(steps=[
    ("selector", c.DataFrameSelector(columns=binary_col + ordinal_col)),
    (
        "one_hot_encoder",
        c.DataFrameOneHotEncoder(
            columns=binary_col + ordinal_col,
            handle_unknown="ignore",
            categories=binary_col_cats + ordinal_col_cats,
            sparse=False,
        ),
    ),
])

# Chain of imputers. The selector is the only step created with inplace=False;
# every later step is created with inplace=True.
# NOTE(review): the imputer classes live in classes.py; their exact fill rules
# are not visible from this notebook.
missing_vallue_pip = Pipeline(steps=[
    (
        "selector",
        c.DataFrameSelector(columns=all_initial_futures, inplace=False),
    ),
    (
        "most_frequent_imputer",
        c.DataFrameImputer(
            strategy="most_frequent",
            columns=[
                "MSZoning", "Utilities", "Exterior1st", "Exterior2nd",
                "KitchenQual", "GarageCars", "SaleType",
            ],
            inplace=True,
        ),
    ),
    (
        "mean_imputer",
        c.DataFrameImputer(
            strategy="mean",
            columns=[
                "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
                "BsmtFullBath", "BsmtHalfBath", "GarageArea",
            ],
            inplace=True,
        ),
    ),
    ("new_features_imputer", c.NewFeaturesImputer(inplace=True)),
    (
        "mas_vnr_imputer",
        c.ConstantImputer(
            columns=["MasVnrArea", "MasVnrType"],
            string_fill_val="None",
            inplace=True,
        ),
    ),
    (
        "electrical_imputer",
        c.DataFrameImputer(
            strategy="most_frequent",
            columns=["Electrical"],
            inplace=True,
        ),
    ),
    ("garage_yr_imputer", c.GarageYrImputer(inplace=True)),
    ("lot_frontage_imputer", c.LotFrontageImputer(inplace=True)),
    # Final catch-all over every input column plus the new_futures columns.
    (
        "all_constant_imputer",
        c.ConstantImputer(
            columns=all_initial_futures + new_futures,
            inplace=True,
        ),
    ),
])

# End-to-end preprocessing, in order: imputation, one-hot encoding of all
# categorical groups, DataFrameLogScaler on the count columns, DataFrameScaler
# on interval/time/new columns, then selection of important_features.
full_pip = Pipeline(steps=[
    ("missing_vallue", missing_vallue_pip),
    (
        "one_hot_encoder",
        c.DataFrameOneHotEncoder(
            columns=nominal_col + binary_col + ordinal_col,
            handle_unknown="ignore",
            categories=nominal_col_cats + binary_col_cats + ordinal_col_cats,
            sparse=False,
        ),
    ),
    ("log_scaler", c.DataFrameLogScaler(columns=count_col)),
    ("scaler", c.DataFrameScaler(columns=interval_col + time_col + new_futures)),
    ("selector", c.DataFrameSelector(columns=important_features, inplace=True)),
])


In [5]:
# Two sub-grids: an RBF kernel searched over gamma x C (6 x 4 combinations) and
# a linear kernel searched over C only (4 combinations).
tuned_parameters = [
    dict(
        kernel=["rbf"],
        gamma=[1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
        C=[1, 10, 100, 1000],
    ),
    dict(
        kernel=["linear"],
        C=[1, 10, 100, 1000],
    ),
]

# 5-fold grid search over SVR, scored by negated mean squared error.
clf = GridSearchCV(
    estimator=SVR(),
    param_grid=tuned_parameters,
    cv=5,
    scoring="neg_mean_squared_error",
    verbose=1,
)

In [8]:
# Fit the preprocessing pipeline on all labelled rows (X) and transform them,
# then apply the already-fitted pipeline to test_df.
# NOTE(review): X_test_tr is built from test_df (datasets/test.csv), not from the
# X_test hold-out produced by train_test_split.
X_train_tr = full_pip.fit_transform(X)
X_test_tr = full_pip.transform(test_df)

  return self.partial_fit(X, y)
  scaled_matrix = self.scaler.transform(X[self.columns])
  scaled_matrix = self.scaler.transform(X[self.columns])


In [10]:
# Re-run preprocessing: fit on every labelled row, then reuse the fitted
# pipeline for test_df.
X_train_tr = full_pip.fit_transform(X)
X_test_tr = full_pip.transform(test_df)

# Run the grid search on the transformed features and report the best grid point.
clf.fit(X_train_tr, y)
print(
    "Best parameters set found on development set:",
    clf.best_params_,
    sep="\n",
)

  return self.partial_fit(X, y)
  scaled_matrix = self.scaler.transform(X[self.columns])
  scaled_matrix = self.scaler.transform(X[self.columns])
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=1)]: Done 140 out of 140 | elapsed:  3.0min finished


Best parameters set found on development set:
{'C': 1000, 'kernel': 'linear'}


In [11]:
# Predict with the grid search's best estimator on both transformed tables.
model = clf.best_estimator_
y_train_pr = model.predict(X_train_tr)
y_test_pr = model.predict(X_test_tr)

# In-sample error only: y_train_pr was predicted on the same rows (X, y) the
# search was fitted on.
mse_train = mean_squared_error(y_true=y, y_pred=y_train_pr)

# A test MSE cannot be computed here: y_test_pr belongs to test_df, while y_test
# comes from the train_test_split hold-out, so the rows do not correspond.
# mse_test = mean_squared_error(y_test, y_test_pr)

# Training RMSE, shown as the cell output.
np.sqrt(mse_train)  # , np.sqrt(mse_test)

30044.35199491915

In [None]:
# Diagnostic: show what f.columns_with_missing_values reports for test_df.
# The value is only displayed as the cell output, not assigned to a name.
f.columns_with_missing_values(test_df)

In [15]:
# One-column "SalePrice" frame of test-set predictions, keyed by test_df's index.
# assumes y_test_pr is a 1-D array with one value per test_df row — TODO confirm
results = pd.Series(y_test_pr, index=test_df.index, name="SalePrice").to_frame()

In [16]:
# Write the predictions, index column included, to the working directory.
results.to_csv(path_or_buf="results_v1.csv", index=True)