In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVR

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.metrics import mean_squared_error

import functions as f
import classes as c

In [None]:
# Load the train and test CSVs and index both frames by their "Id" column.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
train_df = pd.read_csv("datasets/train.csv", na_values=np.nan).set_index("Id")
test_df = pd.read_csv("datasets/test.csv", na_values=np.nan).set_index("Id")

# Restrict to numeric/bool columns explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises on strings.
corr_matrix = train_df.select_dtypes(include=["number", "bool"]).corr()

# Features are every column except the target; the target is SalePrice.
X = train_df.drop("SalePrice", axis=1)
y = train_df["SalePrice"].copy()

# Hold out 33% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [3]:
# Columns with missing values in each data set, as reported by the project
# helper `f.columns_with_missing_values`.
train_incomplete_col = f.columns_with_missing_values(train_df)
test_incomplete_col = f.columns_with_missing_values(test_df)

# Pair of arrays: (incomplete only in train, incomplete only in test).
incomplete_col_diff = (
    np.setdiff1d(train_incomplete_col, test_incomplete_col),
    np.setdiff1d(test_incomplete_col, train_incomplete_col),
)

# Placeholder for columns to discard; nothing is dropped yet.
drop_col = []

# ---- Feature groups by measurement type ---------------------------------
nominal_col = [
    "MSSubClass", "MSZoning", "LandContour", "LotConfig", "Neighborhood",
    "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle",
    "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation",
    "Heating", "Electrical", "GarageType", "Fence", "MiscFeature",
    "SaleType", "SaleCondition",
]
binary_col = ["Street", "CentralAir"]
ordinal_col = [
    "Alley", "LotShape", "Utilities", "LandSlope", "OverallQual",
    "OverallCond", "ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
    "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC",
    "KitchenQual", "Functional", "FireplaceQu", "GarageFinish",
    "GarageQual", "GarageCond", "PavedDrive", "PoolQC",
]
count_col = [
    "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
]
interval_col = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]
time_col = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "MoSold", "YrSold"]

# ---- Columns needing special handling -----------------------------------
# Numerically coded column that is also listed in nominal_col above.
cat_num_col = ["MSSubClass"]
# Columns singled out for missing-value treatment.
miss_vall_col = [
    "LotFrontage", "MasVnrArea", "MasVnrType", "Electrical", "GarageYrBlt",
]
# NOTE(review): presumably columns where NA is itself a category ("feature
# absent") rather than an unknown value -- confirm against the data dictionary.
na_cat_col = [
    "Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1",
    "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish",
    "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature",
]

# Category values per column group, collected from the training data by the
# project helper `f.unique_values`.
nominal_col_cats = f.unique_values(train_df[nominal_col])
binary_col_cats = f.unique_values(train_df[binary_col])
ordinal_col_cats = f.unique_values(train_df[ordinal_col])

# TODO: settle the encoder groupings; an earlier sketch proposed
# label_enc_col = binary_col + ordinal_col, with one_hot_col left unfinished.

In [11]:
# Fit a dense (non-sparse) one-hot encoder on the MSSubClass column alone,
# letting the encoder infer the categories from the data.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; this
# call targets the older keyword.
ohe = OneHotEncoder(categories="auto" ,sparse=False).fit(train_df[["MSSubClass"]])

In [12]:
# Shape of the one-hot encoded MSSubClass matrix: (rows, encoded columns).
ohe.transform(train_df[["MSSubClass"]]).shape

(1460, 15)

In [None]:
# Fit the project's NumToCat transformer for MSSubClass.
# NOTE(review): presumably converts the numeric codes to a categorical
# representation -- confirm in classes.py.
num_to_cat = c.NumToCat(columns=["MSSubClass"]).fit(train_df)

In [None]:
# Histogram of MasVnrArea restricted to rows whose MasVnrType is "BrkCmn".
train_df.loc[train_df["MasVnrType"] == "BrkCmn", "MasVnrArea"].hist()

In [None]:
# Correlation of each column in corr_matrix with MasVnrArea, largest first.
corr_matrix["MasVnrArea"].sort_values(ascending=False)

In [None]:
# Fit the project's ConstantImputer on the NA-as-category columns.
# NOTE(review): `inplace=True` presumably makes transform() modify the frame
# it is given, which would mutate train_df across cells -- confirm in
# classes.py.
const_imputer = c.ConstantImputer(columns=na_cat_col, inplace=True).fit(train_df)

In [None]:
# Apply the fitted constant imputer to the training frame and display the
# result.
# NOTE(review): the imputer was built with inplace=True, so this call may
# also alter train_df itself -- verify before relying on train_df afterwards.
const_imputer.transform(train_df)

In [None]:
# Fit the project's MasVnrImputer on the training frame, then display the
# transformed frame.
# NOTE(review): presumably fills the missing MasVnrType / MasVnrArea values --
# confirm in classes.py.
mas_vnr_imputer = c.MasVnrImputer().fit(train_df)
mas_vnr_imputer.transform(train_df)

In [None]:
from sklearn.linear_model import LogisticRegressionCV

In [None]:
model = SVR(kernel="linear")

In [None]:
mse_train = mean_squared_error(y_train, y_train_pr)

In [None]:
train_df = train_df[["LotArea", "GarageArea", "GarageCars", "LotFrontage"]].copy()
train_df.loc[:, "LotAreaSqrt"] = np.sqrt(train_df["LotArea"].values)
train_df.loc[:, "GarageAreaSqrt"] = np.sqrt(train_df["GarageArea"].values)
train_df.dropna(axis=0, inplace=True)
X = train_df.drop("LotFrontage", axis=1).copy()
y = train_df["LotFrontage"].copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
model.fit(X_train, y_train)

In [None]:
y_train_pr = model.predict(X_train)
y_test_pr = model.predict(X_test)
mse_train = mean_squared_error(y_train, y_train_pr)
mse_test = mean_squared_error(y_test, y_test_pr)

In [None]:
y_test_pr[:10], y_test.values[:10]

In [None]:
corr_matrix["LotFrontage"].sort_values(ascending=False)

In [None]:
plt.scatter(x=train_df["LotFrontage"], y=np.sqrt(train_df["LotArea"]), alpha=0.4)

In [None]:
np.correlate(train_df["LotFrontage"].values, np.sqrt(train_df["LotArea"].values))

In [None]:
# Numeric branch: select the interval-scaled columns and standardise them.
# NOTE(review): despite the name, count_col and time_col are not selected
# here -- confirm whether that is intentional.
count_interval_time_pip = Pipeline(
    steps=[
        ("selector", c.DataFrameSelector(columns=interval_col)),
        ("standard", StandardScaler()),
    ]
)

# Nominal branch: one-hot encode using the category lists collected from the
# training data; categories unseen at fit time are ignored at transform time.
nominal_pip = Pipeline(
    steps=[
        ("selector", c.DataFrameSelector(columns=nominal_col)),
        (
            "one_hot_encoder",
            OneHotEncoder(
                categories=nominal_col_cats,
                handle_unknown="ignore",
                sparse=False,
            ),
        ),
    ]
)

# Binary + ordinal branch: also one-hot encoded, with the two category lists
# concatenated in the same order as the selected columns.
binary_ordinal_pip = Pipeline(
    steps=[
        ("selector", c.DataFrameSelector(columns=binary_col+ordinal_col)),
        (
            "one_hot_encoder",
            OneHotEncoder(
                categories=binary_col_cats+ordinal_col_cats,
                handle_unknown="ignore",
                sparse=False,
            ),
        ),
    ]
)

# Missing-value stage: runs on all columns of X and chains the project's
# imputers in a fixed order before any encoding or scaling happens.
missing_vallue_pip = Pipeline(
    steps=[
        ("selector", c.DataFrameSelector(columns=X.columns)),
        ("na_cat_imputer", c.NaCatImputer(columns=na_cat_col)),
        ("mas_vnr_imputer", c.MasVnrImputer()),
        (
            "mean_imputer",
            c.DataFrameImputer(strategy="mean", columns=["LotFrontage"]),
        ),
        (
            "most_frequent_imputer",
            c.DataFrameImputer(strategy="most_frequent", columns=["Electrical"]),
        ),
        (
            "const_imputer",
            c.ConstantImputer(columns=nominal_col+binary_col+ordinal_col),
        ),
    ]
)


In [None]:
# End-to-end preprocessing: impute missing values first, then run the three
# feature branches side by side and concatenate their outputs column-wise.
full_pip = Pipeline(
    steps=[
        ("missing_vallue", missing_vallue_pip),
        (
            "feature_union",
            FeatureUnion(
                transformer_list=[
                    ("count_interval_time", count_interval_time_pip),
                    ("nominal", nominal_pip),
                    ("binary_ordinal_pip", binary_ordinal_pip),
                ]
            ),
        ),
    ]
)

In [None]:
# Fit the preprocessing pipeline on the training split only, then reuse the
# fitted pipeline on the held-out split so no test information leaks into
# the fitted imputers/scaler.
X_train_tr = full_pip.fit_transform(X_train)
X_test_tr = full_pip.transform(X_test)

In [None]:
# Train a linear-kernel support vector regressor on the preprocessed
# training features.
model = SVR(kernel="linear")
model.fit(X_train_tr, y_train)

In [None]:
# Predict on both splits so training and held-out error can be compared.
y_train_pr = model.predict(X_train_tr)
y_test_pr = model.predict(X_test_tr)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Mean squared error of the model on the training split.
mse_train = mean_squared_error(y_train, y_train_pr)

In [None]:
# Mean squared error of the model on the held-out split.
mse_test = mean_squared_error(y_test, y_test_pr)