In [None]:
import os
import sys
import logging
import re
import time

import pandas as pd
import numpy as np
import sklearn.metrics
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.preprocessing

import xgboost
import sklearn.svm
import sklearn.linear_model
import sklearn.ensemble
import sklearn.gaussian_process
import sklearn.kernel_ridge
import sklearn.tree

import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.offline

In [None]:
# Notebook-wide display and logging configuration.
plotly.offline.init_notebook_mode(connected=True)
# Disables pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 160

log = logging.getLogger(name=__name__)
log.setLevel(logging.INFO)
# Route warnings.warn(...) messages through the logging system.
logging.captureWarnings(True)

formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)

# logging.getLogger returns the same logger object on every call, so
# re-running this cell used to attach one more handler each time and every
# message was then printed once per run. Attach the handler only once.
if not log.handlers:
    log.addHandler(stream_handler)

log.info(f"Python version: {sys.version}")
log.info(f"Numpy version: {np.__version__}")
log.info(f"Pandas version: {pd.__version__}")
log.info(f"Scikit-learn version: {sklearn.__version__}")
log.info(f"Plotly version: {plotly.__version__}")

In [None]:
# Base names of the data directory and of the two CSV files read below.
data_bn = "data"
train_bn = "train.csv"
test_bn = "test.csv"

# `__name__` is only a throw-away first path component here: the first
# `os.pardir` cancels it when `abspath` normalizes the path, so the result
# is a `data` directory that is a sibling of the current working directory.
relative_data_dir = os.path.join(__name__, os.pardir, os.pardir, data_bn)
data_dir = os.path.abspath(relative_data_dir)

log.info(f"Data directory: {data_dir}")

train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)

df_train = pd.read_csv(train_fn)
df_test = pd.read_csv(test_fn)

log.info(f"Training data shape: {df_train.shape}")
log.info(f"Test data shape: {df_test.shape}")

# Number of rows in the training frame.
train_pts = df_train.shape[0]

In [None]:
# Preview the first five rows of the training frame.
df_train.head(5)

In [None]:
# Preview the first five rows of the test frame.
df_test.head(5)

In [None]:
# Name of the column that later cells read from df_train as the regression target.
y_col = "SalePrice"

In [None]:
# Log how many columns the training frame has, then display the first ten names.
log.info(f"Number of training dataset columns: {len(df_train.columns)}.")
df_train.columns[:10]

In [None]:
description_fn = os.path.join(data_dir, "data_description.txt")

# Keep the description file as a list of raw lines (line endings included);
# later cells rely on the line positions.
with open(description_fn, 'r') as f:
    desc = f.readlines()

In [None]:
# A line counts as a feature header when it starts with a word character
# (re.match anchors at the beginning of the line). Raw string: "\w" inside a
# normal string literal is an invalid escape sequence. The empty "(?:)" group
# of the previous pattern matched nothing and has been dropped.
feat_re = re.compile(r"\w+")

# (line index in `desc`, leading word of that line) for every matching line.
feat_search = [
    (line_no, found.group())
    for line_no, found in enumerate(map(feat_re.match, desc))
    if found
]

In [None]:
# First ten raw lines of the description file.
desc[:10]

In [None]:
# First ten (line index, feature name) pairs found in the description file.
feat_search[:10]

In [None]:
cat_feats = []
cont_feats = []

# A feature whose header is followed by more than STEP lines before the next
# header (or the end of the file) is treated as categorical; otherwise it is
# treated as continuous.
STEP = 2

# Line index at which each feature's block ends: the next feature's header
# line, or len(desc) for the last feature. Using this sentinel removes the
# separate copy of the branch for the last feature and makes an empty
# `feat_search` a no-op instead of an IndexError.
block_ends = [line_no for line_no, _ in feat_search[1:]] + [len(desc)]

for (line_no, feat_name), block_end in zip(feat_search, block_ends):
    if block_end - line_no > STEP:
        cat_feats.append(feat_name)
    else:
        cont_feats.append(feat_name)

In [None]:
# First ten feature names classified as categorical.
cat_feats[:10]

In [None]:
# First ten feature names classified as continuous.
cont_feats[:10]

In [None]:
# Swap two names in `cont_feats` for the longer names used instead; the
# replacement is appended at the end of the list. A name that is not present
# is logged and skipped.
for short_name, column_name in (
    ("Kitchen", "KitchenAbvGr"),
    ("Bedroom", "BedroomAbvGr"),
):
    try:
        cont_feats.remove(short_name)
    except ValueError as e:
        log.error(e)
    else:
        cont_feats.append(column_name)

In [None]:
# Placeholder label for missing, filtered-out and unseen values.
UNK = "UNK"
# Names of the three supported label-filtering criteria.
RANK = "rank"
NUMBER = "number"
FREQUENCY = "frequency"


class OrdinalEncoderExt(sklearn.preprocessing.OrdinalEncoder):
    """Ordinal encoder that folds missing, rare and unseen labels into ``UNK``.

    All values are compared as strings. During ``fit`` every column is
    filtered by at most one criterion (the first one given wins, in this
    order): ``top_n`` keeps the ``top_n`` most frequent labels,
    ``count_thresh`` keeps labels seen at least that many times and
    ``freq_thresh`` keeps labels whose share of the rows is at least that
    value. Empty strings and labels that do not pass the filter are replaced
    by ``UNK``, and ``UNK`` is always registered as a category so that
    ``transform`` can map labels it has never seen onto it.
    """

    def __init__(self,
                 top_n=None,
                 count_thresh=None,
                 freq_thresh=None,
                 categories="auto",
                 **kwargs,
    ):
        """Store the filter settings and forward the rest to OrdinalEncoder.

        Parameters
        ----------
        top_n : int or None
            Keep only the ``top_n`` most frequent labels of each column.
        count_thresh : int or None
            Keep only labels that occur at least ``count_thresh`` times.
        freq_thresh : float or None
            Keep only labels whose relative frequency is at least
            ``freq_thresh``.
        categories : "auto" or list
            Forwarded unchanged to ``OrdinalEncoder``.
        **kwargs
            Any other ``OrdinalEncoder`` keyword argument.

        Raises
        ------
        ValueError, TypeError
            If the selected threshold cannot be converted to a number.
        """
        super(OrdinalEncoderExt, self).__init__(
            categories=categories,
            **kwargs
        )
        # Keep the raw constructor arguments as attributes of the same name:
        # get_params() (used by repr, clone and parameter validation) reads
        # them back with getattr and fails if they are missing.
        self.top_n = top_n
        self.count_thresh = count_thresh
        self.freq_thresh = freq_thresh

        if top_n is not None:
            self.criterion = RANK
            self.criterion_val = self._coerce_threshold(top_n, int)
        elif count_thresh is not None:
            self.criterion = NUMBER
            self.criterion_val = self._coerce_threshold(count_thresh, int)
        elif freq_thresh is not None:
            self.criterion = FREQUENCY
            self.criterion_val = self._coerce_threshold(freq_thresh, float)
        else:
            self.criterion = ""
            self.criterion_val = None

    @staticmethod
    def _coerce_threshold(value, caster):
        """Convert a threshold with ``caster``; log and re-raise on failure.

        Swallowing the error would leave ``criterion_val`` unset and only
        surface later as an unrelated AttributeError inside ``fit``.
        """
        try:
            return caster(value)
        except (TypeError, ValueError) as e:
            log.error(e)
            raise

    @staticmethod
    def _as_str_array(X):
        """Return a fresh 2D unicode copy of X wide enough to hold ``UNK``."""
        X = np.asarray(X)
        assert (len(X.shape)==2), "Require 2D array"
        X = X.astype(str)
        # itemsize of a unicode dtype is 4 bytes per character.
        width = max(X.dtype.itemsize // 4, len(UNK))
        return X.astype(f"U{width}")

    def _keep_mask(self, labels, counts, n_rows):
        """Return a boolean mask over ``labels`` marking the ones to keep.

        ``labels`` and ``counts`` are the sorted unique values of one column
        and their occurrence counts; ``n_rows`` is the column length.
        """
        # The empty string stands for "missing" and is never a real label.
        is_label = labels != ""

        if self.criterion == RANK:
            keep = np.zeros(counts.shape, dtype=bool)
            # Push the missing marker to the end of the ranking.
            ranked_counts = np.where(is_label, counts, -1)
            # Stable sort: ties keep the sorted (alphabetical) label order.
            order = np.argsort(-ranked_counts, kind="stable")
            keep[order[:max(self.criterion_val, 0)]] = True
        elif self.criterion == NUMBER:
            keep = counts >= self.criterion_val
        elif self.criterion == FREQUENCY:
            keep = counts / n_rows >= self.criterion_val
        else:
            keep = np.ones(counts.shape, dtype=bool)

        return keep & is_label

    def fit(self, X, y=None):
        """Learn the categories of every column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Values are converted to strings before use.
        y : ignored
            Accepted for compatibility with the scikit-learn fit signature.

        Returns
        -------
        self
        """
        X = self._as_str_array(X)
        n_rows, n_cols = X.shape

        # One extra row filled with UNK guarantees that UNK becomes a
        # category of every column even if no training value maps to it.
        Y = np.full((n_rows + 1, n_cols), UNK, dtype=X.dtype)
        for j in range(n_cols):
            column = X[:, j]
            unique_elem, elem_locs, elem_counts = np.unique(
                column,
                return_inverse=True,
                return_counts=True,
            )
            keep = self._keep_mask(unique_elem, elem_counts, n_rows)
            # elem_locs maps each row to the index of its label in
            # unique_elem, so keep[elem_locs] is the per-row keep mask.
            Y[:n_rows, j] = np.where(keep[elem_locs], column, UNK)

        super(OrdinalEncoderExt, self).fit(Y)

        return self

    def transform(self, X):
        """Encode ``X`` as integer codes; unknown labels get the code of ``UNK``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Values are converted to strings; the caller's data is not
            modified.

        Returns
        -------
        numpy.ndarray of int, shape (n_samples, n_features)
        """
        X = self._as_str_array(X)
        for i in range(X.shape[1]):
            X[~np.isin(X[:, i], self.categories_[i]), i] = UNK

        return super(OrdinalEncoderExt, self).transform(X).astype(int)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the integer codes of ``X``."""
        return self.fit(X, y).transform(X)

In [None]:
# Categorical columns of both frames, with missing values replaced by the
# empty string.
train_cat_data = df_train[cat_feats].fillna("")
test_cat_data = df_test[cat_feats].fillna("")

enc = OrdinalEncoderExt()

In [None]:
# First five rows of the categorical training columns after filling missing values.
train_cat_data.head(5)

In [None]:
# First five rows of the categorical test columns after filling missing values.
test_cat_data.head(5)

In [None]:
# Fit the encoder on the training values only, then reuse it for the test
# values; both results keep the original column names.
train_cat_codes = enc.fit_transform(train_cat_data.values)
train_cat_vals = pd.DataFrame(train_cat_codes, columns=train_cat_data.columns)

test_cat_codes = enc.transform(test_cat_data.values)
test_cat_vals = pd.DataFrame(test_cat_codes, columns=test_cat_data.columns)

In [None]:
# First five rows of the integer-encoded training categories.
train_cat_vals.head(5)

In [None]:
# First five rows of the integer-encoded test categories.
test_cat_vals.head(5)

In [None]:
def _one_hot_from_codes(code_df, categories):
    """Expand integer category codes into 0/1 indicator columns.

    Parameters
    ----------
    code_df : pandas.DataFrame
        One column of integer codes per feature.
    categories : sequence of array-like
        ``categories[j][k]`` is the label that code ``k`` stands for in the
        j-th column of ``code_df``.

    Returns
    -------
    pandas.DataFrame
        One integer column named ``"<feature>_<label>"`` per (feature, label)
        pair, in feature order and then code order.
    """
    indicators = {}
    for feat, feat_categories in zip(code_df.columns, categories):
        codes = code_df[feat]
        for code, label in enumerate(feat_categories):
            # Vectorized comparison instead of a per-row Python lambda.
            indicators[f"{feat}_{label}"] = (codes == code).astype(int)
    # Building the frame in one go avoids inserting columns one at a time.
    return pd.DataFrame(indicators, index=code_df.index)


# Names of the indicator columns; not read again in this cell.
cols = [
    f"{train_cat_data.columns[i]}_{x}"
        for i, col in enumerate(enc.categories_) for x in col
]

# Both frames are expanded with the categories learned on the training data,
# so they end up with the same columns in the same order.
train_bin_enc = _one_hot_from_codes(train_cat_vals, enc.categories_)
test_bin_enc = _one_hot_from_codes(test_cat_vals, enc.categories_)

# Standardize the indicator columns with statistics from the training set.
cat_scl = sklearn.preprocessing.StandardScaler()

train_bin_enc = pd.DataFrame(
    cat_scl.fit_transform(train_bin_enc.values),
    columns=train_bin_enc.columns,
)

test_bin_enc = pd.DataFrame(
    cat_scl.transform(test_bin_enc.values),
    columns=test_bin_enc.columns,
)

In [None]:
# First five rows of the scaled indicator columns for the training set.
train_bin_enc.head(5)

In [None]:
# First five rows of the scaled indicator columns for the test set.
test_bin_enc.head(5)

In [None]:
# Continuous columns of both frames.
train_cont_data, test_cont_data = df_train[cont_feats], df_test[cont_feats]

# The scaler is fitted on the training values and reused for the test values.
scl = sklearn.preprocessing.StandardScaler()
train_cont_scaled = scl.fit_transform(train_cont_data.values)
test_cont_scaled = scl.transform(test_cont_data.values)

train_cont_vals = pd.DataFrame(train_cont_scaled, columns=train_cont_data.columns)
test_cont_vals = pd.DataFrame(test_cont_scaled, columns=train_cont_data.columns)

In [None]:
# First five rows of the unscaled continuous training columns.
train_cont_data.head(5)

In [None]:
# First five rows of the unscaled continuous test columns.
test_cont_data.head(5)

In [None]:
# First five rows of the scaled continuous training columns.
train_cont_vals.head(5)

In [None]:
# First five rows of the scaled continuous test columns.
test_cont_vals.head(5)

In [None]:
# Design matrix for training: scaled continuous columns joined with the
# scaled indicator columns; any remaining missing value becomes 0.
X = train_cont_vals.join(train_bin_enc).fillna(0)

log.info(f"Total number of independent features before projection: {X.shape[1]}")

# Same construction for the test set.
X_test = test_cont_vals.join(test_bin_enc).fillna(0)

In [None]:
# First five rows of the training design matrix.
X.head(5)

In [None]:
# First five rows of the test design matrix.
X_test.head(5)

In [None]:
import sklearn.decomposition 

# At most 150 components, but never more than the data can support: PCA
# rejects n_components larger than min(n_samples, n_features).
n_components = min(150, *X.shape)

# The randomized solver draws random numbers; a fixed seed makes the
# projection (and everything fitted on it) reproducible across runs. 42 is
# the seed already used for train_test_split in the evaluation cell.
pca = sklearn.decomposition.PCA(
    n_components=n_components,
    svd_solver='randomized',
    whiten=True,
    random_state=42,
).fit(X)

# Replace both design matrices by their projections. PCA is fitted on the
# training matrix only.
X = pd.DataFrame(pca.transform(X))
X_test = pd.DataFrame(pca.transform(X_test))

In [None]:
# First five rows of the training matrix after the PCA projection.
X.head(5)

In [None]:
# Target column as a one-column frame (more columns are added to it later).
Y_train = df_train[[y_col]]

# Histogram of the raw target, from 0 to its maximum in bins of 10000.
target_bins = {
    "start": 0,
    "end": Y_train[y_col].max(),
    "size": 10000,
}
hist = go.Histogram(x=Y_train[y_col].values, xbins=target_bins)

fig = go.Figure()
fig.add_trace(hist)

fig.update_layout(
    go.Layout(
        xaxis=dict(
            rangeslider=dict(visible=False),
        ),
    )
)
plotly.offline.iplot(fig)

In [None]:
log_y_col = "log_y"

# Natural logarithm of the target, stored next to the raw target.
Y_train[log_y_col] = np.log(Y_train[y_col])

# Histogram of the log target, from 0 to its maximum in bins of 0.10.
log_target_bins = {
    "start": 0,
    "end": Y_train[log_y_col].max(),
    "size": 0.10,
}
hist = go.Histogram(x=Y_train[log_y_col].values, xbins=log_target_bins)

fig = go.Figure()
fig.add_trace(hist)

fig.update_layout(
    go.Layout(
        xaxis=dict(
            rangeslider=dict(visible=False),
        ),
    )
)
plotly.offline.iplot(fig)

In [None]:
# Standardize the log target; `y_scl` is kept so predictions can be mapped
# back later.
y_scl = sklearn.preprocessing.StandardScaler()

log_y_frame = Y_train[[log_y_col]]
Y = pd.DataFrame(
    y_scl.fit_transform(log_y_frame.values),
    columns=log_y_frame.columns,
)

In [None]:
# First ten rows of the standardized log target.
Y.head(10)

In [None]:
# Features and standardized log target side by side, joined on the row index.
Z = X.join(Y)

# Pairwise correlation of all columns of Z, including the target column.
Z_corr = Z.corr()

In [None]:
# Scatter plot of the first two columns of Z against the standardized log
# target, each with a straight line through the origin whose slope is the
# correlation between that column and the target.
for col in Z.columns[:2]:
    if col in (y_col, log_y_col):
        continue

    cor_x = np.linspace(Z[col].min(), Z[col].max(), 3)
    s = Z_corr.at[log_y_col, col]

    fig = go.Figure()

    trace = go.Scatter(x=Z[col], y=Z[log_y_col], mode="markers", text=Z.index)
    trace_cor = go.Scatter(x=cor_x, y=s*cor_x, mode="lines")

    fig.add_trace(trace)
    fig.add_trace(trace_cor)

    # The plotted values are Z[log_y_col] (the standardized log target), not
    # the raw y_col values, so title and axis are labelled with log_y_col.
    fig.update_layout(
        width=1400,
        height=1200,
        title=dict(text=f"{col} vs. {log_y_col}"),
        xaxis_title=dict(text=col),
        yaxis_title=dict(text=log_y_col),
    )
    plotly.offline.iplot(fig)

In [None]:
# Candidate regressors. Every section below expands a small hyper-parameter
# grid with ParameterGrid and appends one unfitted estimator per combination.
# The position of an estimator in `models` matters: later cells label the
# models "model_NNN" by position and look them up again by that number.
# NOTE(review): several parameter values below (criterion "mse"/"mae",
# `base_estimator`) are spelled differently in some scikit-learn releases --
# verify them against the version logged at the top of the notebook.
models = [
    sklearn.linear_model.LinearRegression(),
]

# XGBoost regressors (None entries leave the library default in place).
xgb_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        #n_estimators=[100, 110, 150],
        #max_depth=[5, 7],
        learning_rate=[None, 1e-4, 1e-2],
        booster=["gbtree", "gblinear", "dart"],
        reg_alpha=[None, 1e-5, 1e-3],
        reg_lambda=[None, 1e-5, 1e-3],
    )
)
models.extend(
    [xgboost.XGBRegressor(**params) for params in xgb_param_grid]
)

# Linear support vector regressors.
lsvr_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        C=[1, 2, 0.5],
    )
)
models.extend(
    [sklearn.svm.LinearSVR(**params) for params in lsvr_param_grid]
)

# Kernel support vector regressors.
svr_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        C=[1, 2, 0.5],
        kernel=["linear", "poly", "rbf", "sigmoid"],
        gamma=["scale", 0.01],
    )
)
models.extend(
    [sklearn.svm.SVR(**params) for params in svr_param_grid]
)
        
# Kernel ridge regressors.
kernel_ridge_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        alpha=[1, 2, 1e-1],
        gamma=[None, 1, 0.1],
    )
)
models.extend(
    [sklearn.kernel_ridge.KernelRidge(**params) 
        for params in kernel_ridge_param_grid
    ]
)
        
# Elastic-net regressors with fixed alpha.
elastic_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        alpha=[2, 1, 0.5],
        l1_ratio=[0.5, 1, 0.1],
    )
)
models.extend(
    [sklearn.linear_model.ElasticNet(**params) 
        for params in elastic_param_grid
    ]
)

# Gaussian-process regressors, one per kernel (None = library default).
gauss_process_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        kernel=[
            None, 
            sklearn.gaussian_process.kernels.Matern(),
            sklearn.gaussian_process.kernels.DotProduct(),
            sklearn.gaussian_process.kernels.RationalQuadratic(),
        ],
    )
)
models.extend(
    [sklearn.gaussian_process.GaussianProcessRegressor(**params)
        for params in gauss_process_param_grid
    ]
)

# Elastic-net regressors with built-in cross-validation.
elastic_cv_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        l1_ratio=[0.5, 0.1, 0.7, 0.9, 0.95, 1],
    )
)
models.extend(
    [sklearn.linear_model.ElasticNetCV(**params) 
        for params in elastic_cv_param_grid
    ]
)

# Gradient-boosting regressors.
gradient_boosting_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        n_estimators=[100, 150],
        criterion=["friedman_mse", "mse", "mae"],
        max_depth=[3, 5],
        max_features=[
            #"auto",
            "sqrt", "log2"],
    )
)
models.extend(
    [sklearn.ensemble.GradientBoostingRegressor(**params)
        for params in gradient_boosting_param_grid
    ]
)

# Random-forest regressors.
random_forest_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        n_estimators=[100, 150],
        criterion=["mse", "mae"],
        max_depth=[None, 5],
        max_features=[
            #"auto",
            "sqrt", "log2"],
    )
)
models.extend(
    [sklearn.ensemble.RandomForestRegressor(**params)
        for params in random_forest_param_grid
    ]
)
    
# Ridge regressors.
ridge_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        alpha=[1, 2, 0.5],
    )
)
models.extend(
    [sklearn.linear_model.Ridge(**params)
         for params in ridge_param_grid
    ]
)

# ARD regressors (sklearn.linear_model.ARDRegression).
bayes_adr_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        alpha_1=[1e-6, 1e-5],
        alpha_2=[1e-6, 1e-5],
        lambda_1=[1e-6, 1e-5],
        lambda_2=[1e-6, 1e-5],
    )
)
models.extend(
    [sklearn.linear_model.ARDRegression(**params)
        for params in bayes_adr_param_grid
    ]
)

# Linear regressors trained with stochastic gradient descent.
sgd_linear_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        eta0=[0.01, 0.005],
        power_t=[0.25, 0.2],
    )
)
models.extend(
    [sklearn.linear_model.SGDRegressor(**params)
        for params in sgd_linear_param_grid
    ]
)

# AdaBoost regressors (None = library default base estimator).
ada_boost_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        base_estimator=[
            None,
            sklearn.tree.DecisionTreeRegressor(max_depth=4),
        ],
        loss=["linear", "square", "exponential"],
    )
)
models.extend(
    [sklearn.ensemble.AdaBoostRegressor(**params)
        for params in ada_boost_param_grid
    ]
)

# Bagging regressors.
bagging_param_grid = sklearn.model_selection.ParameterGrid(
    dict(
        n_estimators=[10, 20],
        max_features=[1.0, 0.2],
        bootstrap=[True, False],
    )
)
models.extend(
    [sklearn.ensemble.BaggingRegressor(**params)
        for params in bagging_param_grid
    ]
)

In [None]:
import warnings
# Silences every warning raised from here on, for the rest of the session.
warnings.filterwarnings("ignore")

# One label per model; the number is the model's position in `models`.
pred_cols = [f"model_{i:03d}" for i in range(len(models))]


def f(x):
    """Map standardized log values back to the original target scale.

    ``x`` may be a Series, a 1D array or an (n, 1) array. It is reshaped to
    one column before calling ``y_scl.inverse_transform`` because the scaler
    works on 2D input; passing 1D data made the call fail, which the loop
    below then recorded as NaN. The result is always a 1D array.
    """
    scaled = np.asarray(x, dtype=float).reshape(-1, 1)
    return np.exp(y_scl.inverse_transform(scaled)).ravel()


# Score columns: R2 on the standardized log target, then R2 on the original
# scale.
kf_cols = ["k0_scl", "k0"]

# The split does not depend on the model (fixed random_state), so it is
# computed once instead of once per model.
X_train_split, X_test_split, y_train_split, y_test_split = (
    sklearn.model_selection.train_test_split(
        X, Y, test_size=0.2, random_state=42
    )
)
y_holdout = y_test_split[log_y_col]

scores = []

for model in models:
    t1 = time.perf_counter()

    model.fit(X_train_split, y_train_split)

    # Score on the held-out 20% only. Scoring on all rows would include the
    # rows the model was just fitted on.
    pred = model.predict(X_test_split)
    r2 = sklearn.metrics.r2_score(y_holdout, pred)
    try:
        r2t = sklearn.metrics.r2_score(f(y_holdout), f(pred))
    except ValueError as e:
        # r2_score rejects non-finite input (exp in `f` can overflow).
        log.error(e)
        r2t = np.nan

    t2 = time.perf_counter()
    scores.append([r2, r2t, t2-t1])
    log.info(f"{str(model)[:15]} -- time elapsed: {t2-t1:5.3f}")

In [None]:
# One row per model: the two scores and the elapsed time from the evaluation
# loop, plus the estimator's text form and its parameter dict.
score_columns = kf_cols + ["time"]
cv_df = pd.DataFrame(scores, index=pred_cols, columns=score_columns)

cv_df["name"] = list(map(str, models))
cv_df['params'] = [estimator.get_params() for estimator in models]

In [None]:
# First ten rows of the score table, in model order.
cv_df.head(10)

In [None]:
# Twenty rows with the highest `k0_scl` score, best first.
cv_df.sort_values(by=['k0_scl'], ascending=False).head(20)

In [None]:
# Reference values for the test set, read from the data directory, with
# their natural logarithm stored alongside.
true_labels_fn = os.path.join(data_dir, "true_submission.csv")
true_labels = pd.read_csv(true_labels_fn)

true_labels[log_y_col] = np.log(true_labels["SalePrice"])

In [None]:
# Pick one fitted model by its position in the score table sorted by `k0`
# (highest first).
# NOTE(review): 128 is a hard-coded position with no explanation in this
# notebook -- confirm that this is the intended model.
model_name = cv_df.sort_values(by=['k0'], ascending=False).index[128]
# The last three characters of the "model_NNN" label are the position of the
# model in `models`.
model = models[int(model_name[-3:])]

In [None]:
# Predictions of the selected model for the test set, mapped back with `f`
# and flattened to one value per row.
test_pred = f(model.predict(X_test)).reshape((-1))
results = pd.DataFrame({"Id": df_test["Id"], "SalePrice": test_pred})

In [None]:
# Path the predictions would be written to; the write itself is disabled.
results_fn = os.path.join(data_dir, "results.csv")
#results.to_csv(results_fn, index=False)

In [None]:
# Root mean squared error between the logs of the reference values and the
# logs of the predictions.
log_pred = np.log(results[y_col])
log_mse = sklearn.metrics.mean_squared_error(true_labels[log_y_col], log_pred)
val = np.sqrt(log_mse)
log.info(f"RMSE of log: {val}")

In [None]:
# Log the test-set RMSE (on the log scale) of up to 400 models, ordered by
# their `k0` score (highest first).
# The table is sorted once, outside the loop, and sliced: indexing positions
# 0..399 one by one raises IndexError when there are fewer than 400 models.
ranked_model_names = cv_df.sort_values(by=['k0'], ascending=False).index

for i, model_name in enumerate(ranked_model_names[:400]):
    # Labels look like "model_NNN"; take everything after the underscore so
    # the lookup also works when the number has more than three digits.
    model = models[int(model_name.split("_")[-1])]
    results = pd.DataFrame({"Id": df_test["Id"], "SalePrice": f(model.predict(X_test)).reshape((-1))})
    val = np.sqrt(
        sklearn.metrics.mean_squared_error(true_labels[log_y_col], np.log(results[y_col]))
    )
    log.info(f"{i} RMSE of log: {val}")

In [None]:
# Display the most recently built predictions frame.
results