In [1]:
import os
import sys
import logging
import re
import time

import pandas as pd
import numpy as np
import sklearn.metrics
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.preprocessing

import xgboost
import sklearn.svm
import sklearn.linear_model
import sklearn.ensemble
import sklearn.gaussian_process
import sklearn.kernel_ridge
import sklearn.tree
import tensorflow as tf

import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.offline

In [2]:
# Notebook-wide display options.
plotly.offline.init_notebook_mode(connected=True)
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 160

# Module logger that prints INFO and above to the notebook output.
log = logging.getLogger(name=__name__)
log.setLevel(logging.INFO)
logging.captureWarnings(True)
formatter = logging.Formatter(
    '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(formatter)

# logging.getLogger returns the same logger object on every call, so
# re-running this cell must not attach a second handler (every message would
# then be printed twice).
if not log.handlers:
    log.addHandler(stream_handler)

log.info(f"Python version: {sys.version}")
log.info(f"Numpy version: {np.__version__}")
log.info(f"Pandas version: {pd.__version__}")
log.info(f"Scikit-learn version: {sklearn.__version__}")
log.info(f"Plotly version: {plotly.__version__}")

2020-11-30 16:17:04,131 - __main__ - INFO - Python version: 3.8.5 (default, Jul 28 2020, 12:59:40) 
[GCC 9.3.0]
2020-11-30 16:17:04,134 - __main__ - INFO - Numpy version: 1.19.4
2020-11-30 16:17:04,136 - __main__ - INFO - Pandas version: 1.1.4
2020-11-30 16:17:04,138 - __main__ - INFO - Scikit-learn version: 0.23.2
2020-11-30 16:17:04,147 - __main__ - INFO - Plotly version: 4.13.0


In [3]:
data_bn = "data"
# The data folder is a sibling of the directory the notebook is run from.
# The earlier spelling joined ``__name__`` with two ``os.pardir`` entries; the
# module name only served as a dummy path component that the first
# ``os.pardir`` cancelled out, so this resolves to the same location.
data_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, data_bn)
)

log.info(f"Data directory: {data_dir}")

train_bn = "train.csv"
test_bn = "test.csv"
train_fn = os.path.join(data_dir, train_bn)
test_fn = os.path.join(data_dir, test_bn)

df_train = pd.read_csv(train_fn)
df_test = pd.read_csv(test_fn)

log.info(f"Training data shape: {df_train.shape}")
log.info(f"Test data shape: {df_test.shape}")

# Number of training rows.
train_pts = df_train.shape[0]

2020-11-30 16:17:04,162 - __main__ - INFO - Data directory: /home/jamescorbin/github_projects/kaggle/house_prices_regression/data
2020-11-30 16:17:04,265 - __main__ - INFO - Training data shape: (1460, 81)
2020-11-30 16:17:04,266 - __main__ - INFO - Test data shape: (1459, 80)


In [4]:
df_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [5]:
df_test.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,,Attchd,1961.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,1958,1958,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,,Attchd,1958.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,1997,1998,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,1997.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,1998,1998,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,1998.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,1992,1992,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,,Attchd,1992.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,,,,0,1,2010,WD,Normal


In [6]:
y_col = "SalePrice"

In [7]:
log.info(f"Number of training dataset columns: {len(df_train.columns)}.")
df_train.columns[:10]

2020-11-30 16:17:04,554 - __main__ - INFO - Number of training dataset columns: 81.


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities'],
      dtype='object')

In [8]:
description_fn = os.path.join(data_dir, "data_description.txt")

# Keep the description as a list of raw lines (trailing newlines included).
with open(description_fn, 'r') as f:
    desc = f.readlines()

In [9]:
# A feature header is a run of word characters at the very start of a line;
# the value rows shown in the description are indented, so ``match`` (which
# anchors at position 0) skips them. Raw string: "\w" in a plain string
# literal is an invalid escape sequence.
feat_re = re.compile(r"\w+")

# (line index, header word) for every line of ``desc`` that starts a feature.
feat_search = [
    (line_no, header.group())
    for line_no, header in enumerate(map(feat_re.match, desc))
    if header
]

In [10]:
desc[:10]

['MSSubClass: Identifies the type of dwelling involved in the sale.\t\n',
 '\n',
 '        20\t1-STORY 1946 & NEWER ALL STYLES\n',
 '        30\t1-STORY 1945 & OLDER\n',
 '        40\t1-STORY W/FINISHED ATTIC ALL AGES\n',
 '        45\t1-1/2 STORY - UNFINISHED ALL AGES\n',
 '        50\t1-1/2 STORY FINISHED ALL AGES\n',
 '        60\t2-STORY 1946 & NEWER\n',
 '        70\t2-STORY 1945 & OLDER\n',
 '        75\t2-1/2 STORY ALL AGES\n']

In [11]:
feat_search[:10]

[(0, 'MSSubClass'),
 (19, 'MSZoning'),
 (30, 'LotFrontage'),
 (32, 'LotArea'),
 (34, 'Street'),
 (39, 'Alley'),
 (45, 'LotShape'),
 (52, 'LandContour'),
 (59, 'Utilities'),
 (66, 'LotConfig')]

In [12]:
cat_feats = []
cont_feats = []

# A feature whose description block spans more than STEP lines is treated as
# categorical (its header is followed by a list of values); anything shorter
# is treated as continuous.
STEP = 2

# Every block ends where the next header starts; the last block ends at the
# end of the file. Pairing starts with ends removes the special case for the
# final feature and also copes with an empty ``feat_search``.
block_starts = [line_no for line_no, _ in feat_search]
block_ends = block_starts[1:] + [len(desc)]

for (start, feat_name), end in zip(feat_search, block_ends):
    if end - start > STEP:
        cat_feats.append(feat_name)
    else:
        cont_feats.append(feat_name)

In [13]:
cat_feats[:10]

['MSSubClass',
 'MSZoning',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood']

In [14]:
cont_feats[:10]

['LotFrontage',
 'LotArea',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF']

In [15]:
# Swap header words taken from the description file for the column names
# that appear in the CSV header. A name that is not present is logged and
# nothing is appended for it.
for short_name, csv_name in (
    ("Kitchen", "KitchenAbvGr"),
    ("Bedroom", "BedroomAbvGr"),
):
    try:
        cont_feats.remove(short_name)
    except ValueError as e:
        log.error(e)
    else:
        cont_feats.append(csv_name)

In [16]:
UNK = "UNK"
RANK = "rank"
NUMBER = "number"
FREQUENCY = "frequency"


class OrdinalEncoderExt(sklearn.preprocessing.OrdinalEncoder):
    """
    Ordinal encoder that folds rare, empty and unseen values into ``UNK``.

    All values are compared as strings. During ``fit`` each column is
    reduced to the categories selected by at most one criterion (the first
    one given wins, in this order):

    * ``top_n``        -- keep the ``top_n`` most frequent categories;
    * ``count_thresh`` -- keep categories seen at least that many times;
    * ``freq_thresh``  -- keep categories whose relative frequency is at
      least that value;
    * none given       -- keep every category.

    Empty strings are always mapped to ``UNK``, and ``UNK`` is always part of
    the fitted categories so that ``transform`` can route values it has never
    seen to it instead of failing.
    """

    def __init__(self,
                 top_n=None,
                 count_thresh=None,
                 freq_thresh=None,
                 categories="auto",
                 **kwargs,
    ):
        """
        Parameters
        ----------
        top_n : int, optional
            Number of most frequent categories to keep per column.
        count_thresh : int, optional
            Minimum absolute count for a category to be kept.
        freq_thresh : float, optional
            Minimum relative frequency for a category to be kept.
        categories : "auto" or list
            Forwarded to ``sklearn.preprocessing.OrdinalEncoder``.
        **kwargs
            Forwarded to ``sklearn.preprocessing.OrdinalEncoder``.

        Raises
        ------
        ValueError
            If the selected threshold cannot be converted to a number.
        """
        super().__init__(
            categories=categories,
            **kwargs
        )
        # scikit-learn's get_params()/repr()/clone() read every named
        # constructor argument back from an attribute of the same name.
        self.top_n = top_n
        self.count_thresh = count_thresh
        self.freq_thresh = freq_thresh

        self.criterion = ""
        self.criterion_val = None
        for criterion, raw_value, cast in (
            (RANK, top_n, int),
            (NUMBER, count_thresh, int),
            (FREQUENCY, freq_thresh, float),
        ):
            if raw_value is None:
                continue
            try:
                self.criterion_val = cast(raw_value)
            except ValueError as e:
                # Fail here rather than later in fit() with a half-built
                # encoder.
                log.error(e)
                raise
            self.criterion = criterion
            break


    def _keep_mask(self, counts, n_rows):
        """Boolean mask over one column's unique values: True = keep."""
        if self.criterion == RANK:
            keep = np.zeros(counts.shape, dtype=bool)
            if self.criterion_val > 0:
                # Stable sort so that ties are resolved deterministically.
                order = np.argsort(-counts, kind="stable")
                keep[order[:self.criterion_val]] = True
            return keep
        if self.criterion == NUMBER:
            return counts >= self.criterion_val
        if self.criterion == FREQUENCY:
            return counts / n_rows >= self.criterion_val
        return np.ones(counts.shape, dtype=bool)


    def fit(self, X, y=None):
        """
        Learn the categories of every column of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        try:
            X = np.asarray(X)
        except ValueError as e:
            log.error(e)
            raise
        assert (len(X.shape)==2), "Require 2D array"

        X = X.astype(str)
        n_rows, n_cols = X.shape

        reduced_cols = []
        for j in range(n_cols):
            unique_elem, elem_locs, elem_counts = (
                np.unique(
                    X[:, j],
                    return_inverse=True,
                    return_counts=True,
                )
            )
            # The empty string is never a category of its own: give it a
            # zero count so it cannot take a slot, then drop it explicitly.
            named = unique_elem != ""
            keep = self._keep_mask(np.where(named, elem_counts, 0), n_rows)
            keep &= named
            # keep[elem_locs] expands the per-category decision to one flag
            # per row. np.where sizes the result for both the original
            # labels and UNK, so nothing is truncated.
            reduced_cols.append(np.where(keep[elem_locs], X[:, j], UNK))

        if reduced_cols:
            Y = np.column_stack(reduced_cols)
        else:
            Y = np.empty((n_rows, 0), dtype=str)

        # Extra row of UNK so that UNK is a known category in every column.
        sentinel = np.full((1, n_cols), UNK)
        Y = np.concatenate([Y, sentinel], axis=0)

        super().fit(Y)

        return self


    def transform(self, X):
        """
        Encode ``X`` as integer codes; values not seen in ``fit`` become the
        code of ``UNK``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of int, shape (n_samples, n_features)
        """
        X = np.asarray(X).astype(str)

        # Build new columns instead of assigning into X: a narrow string
        # array would silently truncate "UNK" on assignment.
        mapped_cols = [
            np.where(np.isin(X[:, i], self.categories_[i]), X[:, i], UNK)
            for i in range(X.shape[1])
        ]
        if mapped_cols:
            X = np.column_stack(mapped_cols)

        return super().transform(X).astype(int)


    def fit_transform(self, X, y=None):
        """
        Fit on ``X`` and return its integer encoding (``y`` is ignored).
        """
        self.fit(X)

        return self.transform(X)

In [17]:
# Categorical slices of both datasets, with missing values replaced by the
# empty string before encoding.
train_cat_data = df_train[cat_feats].fillna("")
test_cat_data = df_test[cat_feats].fillna("")

enc = OrdinalEncoderExt()

In [18]:
train_cat_data.head(5)

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,60,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,20,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,Gable,CompShg,MetalSd,MetalSd,,TA,TA,CBlock,Gd,TA,Gd,ALQ,Unf,GasA,Ex,Y,SBrkr,TA,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,60,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Mn,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,70,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,Gable,CompShg,Wd Sdng,Wd Shng,,TA,TA,BrkTil,TA,Gd,No,ALQ,Unf,GasA,Gd,Y,SBrkr,Gd,Typ,Gd,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,60,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,Gable,CompShg,VinylSd,VinylSd,BrkFace,Gd,TA,PConc,Gd,TA,Av,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,TA,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [19]:
test_cat_data.head(5)

Unnamed: 0,MSSubClass,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Heating,HeatingQC,CentralAir,Electrical,KitchenQual,Functional,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,20,RH,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,Gable,CompShg,VinylSd,VinylSd,,TA,TA,CBlock,TA,TA,No,Rec,LwQ,GasA,TA,Y,SBrkr,TA,Typ,,Attchd,Unf,TA,TA,Y,,MnPrv,,WD,Normal
1,20,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,TA,TA,CBlock,TA,TA,No,ALQ,Unf,GasA,TA,Y,SBrkr,Gd,Typ,,Attchd,Unf,TA,TA,Y,,,Gar2,WD,Normal
2,60,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,Gable,CompShg,VinylSd,VinylSd,,TA,TA,PConc,Gd,TA,No,GLQ,Unf,GasA,Gd,Y,SBrkr,TA,Typ,TA,Attchd,Fin,TA,TA,Y,,MnPrv,,WD,Normal
3,60,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,Gable,CompShg,VinylSd,VinylSd,BrkFace,TA,TA,PConc,TA,TA,No,GLQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,Gd,Attchd,Fin,TA,TA,Y,,,,WD,Normal
4,120,RL,Pave,,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,Gable,CompShg,HdBoard,HdBoard,,Gd,TA,PConc,Gd,TA,No,ALQ,Unf,GasA,Ex,Y,SBrkr,Gd,Typ,,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [20]:
# Fit the encoder on the training categories, then reuse it for the test set.
train_codes = enc.fit_transform(train_cat_data.values)
train_cat_vals = pd.DataFrame(train_codes, columns=train_cat_data.columns)

test_codes = enc.transform(test_cat_data.values)
test_cat_vals = pd.DataFrame(test_codes, columns=test_cat_data.columns)

In [21]:
train_cat_vals.head(5)

In [22]:
test_cat_vals.head(5)

In [23]:
# One 0/1 indicator column per (feature, category) pair, named
# "<feature>_<category>". The columns are collected in dicts and turned into
# DataFrames once, and the comparison is vectorised, instead of inserting
# columns one at a time through a per-element lambda.
train_indicators = {}
test_indicators = {}

for j, cat in enumerate(train_cat_vals.columns):
    for i, col in enumerate(enc.categories_[j]):
        indicator_name = f"{cat}_{col}"
        train_indicators[indicator_name] = (train_cat_vals[cat] == i).astype(int)
        test_indicators[indicator_name] = (test_cat_vals[cat] == i).astype(int)

train_bin_enc = pd.DataFrame(train_indicators)
test_bin_enc = pd.DataFrame(test_indicators)

# Standardise the indicators with statistics from the training set only.
cat_scl = sklearn.preprocessing.StandardScaler()

train_bin_enc = pd.DataFrame(
    cat_scl.fit_transform(train_bin_enc.values),
    columns=train_bin_enc.columns,
)

test_bin_enc = pd.DataFrame(
    cat_scl.transform(test_bin_enc.values),
    columns=test_bin_enc.columns,
)

In [24]:
train_bin_enc.head(5)

In [25]:
test_bin_enc.head(5)

In [26]:
train_cont_data = df_train[cont_feats]
test_cont_data = df_test[cont_feats]

# Standardise the continuous features; the scaler is fitted on the training
# rows and then applied unchanged to the test rows.
scl = sklearn.preprocessing.StandardScaler()
scl.fit(train_cont_data.values)

cont_columns = train_cont_data.columns
train_cont_vals = pd.DataFrame(
    scl.transform(train_cont_data.values), columns=cont_columns
)
test_cont_vals = pd.DataFrame(
    scl.transform(test_cont_data.values), columns=cont_columns
)

In [27]:
train_cont_data.head(5)

In [28]:
test_cont_data.head(5)

In [29]:
train_cont_vals.head(5)

In [30]:
test_cont_vals.head(5)

In [31]:
# Design matrices: scaled continuous columns next to the scaled indicator
# columns, with any remaining missing value replaced by 0.
X = train_cont_vals.join(train_bin_enc).fillna(0)

log.info(f"Total number of independent features before projection: {X.shape[1]}")

X_test = test_cont_vals.join(test_bin_enc).fillna(0)

2020-11-30 16:17:06,132 - __main__ - INFO - Total number of independent features before projection: 365


In [32]:
X.head(5)

In [33]:
X_test.head(5)

In [34]:
log_y_col = "log_y"

# Raw target plus its natural logarithm, built as a fresh frame.
Y_train = df_train[[y_col]].assign(**{log_y_col: np.log(df_train[y_col])})

# The models are trained on the standardised log target.
y_scl = sklearn.preprocessing.StandardScaler()

Y = pd.DataFrame(
    y_scl.fit_transform(Y_train[[log_y_col]].values),
    columns=[log_y_col],
)

In [36]:
Y.head(5)

Unnamed: 0,log_y
0,0.560068
1,0.212764
2,0.734046
3,-0.437382
4,1.014651


In [37]:
class MixedDataProcessor(sklearn.base.TransformerMixin):
    """
    Preprocessor for a DataFrame with categorical and continuous columns.

    Continuous columns are standardised, categorical columns are encoded with
    ``OrdinalEncoderExt`` and expanded into 0/1 indicator columns named
    ``"<column>_<category>"``. An optional target is standardised with its
    own scaler.
    """

    def __init__(self, categorical_columns, continuous_columns, **kwargs):
        """
        Parameters
        ----------
        categorical_columns : list of str
            Columns to one-hot encode.
        continuous_columns : list of str
            Columns to standardise.
        **kwargs
            Accepted for call compatibility; not used.
        """
        self.categorical_columns = categorical_columns
        self.continuous_columns = continuous_columns
        self.enc = OrdinalEncoderExt()
        self.scl = sklearn.preprocessing.StandardScaler()
        self.y_scl = sklearn.preprocessing.StandardScaler()


    def fit(self, X, y=None):
        """
        Fit the feature scaler and category encoder on ``X`` and, when ``y``
        is given, the target scaler on ``y``.

        Returns
        -------
        self
        """
        self.scl.fit(X[self.continuous_columns].values)
        # Missing categories are represented by the empty string.
        self.enc.fit(X[self.categorical_columns].fillna("").values)

        if y is not None:
            self.y_scl.fit(y)

        return self


    def transform(self, X, y=None):
        """
        Apply the fitted preprocessing.

        Returns
        -------
        pandas.DataFrame
            Processed features when ``y`` is None.
        tuple
            ``(features, scaled_y)`` when ``y`` is given.
        """
        cont_data = X[self.continuous_columns]
        cont_vals = pd.DataFrame(
            self.scl.transform(cont_data.values),
            columns=cont_data.columns,
        )

        cat_data = X[self.categorical_columns].fillna("")
        cat_vals = pd.DataFrame(
            self.enc.transform(cat_data.values),
            columns=cat_data.columns,
        )

        # One indicator per (column, category); code i of column j stands
        # for self.enc.categories_[j][i].
        indicators = {
            f"{cat}_{col}": (cat_vals[cat] == i).astype(int)
            for j, cat in enumerate(cat_vals.columns)
            for i, col in enumerate(self.enc.categories_[j])
        }
        bin_enc = pd.DataFrame(indicators, index=cat_vals.index)

        X_p = cont_vals.join(bin_enc).fillna(0)

        if y is not None:
            # The target scaler is an attribute of the instance, not a
            # global name.
            return (X_p, self.y_scl.transform(y))

        return X_p


    def fit_transform(self, X, y=None):
        """
        Fit on ``X`` (and ``y``) and return the transformed data.
        """
        self.fit(X, y=y)
        return self.transform(X, y=y)

In [38]:
import sklearn.random_projection
import sklearn.decomposition
import sklearn.cluster
import sklearn.feature_selection

# Output dimensionality shared by PCA and feature agglomeration.
n_components = 150
f_regression = sklearn.feature_selection.f_regression

pca_projector = sklearn.decomposition.PCA(
    n_components=n_components,
    svd_solver='randomized',
    whiten=True,
)
agglomeration_projector = sklearn.cluster.FeatureAgglomeration(
    n_clusters=n_components,
)
percentile_selector = sklearn.feature_selection.SelectPercentile(
    f_regression,
    percentile=50,
)

# Candidate dimensionality reducers, in this order.
projs = [pca_projector, agglomeration_projector, percentile_selector]

NameError: name 'n_components' is not defined

def _show_histogram(series, bin_size):
    """
    Display a histogram of ``series`` with bins of width ``bin_size`` that
    run from 0 to the largest value, and return the figure.
    """
    figure = go.Figure()
    figure.add_trace(
        go.Histogram(
            x=series.values,
            xbins=dict(
                start=0,
                end=series.max(),
                size=bin_size,
            ),
        )
    )
    figure.update_layout(
        go.Layout(
            xaxis = dict(
                rangeslider = {'visible': False},
            ),
        )
    )
    plotly.offline.iplot(figure)
    return figure


# Distribution of the raw target.
fig = _show_histogram(Y_train[y_col], bin_size=10000)

# Distribution of the log target.
fig = _show_histogram(Y_train[log_y_col], bin_size=0.10)

In [None]:
Z = X.join(Y)

Z_corr = Z.corr()

In [None]:
#Z_corr

# Only the first few feature columns are plotted.
N_PLOTTED_FEATURES = 2

for col in Z.columns[:N_PLOTTED_FEATURES]:
    if col in (y_col, log_y_col):
        continue

    # Three points are enough to draw the straight trend line.
    cor_x = np.linspace(Z[col].min(), Z[col].max(), 3)
    # Correlation with the target, used as the slope of a line through the
    # origin (presumably because both axes hold standardised values).
    s = Z_corr.at[log_y_col, col]

    fig = go.Figure()

    trace = go.Scatter(x=Z[col], y=Z[log_y_col], mode="markers", text=Z.index)
    trace_cor = go.Scatter(x=cor_x, y=s*cor_x, mode="lines")

    fig.add_trace(trace)
    fig.add_trace(trace_cor)

    # The y values come from the ``log_y_col`` column of Z, so the labels
    # name that column rather than the raw ``y_col``.
    fig.update_layout(
        width=1400,
        height=1200,
        title=dict(text=f"{col} vs. {log_y_col}"),
        xaxis_title=dict(text=col),
        yaxis_title=dict(text=log_y_col),
    )
    plotly.offline.iplot(fig)

In [None]:
tf.keras.backend.set_floatx('float64')

class TFModel(tf.keras.Model):
    """
    Small fully connected regressor with a scikit-learn-like interface.

    Architecture: one hidden ReLU layer of ``units`` neurons followed by a
    single linear output. The output is linear because a ReLU output cannot
    produce negative predictions, while this notebook trains on a
    standardised target.
    """
    def __init__(self,
            batch_size=None,
            epochs=3,
            units=40,
    ):
        """
        Parameters
        ----------
        batch_size : int, optional
            Mini-batch size passed to Keras (``None`` = Keras default).
        epochs : int
            Number of training epochs.
        units : int
            Width of the hidden layer.
        """
        super().__init__()

        self.batch_size = batch_size
        self.epochs = epochs
        self.units = units
        out_dim = 1

        self._hidden0 = tf.keras.layers.Dense(
            units,
            activation=tf.nn.relu,
            name="hidden_0",
        )
        self._dense0 = tf.keras.layers.Dense(
            out_dim,
            activation=None,
            name="dense_0",
        )

        # Optimizer, loss and metrics go straight to compile(). They are not
        # stored under private names such as ``_metrics`` because Keras
        # layers use attributes with such names for their own bookkeeping
        # (NOTE(review): confirm for the TensorFlow version in use).
        self.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.MeanAbsoluteError()],
        )


    def call(self, inputs):
        """
        Forward pass: hidden layer, then the linear output layer.
        """
        return self._dense0(self._hidden0(inputs))


    def fit(self, X, Y, validation_data=None):
        """
        Train for ``self.epochs`` epochs with batches of ``self.batch_size``.

        Parameters
        ----------
        X, Y : array-like
            Features and targets.
        validation_data : tuple, optional
            Forwarded to ``tf.keras.Model.fit``.

        Returns
        -------
        Whatever ``tf.keras.Model.fit`` returns.
        """
        return super().fit(
            x=X,
            y=Y,
            batch_size=self.batch_size,
            epochs=self.epochs,
            validation_data=validation_data,
        )


    def get_params(self, deep=True):
        """
        Return the constructor hyper-parameters as a dict.

        ``deep`` is accepted for scikit-learn compatibility and ignored.
        """
        return dict(
            batch_size=self.batch_size,
            epochs=self.epochs,
            units=self.units,
        )

In [None]:
# (estimator class, hyper-parameter grid) pairs. Every combination of every
# grid becomes one candidate model; candidates are appended in the order the
# pairs are listed here.
model_grids = [
    (
        TFModel,
        dict(
            epochs=[10],
            units=[10, 20, 30],
        ),
    ),
    (
        xgboost.XGBRegressor,
        dict(
            #n_estimators=[100, 110, 150],
            #max_depth=[5, 7],
            learning_rate=[None, 1e-4, 1e-2],
            booster=["gbtree", "gblinear", "dart"],
            reg_alpha=[None, 1e-5, 1e-3],
            reg_lambda=[None, 1e-5, 1e-3],
        ),
    ),
    (
        sklearn.svm.LinearSVR,
        dict(
            C=[1, 2, 0.5],
        ),
    ),
    (
        sklearn.svm.SVR,
        dict(
            C=[1, 2, 0.5],
            kernel=["linear", "poly", "rbf", "sigmoid"],
            gamma=["scale", 0.01],
        ),
    ),
    (
        sklearn.kernel_ridge.KernelRidge,
        dict(
            alpha=[1, 2, 1e-1],
            gamma=[None, 1, 0.1],
        ),
    ),
    (
        sklearn.linear_model.ElasticNet,
        dict(
            alpha=[2, 1, 0.5],
            l1_ratio=[0.5, 1, 0.1],
        ),
    ),
    (
        sklearn.gaussian_process.GaussianProcessRegressor,
        dict(
            kernel=[
                None,
                sklearn.gaussian_process.kernels.Matern(),
                sklearn.gaussian_process.kernels.DotProduct(),
                sklearn.gaussian_process.kernels.RationalQuadratic(),
            ],
        ),
    ),
    (
        sklearn.linear_model.ElasticNetCV,
        dict(
            l1_ratio=[0.5, 0.1, 0.7, 0.9, 0.95, 1],
        ),
    ),
    (
        sklearn.ensemble.GradientBoostingRegressor,
        dict(
            n_estimators=[100, 150],
            criterion=["friedman_mse", "mse", "mae"],
            max_depth=[3, 5],
            max_features=["auto", "sqrt", "log2"],
        ),
    ),
    (
        sklearn.ensemble.RandomForestRegressor,
        dict(
            n_estimators=[100, 150],
            criterion=["mse", "mae"],
            max_depth=[None, 5],
            max_features=["auto", "sqrt", "log2"],
        ),
    ),
    (
        sklearn.linear_model.Ridge,
        dict(
            alpha=[1, 2, 0.5],
        ),
    ),
    (
        sklearn.linear_model.ARDRegression,
        dict(
            alpha_1=[1e-6, 1e-5],
            alpha_2=[1e-6, 1e-5],
            lambda_1=[1e-6, 1e-5],
            lambda_2=[1e-6, 1e-5],
        ),
    ),
    (
        sklearn.linear_model.SGDRegressor,
        dict(
            eta0=[0.01, 0.005],
            power_t=[0.25, 0.2],
        ),
    ),
    (
        sklearn.ensemble.AdaBoostRegressor,
        dict(
            base_estimator=[
                None,
                sklearn.tree.DecisionTreeRegressor(max_depth=4),
            ],
            loss=["linear", "square", "exponential"],
        ),
    ),
    (
        sklearn.ensemble.BaggingRegressor,
        dict(
            n_estimators=[10, 20],
            max_features=[1.0, 0.2],
            bootstrap=[True, False],
        ),
    ),
]

# The plain linear regression has no grid and always comes first.
models = [
    sklearn.linear_model.LinearRegression(),
]

for estimator_cls, grid in model_grids:
    models.extend(
        estimator_cls(**params)
        for params in sklearn.model_selection.ParameterGrid(grid)
    )

In [None]:
import sklearn.pipeline

# ``model`` is only bound further down, by the evaluation loop, so on a fresh
# kernel this cell used to fail with a NameError. The first candidate of
# ``models`` is used instead.
# NOTE(review): swap in another entry of ``models`` if a different estimator
# was intended here.
pipe = sklearn.pipeline.Pipeline([('projector', projs[0]), ('model', models[0])])

In [None]:
import warnings
warnings.filterwarnings("ignore")

pred_cols = [f"model_{i:03d}" for i in range(len(models))]


def f(x):
    """
    Map standardised log-price values back to prices.

    Accepts 1-D or column-shaped input and always returns a 1-D array; the
    values are passed to ``y_scl.inverse_transform`` as a single column.
    """
    scaled = np.asarray(x, dtype=float).reshape(-1, 1)
    return np.exp(y_scl.inverse_transform(scaled)).ravel()


scores = []

# "k0_scl": R^2 on the standardised log target, "k0": R^2 on prices.
kf_cols = ["k0_scl", "k0"]

# One fixed hold-out split shared by every model. It does not depend on the
# model, so it is computed once instead of once per loop iteration.
X_train_split, X_test_split, y_train_split, y_test_split = (
    sklearn.model_selection.train_test_split(
        X,
        Y,
        test_size=0.2,
        random_state=42)
)
y_holdout = y_test_split[log_y_col]

for model in models:
    t1 = time.perf_counter()

    model.fit(X_train_split, y_train_split)

    # Score on the held-out rows only: scoring on all of X would mostly
    # measure how well the model reproduces rows it was trained on.
    pred = model.predict(X_test_split)

    try:
        r2 = sklearn.metrics.r2_score(y_holdout, pred)
    except ValueError as e:
        log.error(e)
        r2 = np.nan
    try:
        r2t = sklearn.metrics.r2_score(f(y_holdout), f(pred))
    except ValueError as e:
        log.error(e)
        r2t = np.nan

    t2 = time.perf_counter()
    scores.append([r2, r2t, t2-t1])
    log.info(f"{str(model)[:15]} -- time elapsed: {t2-t1:5.3f}")

In [None]:
# One row per model: the two scores, the elapsed time, then the model's
# string form and its parameters.
score_columns = kf_cols + ["time"]
cv_df = pd.DataFrame(scores, index=pred_cols, columns=score_columns).assign(
    name=[str(model) for model in models],
    params=[model.get_params() for model in models],
)

In [None]:
cv_df.head(5)

In [None]:
cv_df.sort_values(by=['k0_scl'], ascending=False).head(20)

In [None]:
true_labels_fn = os.path.join(data_dir, "true_submission.csv")

# Reference prices plus a column holding their natural logarithm.
true_labels = pd.read_csv(true_labels_fn).assign(
    **{log_y_col: lambda frame: np.log(frame["SalePrice"])}
)

In [None]:
# Model names ordered from highest to lowest 'k0' score.
ranked_model_names = cv_df.sort_values(by=['k0'], ascending=False).index

# NOTE(review): position 128 is hard-coded and is not the top-ranked entry;
# confirm that this pick is intentional.
model_name = ranked_model_names[128]
# The last three characters of the name are the position in ``models``.
model = models[int(model_name[-3:])]

In [None]:
results = pd.DataFrame({"Id": df_test["Id"], "SalePrice": f(model.predict(X_test)).reshape((-1))})

In [None]:
results_fn = os.path.join(data_dir, "results.csv")
#results.to_csv(results_fn, index=False)

In [None]:
# Root mean squared error between log reference prices and log predictions.
log_mse = sklearn.metrics.mean_squared_error(
    true_labels[log_y_col],
    np.log(results[y_col]),
)
val = np.sqrt(log_mse)
log.info(f"RMSE of log: {val}")

In [None]:
# The ranking does not change inside the loop, so sort once.
ranked_names = cv_df.sort_values(by=['k0'], ascending=False).index

# Loop-local names are used on purpose: the loop must not overwrite the
# ``model``, ``results`` and ``val`` produced by the cells above, which the
# final ``results`` display relies on.
for i, ranked_name in enumerate(ranked_names):
    ranked_model = models[int(ranked_name[-3:])]
    ranked_prices = f(ranked_model.predict(X_test)).reshape((-1))
    try:
        ranked_rmse = np.sqrt(
            sklearn.metrics.mean_squared_error(
                true_labels[log_y_col], np.log(ranked_prices)
            )
        )
        log.info(f"{i} RMSE of log: {ranked_rmse}")
    except ValueError as e:
        log.error(e)

In [None]:
results