In [34]:
from pytorch_tabnet.tab_model import TabNetRegressor

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

import pandas as pd
import numpy as np
np.random.seed(0)


import os
import wget
from pathlib import Path

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [35]:
train = pd.read_csv(filepath_or_buffer='train.csv')
target = 'SalePrice'
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid", "test"], p =[.9, 0.05, 0.05], size=(train.shape[0],))

train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index
test_indices = train[train.Set=="test"].index

In [36]:
categorical_columns = []
categorical_dims =  {}
for col in train.columns[train.dtypes == object]:
    print(col, train[col].nunique())
    l_enc = LabelEncoder()
    train[col] = train[col].fillna("VV_likely")
    train[col] = l_enc.fit_transform(train[col].values)
    categorical_columns.append(col)
    categorical_dims[col] = len(l_enc.classes_)

for col in train.columns[train.dtypes == 'float64']:
    train.fillna(train.loc[train_indices, col].mean(), inplace=True)

MSZoning 5
Street 2
Alley 2
LotShape 4
LandContour 4
Utilities 2
LotConfig 5
LandSlope 3
Neighborhood 25
Condition1 9
Condition2 8
BldgType 5
HouseStyle 8
RoofStyle 6
RoofMatl 8
Exterior1st 15
Exterior2nd 16
MasVnrType 4
ExterQual 4
ExterCond 5
Foundation 6
BsmtQual 4
BsmtCond 4
BsmtExposure 4
BsmtFinType1 6
BsmtFinType2 6
Heating 6
HeatingQC 5
CentralAir 2
Electrical 5
KitchenQual 4
Functional 7
FireplaceQu 5
GarageType 6
GarageFinish 3
GarageQual 5
GarageCond 5
PavedDrive 3
PoolQC 3
Fence 4
MiscFeature 4
SaleType 9
SaleCondition 6
Set 3


In [37]:
unused_feat = ['Id']

features = [ col for col in train.columns if col not in unused_feat+[target]]

cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

# print(cat_dims)

# define your embedding sizes : here just a random choice
# cat_emb_dim = [5, 4, 3, 6, 2, 2, 1, 10]
cat_emb_dim = (np.ones(len(cat_dims))*100).astype(np.int)

In [38]:
clf = TabNetRegressor(cat_dims=cat_dims, cat_emb_dim=cat_emb_dim, cat_idxs=cat_idxs)

Device used : cuda


In [39]:
X_train = train[features].values[train_indices]
y_train = train[target].values[train_indices].reshape(-1, 1)/1000000

X_valid = train[features].values[valid_indices]
y_valid = train[target].values[valid_indices].reshape(-1, 1)/1000000

X_test = train[features].values[test_indices]
y_test = train[target].values[test_indices].reshape(-1, 1)/1000000

In [40]:
max_epochs = 300 if not os.getenv("CI", False) else 2

In [41]:
clf.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=['train', 'valid'],
    eval_metric=['mae', 'mse'],
    max_epochs=max_epochs,
    patience=50,
    batch_size=1024, virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
)

epoch 0  | loss: 0.73285 | train_mae: 0.17449 | train_mse: 0.03836 | valid_mae: 0.19546 | valid_mse: 0.04843 |  0:00:01s
epoch 1  | loss: 0.38767 | train_mae: 0.21083 | train_mse: 0.11176 | valid_mae: 0.20842 | valid_mse: 0.05219 |  0:00:01s
epoch 2  | loss: 0.30598 | train_mae: 0.17737 | train_mse: 0.03833 | valid_mae: 0.1948  | valid_mse: 0.0469  |  0:00:02s
epoch 3  | loss: 0.17778 | train_mae: 0.15123 | train_mse: 0.02921 | valid_mae: 0.16537 | valid_mse: 0.03602 |  0:00:02s
epoch 4  | loss: 0.09332 | train_mae: 0.13332 | train_mse: 0.02474 | valid_mae: 0.1474  | valid_mse: 0.03029 |  0:00:03s
epoch 5  | loss: 0.08375 | train_mae: 0.12866 | train_mse: 0.02354 | valid_mae: 0.14271 | valid_mse: 0.02959 |  0:00:03s
epoch 6  | loss: 0.05585 | train_mae: 0.13281 | train_mse: 0.02477 | valid_mae: 0.14691 | valid_mse: 0.03048 |  0:00:04s
epoch 7  | loss: 0.0484  | train_mae: 0.13336 | train_mse: 0.0251  | valid_mae: 0.14637 | valid_mse: 0.02945 |  0:00:04s
epoch 8  | loss: 0.0361  | train

In [42]:
preds = clf.predict(X_test)

y_true = y_test

test_score = mean_squared_error(y_pred=preds, y_true=y_true)

print(f"BEST VALID SCORE: {clf.best_cost}")
print(f"FINAL TEST SCORE: {test_score}")

np.mean(np.abs(y_true-preds)*1000000)

BEST VALID SCORE: 0.0034919706980819673
FINAL TEST SCORE: 0.0023986513055675486


37511.12926712762

Result is better, than result showed by simple nn, but this result is still long way from the result shown by wooden models.