In [103]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from tsfresh.feature_selection import select_features
import lightgbm as lgb

In [127]:
d_train_raw = pd.read_csv("../data/raw/train.csv")
# d_test_raw = pd.read_csv("../data/raw/test.csv")

In [128]:
# d_train_raw["train"] = 1
# d_test_raw["train"] = 0
# d_train_raw = pd.concat([d_train_raw, d_test_raw])
d_train_feats = d_train_raw.drop(columns=["Id"])
d_train_feats.reset_index(drop=True, inplace=True)

In [129]:
d_train_feats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 18  YearBuil

In [130]:
# remove certain columns
cols_to_drop = [
    "LotFrontage",
    "Alley",
    "MasVnrType",
    "MasVnrArea",
    "BsmtQual",
    "BsmtCond",
    "BsmtExposure",
    "BsmtFinType1",
    "BsmtFinType2",
    "Electrical",
    "FireplaceQu",
    "GarageType",
    "GarageYrBlt",
    "GarageFinish",
    "GarageQual",
    "GarageCond",
    "PoolQC",
    "Fence",
    "MiscFeature",
    "MiscVal",
    "SaleType",
    "SaleCondition"
]

d_train = d_train_feats.drop(columns=cols_to_drop)

In [131]:
# d_train = d_train.dropna(axis=0)
d_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 58 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MSSubClass     1460 non-null   int64 
 1   MSZoning       1460 non-null   object
 2   LotArea        1460 non-null   int64 
 3   Street         1460 non-null   object
 4   LotShape       1460 non-null   object
 5   LandContour    1460 non-null   object
 6   Utilities      1460 non-null   object
 7   LotConfig      1460 non-null   object
 8   LandSlope      1460 non-null   object
 9   Neighborhood   1460 non-null   object
 10  Condition1     1460 non-null   object
 11  Condition2     1460 non-null   object
 12  BldgType       1460 non-null   object
 13  HouseStyle     1460 non-null   object
 14  OverallQual    1460 non-null   int64 
 15  OverallCond    1460 non-null   int64 
 16  YearBuilt      1460 non-null   int64 
 17  YearRemodAdd   1460 non-null   int64 
 18  RoofStyle      1460 non-null

In [132]:
porch_cols = [c for c in d_train.columns if "Porch" in c]
d_train["PorchArea"] = d_train[porch_cols].sum(axis=1)
d_train = d_train.reset_index(drop=True)
d_train_idx = d_train.index
d_target = d_train["SalePrice"]

In [133]:
d_train.drop(columns=["SalePrice"], inplace=True)

In [134]:
all_cols = set(d_train.columns)
num_cols = [
    "LotArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "GrLivArea",
    "BsmtFullBath",
    "BsmtHalfBath",
    "FullBath",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageArea",
    "WoodDeckSF",
    "PorchArea",
    "PoolArea"
]
cat_cols = list(all_cols - set(num_cols) - {"Exterior1st"})

In [135]:
oe = OrdinalEncoder()
oe.fit(d_train[cat_cols])
d_enc = pd.DataFrame(oe.transform(d_train[cat_cols]), columns=cat_cols, index=d_train_idx)

In [136]:
d_feats = pd.concat([d_train[num_cols], d_enc], axis=1)
d_feats.head()

Unnamed: 0,LotArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,BsmtFullBath,BsmtHalfBath,...,MoSold,YrSold,Street,Condition1,LandContour,RoofStyle,ExterQual,ScreenPorch,ExterCond,MSZoning
0,8450,706,0,150,856,856,854,1710,1,0,...,1.0,2.0,1.0,2.0,3.0,1.0,2.0,0.0,4.0,3.0
1,9600,978,0,284,1262,1262,0,1262,0,1,...,4.0,1.0,1.0,1.0,3.0,1.0,3.0,0.0,4.0,3.0
2,11250,486,0,434,920,920,866,1786,1,0,...,8.0,2.0,1.0,2.0,3.0,1.0,2.0,0.0,4.0,3.0
3,9550,216,0,540,756,961,756,1717,1,0,...,1.0,0.0,1.0,2.0,3.0,1.0,3.0,0.0,4.0,3.0
4,14260,655,0,490,1145,1145,1053,2198,1,0,...,11.0,2.0,1.0,2.0,3.0,1.0,2.0,0.0,4.0,3.0


In [137]:
d_train_filt = select_features(d_feats, d_target)

In [145]:
cat_cols = set(d_train_filt.columns) & set(cat_cols)

In [146]:
train_idx = d_train_filt.sample(frac=0.7).index
test_idx = pd.Index(set(d_train_filt.index) - set(train_idx))

In [147]:
train = lgb.Dataset(
    data=d_train_filt.loc[train_idx, :],
    label=d_target.loc[train_idx],
    categorical_feature=cat_cols
)

test = lgb.Dataset(
    data=d_train_filt.loc[test_idx, :],
    label=d_target.loc[test_idx],
    categorical_feature=cat_cols
)

In [159]:
hp = {
    "objective": "mean_squared_error",
    "learning_rate": 0.005,
    "num_iterations": 2000,
    "early_stopping_round": 300,
    "metric": "l1",
    "verbose": -1,
}

In [160]:
model = lgb.train(
    params=hp,
    train_set=train,
    categorical_feature=cat_cols,
    verbose_eval=50,
    valid_names=["train", "test"],
    valid_sets=[train, test]
)

Training until validation scores don't improve for 300 rounds
[50]	train's l1: 47510.1	test's l1: 49338.1
[100]	train's l1: 40764	test's l1: 42988.7
[150]	train's l1: 36322.7	test's l1: 39073.2
[200]	train's l1: 32951.1	test's l1: 36224.6
[250]	train's l1: 30623.9	test's l1: 34388.8
[300]	train's l1: 28913.3	test's l1: 33093.9
[350]	train's l1: 27433.1	test's l1: 31982.8
[400]	train's l1: 26178.2	test's l1: 31304.1
[450]	train's l1: 25301.3	test's l1: 30837
[500]	train's l1: 24604.4	test's l1: 30438.3
[550]	train's l1: 24079.1	test's l1: 30114.4
[600]	train's l1: 23681.9	test's l1: 29901.3
[650]	train's l1: 23337.9	test's l1: 29717
[700]	train's l1: 23027.7	test's l1: 29623
[750]	train's l1: 22771.2	test's l1: 29625.1
[800]	train's l1: 22550.6	test's l1: 29572.9
[850]	train's l1: 22264.2	test's l1: 29498.4
[900]	train's l1: 22027.7	test's l1: 29429.7
[950]	train's l1: 21809.2	test's l1: 29351.7
[1000]	train's l1: 21594.7	test's l1: 29350.8
[1050]	train's l1: 21381.5	test's l1: 29358.3
