In [1]:
import numpy as np
import pandas as pd
from category_encoders.cat_boost import CatBoostEncoder
from typing import List
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb

In [2]:
train = pd.read_csv("../data/home_price/train.csv")
test = pd.read_csv("../data/home_price/test.csv")

data = pd.concat([train, test], sort=False)

In [3]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Abnorml,
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,,,,0,9,2006,WD,Abnorml,
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,Shed,700,7,2006,WD,Normal,


In [4]:
categorical_features = [
    "Neighborhood"
]

not_categorical_features = [
    "OverallQual","YearBuilt"
]

trg_feature = [
    "SalePrice"
]


used_columns = categorical_features + not_categorical_features + trg_feature

data = data[used_columns]
data

Unnamed: 0,Neighborhood,OverallQual,YearBuilt,SalePrice
0,CollgCr,7,2003,208500.0
1,Veenker,6,1976,181500.0
2,CollgCr,7,2001,223500.0
3,Crawfor,7,1915,140000.0
4,NoRidge,8,2000,250000.0
...,...,...,...,...
1454,MeadowV,4,1970,
1455,MeadowV,4,1970,
1456,Mitchel,5,1960,
1457,Mitchel,5,1992,


In [5]:
data.describe()

Unnamed: 0,OverallQual,YearBuilt,SalePrice
count,2919.0,2919.0,1460.0
mean,6.089072,1971.312778,180921.19589
std,1.409947,30.291442,79442.502883
min,1.0,1872.0,34900.0
25%,5.0,1953.5,129975.0
50%,6.0,1973.0,163000.0
75%,7.0,2001.0,214000.0
max,10.0,2010.0,755000.0


In [6]:
data.isnull().sum()

Neighborhood       0
OverallQual        0
YearBuilt          0
SalePrice       1459
dtype: int64

In [7]:
def encode_categorial_features(data:pd.DataFrame, categoricl_features:List[str]) -> pd.DataFrame:
    tmp_train = data[:len(train)]
    tmp_test = data[len(train):]

    ce = CatBoostEncoder()
    for feature in categoricl_features:
        tmp_train[feature] = ce.fit_transform(tmp_train[feature], tmp_train["SalePrice"])
        tmp_test[feature] = ce.transform(tmp_test[feature])

    return pd.concat([tmp_train, tmp_test], sort=False)


encoded_data = encode_categorial_features(data, categorical_features)
encoded_data
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_train[feature] = ce.fit_transform(tmp_train[feature], tmp_train["SalePrice"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_test[feature] = ce.transform(tmp_test[feature])


Unnamed: 0,Neighborhood,OverallQual,YearBuilt,SalePrice
0,180921.195890,7,2003,208500.0
1,180921.195890,6,1976,181500.0
2,194710.597945,7,2001,223500.0
3,180921.195890,7,1915,140000.0
4,180921.195890,8,2000,250000.0
...,...,...,...,...
1454,103151.177549,4,1970,
1455,103151.177549,4,1970,
1456,156763.143918,5,1960,
1457,156763.143918,5,1992,


In [8]:
X_train = encoded_data[:len(train)].drop("SalePrice",axis=1)
y_train = encoded_data[:len(train)]["SalePrice"]

X_test = encoded_data[len(train):].drop("SalePrice",axis=1)

In [9]:
X_train

Unnamed: 0,Neighborhood,OverallQual,YearBuilt
0,180921.195890,7,2003
1,180921.195890,6,1976
2,194710.597945,7,2001
3,180921.195890,7,1915
4,180921.195890,8,2000
...,...,...,...
1455,192929.458176,6,1999
1456,188651.728711,6,1978
1457,208946.709723,7,1941
1458,146019.507537,5,1950


In [10]:
y_train

0       208500.0
1       181500.0
2       223500.0
3       140000.0
4       250000.0
          ...   
1455    175000.0
1456    210000.0
1457    266500.0
1458    142125.0
1459    147500.0
Name: SalePrice, Length: 1460, dtype: float64

In [11]:
X_test

Unnamed: 0,Neighborhood,OverallQual,YearBuilt
0,146002.275203,5,1961
1,146002.275203,6,1958
2,192705.339949,5,1997
3,192705.339949,6,1998
4,305515.238303,8,1992
...,...,...,...
1454,103151.177549,4,1970
1455,103151.177549,4,1970
1456,156763.143918,5,1960
1457,156763.143918,5,1992


In [23]:
y_preds = []
models = []
oof_train = np.zeros((len(X_train), ))

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

In [24]:
params = {
    "objective":"binary",
    "max_bin":500,
    "learning_rate":0.05,
    "num_leaves":80
}

In [27]:
for fold_id, (train_index, valid_index) in enumerate(cv.split(X_train, y_train)):
    print(fold_id, train_index.shape, valid_index.shape)

0 (1314,) (146,)
1 (1314,) (146,)
2 (1314,) (146,)
3 (1314,) (146,)
4 (1314,) (146,)
5 (1314,) (146,)
6 (1314,) (146,)
7 (1314,) (146,)
8 (1314,) (146,)
9 (1314,) (146,)


