## Use Light GBM to predict house sale price
- Use label encoder to encode categorial variables
- Remove attributes that provide little values (attributes that are only relevant for a small subset of the dataset)
- Perform imputation to fill in data missing for some of the dataset

In [1]:
import pandas as pd
from pandas_summary import DataFrameSummary
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
%matplotlib inline

data_dir = '_data/'
dataset_name = 'drop_missing'
train_file = 'train_' + dataset_name + '.csv'
test_file = 'test_' + dataset_name + '.csv'

In [2]:
df_train = pd.read_csv(data_dir + train_file).drop('Id', axis=1)
df_test = pd.read_csv(data_dir + test_file)
target_col = 'SalePrice'

df_X = df_train.drop(target_col, axis=1)
submissionX = df_test.drop('Id', axis=1)

In [3]:
df_X.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,61,0,0,0,0,2,2008,WD,Normal
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,298,0,0,0,0,0,5,2007,WD,Normal
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,42,0,0,0,0,9,2008,WD,Normal
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,35,272,0,0,0,2,2006,WD,Abnorml
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,192,84,0,0,0,0,12,2008,WD,Normal


In [4]:
# Label encode categorical variables
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

categorical_cols = df_train.dtypes[df_train.dtypes == 'object']
for c in categorical_cols.index.values:
    test_c_values = df_X[c].fillna('NAN').values
    submission_c_values = submissionX[c].fillna('NAN').values
    all_x = np.concatenate((test_c_values, submission_c_values))
    le.fit(all_x)
    df_X[c] = le.transform(test_c_values)
    submissionX[c] = le.transform(submission_c_values)
    
    # LGB expected missing values to be set to -1 for categorical variable
    if np.isin('NAN', all_x):
        transformed_na = le.transform(['NAN'])[0]
        df_X[c] = df_X[c].replace({transformed_na:-1})
        submissionX[c] = submissionX[c].replace({transformed_na:-1})
    
from sklearn.model_selection import train_test_split
X = df_X.to_numpy()
y = df_train[target_col].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
validation_data = lgb.Dataset(X_test, label=y_test)
train_data = lgb.Dataset(X_train, label=y_train)

In [5]:
# Example configuration for regression problem - https://github.com/microsoft/LightGBM/blob/master/examples/regression/train.conf
param = {'num_leaves': 31, 'objective': 'regression'}
param['metric'] = 'l2' # default for regression
num_round = 20
lgb_model = lgb.train(param, train_data, num_round, valid_sets=[validation_data])

[1]	valid_0's l2: 6.28628e+09
[2]	valid_0's l2: 5.44644e+09
[3]	valid_0's l2: 4.75442e+09
[4]	valid_0's l2: 4.15934e+09
[5]	valid_0's l2: 3.69249e+09
[6]	valid_0's l2: 3.28656e+09
[7]	valid_0's l2: 2.92111e+09
[8]	valid_0's l2: 2.65178e+09
[9]	valid_0's l2: 2.41234e+09
[10]	valid_0's l2: 2.21536e+09
[11]	valid_0's l2: 2.04542e+09
[12]	valid_0's l2: 1.90075e+09
[13]	valid_0's l2: 1.77057e+09
[14]	valid_0's l2: 1.6716e+09
[15]	valid_0's l2: 1.58412e+09
[16]	valid_0's l2: 1.48658e+09
[17]	valid_0's l2: 1.42755e+09
[18]	valid_0's l2: 1.36499e+09
[19]	valid_0's l2: 1.31198e+09
[20]	valid_0's l2: 1.26826e+09


In [6]:
submissiony = lgb_model.predict(submissionX)

In [7]:
df_test['SalePrice'] = submissiony

In [8]:
prediction_file = 'lgb_prediction_' + dataset_name + '.csv'
df_test[['Id', 'SalePrice']].to_csv(data_dir + prediction_file, index=False)

In [9]:
# Model 2
# Drop columns with lots of missing values
drop_cols = ['Alley', 'Fence','MiscFeature','PoolArea', 'PoolQC']
train2 = df_X.drop(drop_cols, axis=1)

KeyError: "['Alley' 'Fence' 'MiscFeature' 'PoolArea' 'PoolQC'] not found in axis"