In [1]:
# import libraries
import os
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from typing import Optional
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

warnings.filterwarnings('ignore')

In [2]:
# data file locations (the frames themselves are read in the next cell)
data_location = './local'
train_file_name = 'train.csv'
test_file_name = 'test.csv'

In [3]:
# read the train and test CSV files from the data directory, in that order
df_train, df_test = (
    pd.read_csv(os.path.join(data_location, csv_name))
    for csv_name in (train_file_name, test_file_name)
)

In [4]:
# quick data exploration: print each column's non-null count and dtype for the training frame
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [5]:
# drop column(s) whose share of null values is greater than or equal to `ratio` (default 50%)
def drop_null_cols(df: pd.DataFrame, ratio: Optional[float] = .5) -> pd.DataFrame:
    """Return a copy of ``df`` without its mostly-null columns.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is never modified.
    ratio : float, optional
        A column is dropped when ``null count / row count >= ratio``.
        ``None`` falls back to the default of 0.5.

    Returns
    -------
    pd.DataFrame
        A new frame holding the surviving columns in their original order.
    """
    if ratio is None:
        ratio = .5

    n_rows = df.shape[0]
    if n_rows == 0:
        # without rows there is no null share to measure, so nothing is dropped
        return df.copy()

    null_share = df.isnull().sum() / n_rows
    # written as "not >=" so the keep/drop decision mirrors the drop rule exactly
    keep_mask = ~(null_share >= ratio).to_numpy()
    return df.loc[:, keep_mask].copy()

In [6]:
# fill missing fields based on column type
def full_missing_fields(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with missing values imputed per column type.

    Object columns that contain nulls are filled with their most frequent
    value; numeric columns that contain nulls are filled with their mean.
    Columns without nulls, and columns of any other dtype, are left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is never modified.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns as ``df``.
    """
    newDF = df.copy()

    null_counts = newDF.isnull().sum()
    miss_cols = set(null_counts[null_counts > 0].index)

    # list comprehensions keep the frame's column order (a set intersection would not)
    obj_cols = [c for c in newDF.select_dtypes(include='object').columns if c in miss_cols]
    num_cols = [c for c in newDF.select_dtypes(include='number').columns if c in miss_cols]

    # SimpleImputer rejects a zero-column input, so only impute the groups
    # that actually have something to fill
    if obj_cols:
        imputer_freq = SimpleImputer(strategy='most_frequent')
        newDF[obj_cols] = imputer_freq.fit_transform(newDF[obj_cols])
    if num_cols:
        imputer_mean = SimpleImputer(strategy='mean')
        newDF[num_cols] = imputer_mean.fit_transform(newDF[num_cols])
    return newDF

In [7]:
# one hot transformation
def one_hot_encode(df: pd.DataFrame, exclude=('type',)) -> pd.DataFrame:
    """Return a copy of ``df`` with its object columns one-hot encoded.

    Each encoded column is replaced by indicator columns named
    ``<column>_<value>``. The untouched columns keep their original order and
    the indicator columns are appended after them, one group per encoded
    column.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; it is never modified.
    exclude : str or iterable of str, optional
        Object columns to leave as they are. Defaults to ``('type',)``, the
        train/test marker column; names absent from ``df`` are ignored.

    Returns
    -------
    pd.DataFrame
        A new, encoded frame.
    """
    if isinstance(exclude, str):
        exclude = (exclude,)
    skipped = set(exclude)

    obj_cols = [c for c in df.select_dtypes(include='object').columns if c not in skipped]
    if not obj_cols:
        return df.copy()

    # one call encodes every column; the prefix defaults to the column name
    return pd.get_dummies(df, columns=obj_cols)

In [8]:
# transform both datasets
# The columns to drop are decided on the training data only and the test set is
# then restricted to the surviving columns. Dropping each set independently
# could remove a column from one set but not the other, which would leave that
# column all-NaN for one side once the two frames are concatenated.
df_train = drop_null_cols(df_train)
df_test = df_test[[col for col in df_train.columns if col in df_test.columns]]
df_train = full_missing_fields(df_train)
df_test = full_missing_fields(df_test)

In [9]:
# combine the train and test dataframes for one-hot encoding:
# encoding the combined frame ensures that a category which appears in only
# one of the two sets still gets the same indicator column in both
df_train['type'] = 'train'
df_test['type'] = 'test'
# ignore_index=True already yields a fresh 0..n-1 index
df_merge = pd.concat([df_train, df_test], ignore_index=True)

In [10]:
# one-hot encode the object columns of the combined frame
# (one_hot_encode leaves the 'type' marker column as-is)
df_merge = one_hot_encode(df_merge)
# keep only the first occurrence of any repeated column name
df_merge = df_merge.loc[:,~df_merge.columns.duplicated()]
# last expression of the cell: shows (rows, columns) as the cell output
df_merge.shape

(2919, 278)

In [11]:
# split the encoded frame back into its train / test rows and remove the marker
# column; `drop` returns a new frame, so nothing is modified in place on a
# filtered slice of df_merge
df_train = df_merge[df_merge['type'] == 'train'].drop(columns='type')
df_test = df_merge[df_merge['type'] == 'test'].drop(columns='type')

### Model

In [12]:
# import libraries
from xgboost import XGBRegressor
import pickle

In [13]:
# baseline regressor: only the random seed is set, every other hyperparameter is the library default
xgb = XGBRegressor(random_state=0)

In [14]:
y_col = 'SalePrice'
# 'Id' is a row identifier rather than a property of the house, so it is kept
# out of the feature list together with the target column
non_feature_cols = {y_col, 'Id'}
x_col = [col for col in df_train.columns if col not in non_feature_cols]
xgb.fit(df_train[x_col], df_train[y_col])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [15]:
# store trained result
filename = './model/firstSubmission.pkl'
# create the target directory on a fresh checkout instead of failing
os.makedirs(os.path.dirname(filename), exist_ok=True)
# the context manager flushes and closes the file even if pickling fails
with open(filename, 'wb') as model_file:
    pickle.dump(xgb, model_file)

In [16]:
# generate result dataframe
# `assign` returns a new frame, so the prediction column is never written into
# a possibly sliced frame; re-running the cell simply overwrites 'y_pred'
df_test = df_test.assign(y_pred=xgb.predict(df_test[x_col]))
submission = df_test[['Id', 'y_pred']].rename(columns={'y_pred': y_col})
submission_path = './submission/firstSubmission.csv'
# create the target directory on a fresh checkout instead of failing
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
submission.to_csv(submission_path, index=False, header=True)