In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


Log History:
- 230303: Notebook Created
- 230312: GridSearchCV for hyperparameter tuning + Add ...
- 230314: ANOVA analysis for categorical variable
- 230315: Log loss to solve skewness for both numerical features and numerical target variable

Notebook table of contents: <br>
0. Get data and libraries
1. Preprocessing
2. Exploratory Data Analysis
3. Machine Learning Model

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

In [3]:
df = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
df.drop(columns = ['Id'],inplace = True)
df_original = df.copy()

test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
test.drop(columns = ['Id'], inplace = True)
test_original = test.copy()

print(df.shape)
print(test.shape)

(1460, 80)
(1459, 79)


In [4]:
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
cat_cols = []
num_cols = []
for col in df.columns:
    if df[col].dtypes == object:
        cat_cols.append(col)
    else:
        if (col != 'SalePrice'):
            num_cols.append(col)

# 2. Exploratory Data Analysis

## 2.0. Missing data

In [6]:
for col in num_cols:
    m = df[col].mean()
    df[col].fillna(m, inplace = True)
    m = test[col].mean()
    test[col].fillna(m, inplace = True)
for col in cat_cols:
    df[col].fillna('Not Available', inplace = True)
    test[col].fillna('Not Available', inplace = True)

## 2.1 Analysis of the target variable by each of the categorical variable

Here we perform an 1-way ANOVA analysis to determine the strength of the correlation between a single categorical and the numerical target variable (SalePrice)
Read about the analysis here: http://www.sefidian.com/2022/08/02/measure-the-correlation-between-numerical-and-categorical-variables-and-the-correlation-between-two-categorical-variables-in-python-chi-square-and-anova/

In [7]:
"""Because building boxplots for each of 43 categorical features is very redundant
    and not visually appealing, we first explore the data by building a simple model
    to predict the housing price based solely on each categorical feature"""


from scipy.stats import f_oneway
cat_feat_res = []
for feat in cat_cols:
    cats = df.groupby(feat)['SalePrice'].apply(list)
    f_val, p_val = f_oneway(*cats)
    if (p_val < 0.05):
        cat_feat_res.append((feat, 'c', f_val))
cat_feat_res.sort(key = lambda x : -x[2])

## 2.2. SalePrice by numerical values

### 2.2.1. Sort features by their correlation values

In [8]:
num_feat_res = []
for feat in num_cols:
    cor = df[[feat, 'SalePrice']].corr().iloc[0,1]
    num_feat_res.append((feat, 'n', cor))
num_feat_res.sort(key = lambda x : -np.abs(x[2]))

### 2.2.2. Check skewness and log transformation

In [9]:
from scipy.stats import skew

In [10]:
# Skewing all features (Log level transform)
skewed_feats = df[num_cols].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
# skewed_feats = data_b.apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("Skew in numerical features: ")
skewness = pd.DataFrame({'Skew' :skewed_feats})

skewness = skewness[abs(skewness) > 1]
print("There are {} skewed numerical features to Log1p transform".format(skewness.shape[0]))

skewed_features = skewness.index

for feat in skewed_features:
    df[feat] = np.log1p(df[feat])
    if (feat not in test.columns):
        continue
    test[feat] = np.log1p(test[feat])

y = df['SalePrice'].copy()
y = np.log1p(y)

Skew in numerical features: 
There are 36 skewed numerical features to Log1p transform


# 3. Models

## 3.0 Pre model

Different Methods to choose the features for the model:
1. Choose the top 10 features with the lowest MAPE, regardless of being cat or num 
2. Choose the top 5 cat features with the lowest MAPE and 5 num features
<br>
Change the number from 10 to 14 and 20. 

Because numerical and categorical features' strength of relationship with the target variable are in different scale (the numerical variables are between 0,1 and the categrical are > 0 only), we prioritize using the second strategy

In [11]:
#1. Top 10 features regardless
def choose_features(choice = 2, n_num = 10, n_cat = 10, num = num_feat_res, cat = cat_feat_res):
    feats = []
    to_transform = []
    if choice == 1:
        best_cols = (num + cat)
        best_cols.sort(key = lambda x: x[2])
        for i in range(n_num + n_cat):
            feats.append(best_cols[i][0])
            feat_type = best_cols[i][1]
            if feat_type == 'c':
                to_transform.append(best_cols[i][0])
    else:
        for i in range(n_num):
            feats.append(num[i][0])
        for i in range(n_cat):
            feats.append(cat[i][0])
            to_transform.append(cat[i][0])
    return feats, to_transform

In [12]:
feats, to_transform = choose_features(choice  = 2, n_num = 15, n_cat = 15)
X_train = df[feats].copy()
X_test = test[feats].copy()

In [13]:
encoder = OneHotEncoder(handle_unknown = 'ignore')
encoder.fit(X_train[to_transform])

transformed = encoder.transform(X_train[to_transform]).toarray()
feature_names = encoder.get_feature_names_out(to_transform)

X_train = pd.concat([X_train.drop(to_transform, axis = 1),
              pd.DataFrame(transformed, columns = feature_names)], axis = 1)
X_train.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM
0,2.079442,7.444833,1.098612,6.308098,6.753438,6.753438,1.098612,2.197225,7.6029,7.6029,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.94591,7.141245,1.098612,6.133398,7.141245,7.141245,1.098612,1.94591,7.589336,7.589336,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2.079442,7.488294,1.098612,6.411818,6.82546,6.82546,1.098612,1.94591,7.601902,7.602401,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2.079442,7.448916,1.386294,6.466145,6.629363,6.869014,0.693147,2.079442,7.557995,7.586296,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.197225,7.695758,1.386294,6.729824,7.044033,7.044033,1.098612,2.302585,7.601402,7.601402,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


## 3.1. Testing models

## 3.2. Tuning hyperparameters for models

In [14]:
#for random forest regressor

param_grids = {'n_estimators' : [100, 200], 'criterion':['squared_error', 'absolute_error', 'friedman_mse']}
gscv_rfr = GridSearchCV(RandomForestRegressor(), param_grids, scoring = 'neg_mean_absolute_percentage_error')
gscv_rfr.fit(X_train, y)
print(mean_absolute_percentage_error(gscv_rfr.predict(X_train),y))

0.0030138958544748485


In [15]:
res = np.exp(gscv_rfr.predict(X_train)+1)
print(res[:5])

[559929.48435564 478593.61392927 594386.04009292 431050.50012068
 718722.90457271]


In [16]:
print(mean_absolute_percentage_error(np.exp(y)-1, df['SalePrice']))

4.563485658700936e-16


# 4. Predict the Test

In [17]:
transformed = encoder.transform(X_test[to_transform]).toarray()

X_test = pd.concat([X_test.drop(to_transform, axis = 1),
              pd.DataFrame(transformed, columns = feature_names)], axis = 1)
X_test.head()

Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,TotalBsmtSF,1stFlrSF,FullBath,TotRmsAbvGrd,YearBuilt,YearRemodAdd,...,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM
0,1.791759,6.799056,0.693147,6.594413,6.783325,6.799056,0.693147,1.791759,7.58172,7.58172,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.94591,7.192934,0.693147,5.746203,7.192934,7.192934,0.693147,1.94591,7.580189,7.580189,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.791759,7.396335,1.098612,6.180017,6.834109,6.834109,1.098612,1.94591,7.599902,7.600402,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.94591,7.380879,1.098612,6.154858,6.831954,6.831954,1.098612,2.079442,7.600402,7.600402,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2.197225,7.155396,1.098612,6.228511,7.155396,7.155396,1.098612,1.791759,7.597396,7.597396,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [18]:
submission = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv')['Id']
y_hat = np.exp(gscv_rfr.predict(X_test)) - 1

In [19]:
sub = pd.concat([submission, 
                 pd.DataFrame(y_hat, columns = ['SalePrice'])], axis = 1)
sub.to_csv('submission.csv', index = False)
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,119589.918365
1,1462,151165.733983
2,1463,175997.822961
3,1464,183569.139738
4,1465,199455.762447
