In [122]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor



## Feature Engineering

In [159]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [161]:
train_df = train_df.drop(df.columns[0], axis=1)
test_df = test_df.drop(df.columns[0], axis=1)


In [162]:
missing_value_features = [feature for feature in train_df if train_df[feature].isna().sum() > 0]
for feature in missing_value_features :
    print(f"{feature} have : {train_df[feature].isna().sum()}")

missing_value_features_test = [feature for feature in test_df if test_df[feature].isna().sum() > 0]
for feature in missing_value_features :
    print(f"{feature} have : {test_df[feature].isna().sum()}")

LotFrontage have : 259
Alley have : 1369
MasVnrType have : 872
MasVnrArea have : 8
BsmtQual have : 37
BsmtCond have : 37
BsmtExposure have : 38
BsmtFinType1 have : 37
BsmtFinType2 have : 38
Electrical have : 1
FireplaceQu have : 690
GarageType have : 81
GarageYrBlt have : 81
GarageFinish have : 81
GarageQual have : 81
GarageCond have : 81
PoolQC have : 1453
Fence have : 1179
MiscFeature have : 1406
LotFrontage have : 227
Alley have : 1352
MasVnrType have : 894
MasVnrArea have : 15
BsmtQual have : 44
BsmtCond have : 45
BsmtExposure have : 44
BsmtFinType1 have : 42
BsmtFinType2 have : 42
Electrical have : 0
FireplaceQu have : 730
GarageType have : 76
GarageYrBlt have : 78
GarageFinish have : 78
GarageQual have : 78
GarageCond have : 78
PoolQC have : 1456
Fence have : 1169
MiscFeature have : 1408


In [127]:
train_df = train_df.drop(index=train_df[train_df['Electrical'].isna()].index)
test_df = test_df.drop(index=test_df[test_df['Electrical'].isna()].index)

In [128]:
train_df = train_df.drop(columns=['MiscFeature','PoolQC','Alley','Fence'])

In [129]:
df['MasVnrType'].unique()

array(['BrkFace', nan, 'Stone', 'BrkCmn'], dtype=object)

In [130]:
df['MasVnrType'].fillna('None',inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MasVnrType'].fillna('None',inplace=True)


In [131]:
median = df['LotFrontage'].median()
df['LotFrontage'] = df['LotFrontage'].fillna(median)

In [132]:
df['FireplaceQu'].unique()

array([nan, 'TA', 'Gd', 'Fa', 'Ex', 'Po'], dtype=object)

In [133]:
df['FireplaceQu'].fillna('None', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['FireplaceQu'].fillna('None', inplace=True)


In [134]:
df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].median())

In [135]:
df['BsmtQual']= df['BsmtQual'].fillna('No')
df['BsmtCond'] = df['BsmtCond'].fillna('No')
df['BsmtExposure']= df['BsmtExposure'].fillna('No')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna('No')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('No')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('No')
df['GarageType'] = df['GarageType'].fillna('No')
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
df['GarageFinish'] = df['GarageFinish'].fillna('No')
df['GarageQual'] = df['GarageQual'].fillna('No')
df['GarageCond'] = df['GarageCond'].fillna('No')


In [136]:
df.isna().sum()


MSSubClass       0
MSZoning         0
LotFrontage      0
LotArea          0
Street           0
                ..
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
SalePrice        0
Length: 76, dtype: int64

In [137]:
df['NumYearBuild'] = df['YrSold'] - df['YearBuilt']
df['NumYearRemod'] = df['YrSold'] - df['YearRemodAdd']
df['NumYearGarage'] = df['YrSold'] - df['GarageYrBlt']

In [138]:
y = df['SalePrice']
df = df.drop(columns=['SalePrice'],axis=1)
X = df

In [139]:
numeric_features = [ feature for feature in df if df[feature].dtype != 'O']
categorical_feature = [feature for feature in df if feature not in numeric_features]
print(len(numeric_features))
print(len(categorical_feature))

year_features = [feature for feature in numeric_features if "Year" in feature or "Yr" in feature]
discrete_numerical_feature = [feature for feature in numeric_features if len((df[feature]).unique()) < 25 and feature not in year_features and feature not in ['Id','SalePrice']]
continous_feature = [feature for feature in numeric_features if feature not in discrete_numerical_feature+year_features and feature not in ['Id','SalePrice']]

39
39


In [140]:
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

In [141]:
for feature in numeric_features:
    df = remove_outliers(df,feature)
    

In [142]:
continous_feature

['LotFrontage',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch']

In [144]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_feature),
         ("StandardScaler", numeric_transformer, numeric_features),        
    ]
)

In [145]:
X

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,NumYearBuild,NumYearRemod,NumYearGarage
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,2,2008,WD,Normal,5,5,5.0
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,5,2007,WD,Normal,31,31,31.0
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,9,2008,WD,Normal,7,6,7.0
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,0,0,0,2,2006,WD,Abnorml,91,36,8.0
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,12,2008,WD,Normal,8,8,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,8,2007,WD,Normal,8,7,8.0
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,2,2010,WD,Normal,32,22,32.0
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,2500,5,2010,WD,Normal,69,4,69.0
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,4,2010,WD,Normal,60,14,60.0


In [146]:
X = preprocessor.fit_transform(X)

In [147]:
# separate dataset into train and test
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
# X_train.shape, X_test.shape

((1167, 287), (292, 287))

In [148]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [150]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [155]:
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 21025.2026
- Mean Absolute Error: 13438.6204
- R2 Score: 0.9324
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 25656.9080
- Mean Absolute Error: 17377.1763
- R2 Score: 0.8781




  model = cd_fast.sparse_enet_coordinate_descent(


Lasso
Model performance for Training set
- Root Mean Squared Error: 21028.3177
- Mean Absolute Error: 13448.6430
- R2 Score: 0.9324
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 24086.3232
- Mean Absolute Error: 16977.7505
- R2 Score: 0.8926


Ridge
Model performance for Training set
- Root Mean Squared Error: 23190.3929
- Mean Absolute Error: 14618.4509
- R2 Score: 0.9177
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 24075.6370
- Mean Absolute Error: 17576.1032
- R2 Score: 0.8927


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 33137.9677
- Mean Absolute Error: 18451.7018
- R2 Score: 0.8321
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 32421.1969
- Mean Absolute Error: 22141.5240
- R2 Score: 0.8054


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.000

In [157]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,R2_Score
2,Ridge,0.892692
1,Lasso,0.892596
5,Random Forest Regressor,0.88243
0,Linear Regression,0.878133
6,XGBRegressor,0.87193
3,K-Neighbors Regressor,0.805403
7,AdaBoost Regressor,0.805251
4,Decision Tree,0.671095
