In [55]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor



## Feature Engineering

In [56]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

In [57]:
train_df = train_df.drop(train_df.columns[0], axis=1)
test_df = test_df.drop(test_df.columns[0], axis=1)


In [58]:
test_df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
4,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
1455,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
1456,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
1457,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [60]:
missing_value_features = [feature for feature in train_df if train_df[feature].isna().sum() > 0]
for feature in missing_value_features :
    # print(f"{feature} have : {train_df[feature].isna().sum()}")

    percentage_missing = (train_df[feature].isna().sum()/len(train_df) * 100)
    print(f"{feature}  have {percentage_missing} % missing")
    if percentage_missing > 80 :
        train_df = train_df.drop(columns=[feature], axis=1)
        test_df = test_df.drop(columns=[feature], axis=1)
missing_value_features = [feature for feature in train_df if train_df[feature].isna().sum() > 0]

LotFrontage  have 17.73972602739726 % missing
MasVnrType  have 59.726027397260275 % missing
MasVnrArea  have 0.547945205479452 % missing
BsmtQual  have 2.5342465753424657 % missing
BsmtCond  have 2.5342465753424657 % missing
BsmtExposure  have 2.6027397260273974 % missing
BsmtFinType1  have 2.5342465753424657 % missing
BsmtFinType2  have 2.6027397260273974 % missing
Electrical  have 0.0684931506849315 % missing
FireplaceQu  have 47.26027397260274 % missing
GarageType  have 5.5479452054794525 % missing
GarageYrBlt  have 5.5479452054794525 % missing
GarageFinish  have 5.5479452054794525 % missing
GarageQual  have 5.5479452054794525 % missing
GarageCond  have 5.5479452054794525 % missing


In [61]:
train_df = train_df.drop(index=train_df[train_df['Electrical'].isna()].index)
test_df = test_df.drop(index=test_df[test_df['Electrical'].isna()].index)

In [62]:
train_df['MasVnrType'].unique()

array(['BrkFace', nan, 'Stone', 'BrkCmn'], dtype=object)

In [63]:
train_df['MasVnrType'] = train_df['MasVnrType'].fillna('No')
test_df['MasVnrType'] = test_df['MasVnrType'].fillna('No')


In [64]:
train_df

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,RL,62.0,7917,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,8,2007,WD,Normal,175000
1456,20,RL,85.0,13175,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2010,WD,Normal,210000
1457,70,RL,66.0,9042,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,2500,5,2010,WD,Normal,266500
1458,20,RL,68.0,9717,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,112,0,0,0,0,4,2010,WD,Normal,142125


In [65]:
train_df['LotFrontage']

0       65.0
1       80.0
2       68.0
3       60.0
4       84.0
        ... 
1455    62.0
1456    85.0
1457    66.0
1458    68.0
1459    75.0
Name: LotFrontage, Length: 1459, dtype: float64

In [66]:
# median = train_df['LotFrontage'].median()
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(0)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(0)


In [67]:
train_df['FireplaceQu'].unique()

array([nan, 'TA', 'Gd', 'Fa', 'Ex', 'Po'], dtype=object)

In [68]:
train_df['FireplaceQu'] = train_df['FireplaceQu'].fillna('None')
test_df['FireplaceQu'] = test_df['FireplaceQu'].fillna('None')

In [69]:
for feature in missing_value_features:
    if train_df[feature].isna().sum() > 0:
        print(f"{feature} have : {train_df[feature].isna().sum()}")

MasVnrArea have : 8
BsmtQual have : 37
BsmtCond have : 37
BsmtExposure have : 38
BsmtFinType1 have : 37
BsmtFinType2 have : 38
GarageType have : 81
GarageYrBlt have : 81
GarageFinish have : 81
GarageQual have : 81
GarageCond have : 81


In [70]:
missing_value_test_feature = [ feature for feature in test_df.isna()]
for feature in missing_value_test_feature:
    if test_df[feature].isna().sum() > 0:
        print(f"{feature} have : {test_df[feature].isna().sum()}")

MSZoning have : 4
Utilities have : 2
Exterior1st have : 1
Exterior2nd have : 1
MasVnrArea have : 15
BsmtQual have : 44
BsmtCond have : 45
BsmtExposure have : 44
BsmtFinType1 have : 42
BsmtFinSF1 have : 1
BsmtFinType2 have : 42
BsmtFinSF2 have : 1
BsmtUnfSF have : 1
TotalBsmtSF have : 1
BsmtFullBath have : 2
BsmtHalfBath have : 2
KitchenQual have : 1
Functional have : 2
GarageType have : 76
GarageYrBlt have : 78
GarageFinish have : 78
GarageCars have : 1
GarageArea have : 1
GarageQual have : 78
GarageCond have : 78
SaleType have : 1


In [71]:
y = train_df['SalePrice']

In [72]:
train_df = train_df.drop(columns=['SalePrice'])


In [73]:

for feature in missing_value_features:
    if train_df[feature].dtypes == 'O':
        train_df[feature] = train_df[feature].fillna('No')  
        train_df[feature] = train_df[feature].fillna('No') 
    else :
        median = train_df[feature].median()
        train_df[feature] = train_df[feature].fillna(median)
        test_df[feature] = test_df[feature].fillna(median)

In [74]:
train_df['NumYearBuild'] = train_df['YrSold'] - train_df['YearBuilt']
train_df['NumYearRemod'] = train_df['YrSold'] - train_df['YearRemodAdd']
train_df['NumYearGarage'] = train_df['YrSold'] - train_df['GarageYrBlt']

test_df['NumYearBuild'] = test_df['YrSold'] - test_df['YearBuilt']
test_df['NumYearRemod'] = test_df['YrSold'] - test_df['YearRemodAdd']
test_df['NumYearGarage'] = test_df['YrSold'] - test_df['GarageYrBlt']


In [75]:
numeric_features = [ feature for feature in train_df if train_df[feature].dtype != 'O']
categorical_feature = [feature for feature in train_df if feature not in numeric_features]
print(len(numeric_features))
print(len(categorical_feature))

year_features = [feature for feature in numeric_features if "Year" in feature or "Yr" in feature]
discrete_numerical_feature = [feature for feature in numeric_features if len((train_df[feature]).unique()) < 25 and feature not in year_features and feature not in ['Id','SalePrice']]
continous_feature = [feature for feature in numeric_features if feature not in discrete_numerical_feature+year_features and feature not in ['Id','SalePrice']]

39
39


In [76]:
def normalizing(df,features):
    for feature in features:
        if 0 in df[feature]:
            pass
        else:
            df[feature] = np.log(df[feature])
    return df

In [77]:
train_df = normalizing(train_df,continous_feature)
test_df = normalizing(test_df,continous_feature)

In [78]:
y = np.log(y)

In [79]:
num_pipeline = Pipeline(steps=[
    ('StandardScaler',StandardScaler())
])

In [81]:
cat_pipeline = Pipeline(steps=[
    ('OneHotEncoder',OneHotEncoder())
])

In [82]:
column_transformer = ColumnTransformer([
    ('numerical transform',num_pipeline, numeric_features),
    ('categorical transform',cat_pipeline, categorical_feature)
])

In [83]:
pipeline = Pipeline(steps=[
    ('preprocessing',column_transformer)
])

In [28]:
pipeline

In [84]:
X = train_df


In [85]:
X_preprocessed = pipeline.fit_transform(X)
test_preprocessed = pipeline.fit_transform(test_df)

In [86]:
# separate dataset into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed,y,test_size=0.2,random_state=25)
X_train.shape, X_test.shape

((1167, 287), (292, 287))

In [87]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [88]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "AdaBoost Regressor": AdaBoostRegressor()
}

In [None]:
params = {
   "XGBRegressor" : {'colsample_bytree': 0.9,
        'gamma': 0.0,
        'learning_rate': 0.1,
        'max_depth': 3,
        'min_child_weight': 2,
        'n_estimators': 300,
        'subsample': 0.8}

}
    

In [92]:
model_list = []
r2_list =[]
rmse_list=[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    if model
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    model_list.append(list(models.keys())[i])
    r2_list.append(model_test_r2)
    rmse_list.append(model_test_rmse)
    

In [93]:
pd.DataFrame(list(zip(model_list, rmse_list, r2_list,)), columns=['Model Name','RMSE', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)


Unnamed: 0,Model Name,RMSE,R2_Score
6,XGBRegressor,0.145176,0.865083
5,Random Forest Regressor,0.148443,0.858942
7,AdaBoost Regressor,0.172971,0.808477
3,K-Neighbors Regressor,0.182311,0.787233
2,Ridge,0.194599,0.757587
0,Linear Regression,0.203548,0.734778
4,Decision Tree,0.204572,0.732102
1,Lasso,0.397356,-0.010734



Model Name	RMSE	R2_Score
5	Random Forest Regressor	0.145030	0.865355
6	XGBRegressor	0.145176	0.865083
7	AdaBoost Regressor	0.174742	0.804534
3	K-Neighbors Regressor	0.182311	0.787233
2	Ridge	0.194599	0.757587
4	Decision Tree	0.198383	0.748066
0	Linear Regression	0.203548	0.734778
1	Lasso	0.397356	-0.010734

In [36]:
XGB= XGBRegressor()

In [37]:
params_XGB = {
    'learning_rate' : [0.1, 0.2, 0.05],
    'n_estimators': [300],
    'max_depth' : [3],
    'min_child_weight' : [1,2,3],
    'gamma' : [0.0, 0.1, 0.2],
    'subsample' : [0.8, 0.9, 1.0],
    'colsample_bytree' : [0.8, 0.9, 1.0]

}

In [38]:
xgb_cv = GridSearchCV(XGB,params_XGB)

In [39]:
xgb_cv.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
xgb_cv.best_params_

{'colsample_bytree': 0.9,
 'gamma': 0.0,
 'learning_rate': 0.1,
 'max_depth': 3,
 'min_child_weight': 2,
 'n_estimators': 300,
 'subsample': 0.8}