## Author : Ibrahim Sobh
### Kaggle House Prices - Advanced Regression Technique.
- Predict sales prices and practice feature engineering, RFs, and gradient boosting

# Model building

# I - First Section :  Model Training

## 1 - Importing Libraries & Loading Data

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd


# Sklearn 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error,r2_score
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

# jobLib 
from joblib import dump,load


# Load Data

data_master=pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
data = data_master.copy()

# Carefully Selected Features ( after analysis)
list_of_features =["OverallQual","GrLivArea","GarageCars","TotalBsmtSF","1stFlrSF",
                  "FullBath","YearBuilt","YearRemodAdd","BsmtFinSF1","Foundation",
                  "LotFrontage","WoodDeckSF","MasVnrArea","Fireplaces",
                  "ExterQual","BsmtQual","KitchenQual","GarageFinish","GarageType","HeatingQC"]

# unwanted Columns 
unwanted_columns = ["PoolQC", "MiscFeature","Alley","Fence","FireplaceQu"]

## 2 - Split Data into Train | Test | Validation

In [2]:

# from typing import Union

def data_split_test_train_validation(data: pd.DataFrame, test_size:int =0.2,
                                     validation_size:int =0.2) -> pd.DataFrame:
    # Split Train / Test
    X = data.loc[:, data.columns != 'SalePrice']
    y = data.SalePrice
    
    #First Split L between Train and Test 
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        train_size= 1 - test_size,
                                                        random_state = 42)
    #Second Split :between Train and Validation 
    X_train, X_validation,y_train, y_validation = train_test_split(X_train, y_train,
                                                                   train_size= 1 - validation_size,
                                                                   random_state = 42)

    # return all splitted data sets ( 6 sets )
    return X_train, X_test,X_validation,y_train, y_test,y_validation
  
X_train, X_test,X_validation,y_train, y_test,y_validation=data_split_test_train_validation(data)

## 3 - Preprocessing


## 3.1 - Preprocessing:  Check up for columns with missing Data

In [3]:
def drop_unwanted_columns(data: pd.DataFrame, columns_list: list=[]) -> pd.DataFrame:
    return data.drop(columns_list, axis = 1)

# Drop X_train unwanted columns
X_train=drop_unwanted_columns(X_train,unwanted_columns)

# Drop X_validation unwanted columns
X_validation=drop_unwanted_columns(X_validation,unwanted_columns)


## 3.2 - Preprocessing:  Encode Categorical Features 

In [4]:
ordinal = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=np.nan)

def encode_categorical_features(encoder,data: pd.DataFrame,is_test:bool =False) -> pd.DataFrame:
    data_categorical = data.select_dtypes(include=['object']).columns
    if not is_test :
        filename="../models/encoder.joblib" 
        encoder.fit(data[data_categorical])
        dump(ordinal ,filename)
    data[data_categorical]=encoder.transform(data[data_categorical])
    return data
    
# X_Train Categories Encoding
X_train = encode_categorical_features(ordinal,X_train,False)


# X_validation Categories Encoding
X_validation = encode_categorical_features(ordinal,X_validation,True)

## 3.3 - Preprocessing:   Fill Features with Null /NA values

In [5]:

def fill_features_nulls(data: pd.DataFrame) -> pd.DataFrame:
    
    data_numerical= data.select_dtypes([np.int64,np.float64]).columns
    data_categorical = data.select_dtypes(include=['object']).columns

    data[data_numerical]=data[data_numerical].fillna(data[data_numerical].mean())
    
    for feature in data_categorical:
        data[feature].interpolate(method ='linear', limit_direction ='forward', inplace=True)
        data[feature].interpolate(method ='linear', limit_direction ='backward',inplace=True)
        
    return data

# X_Train Use 'fillna' & interpolate (forward /backward) fill missing values

X_train = fill_features_nulls(X_train)

# X_validation Use 'fillna' & interpolate (forward /backward) fill missing values

X_validation= fill_features_nulls(X_validation)

## 6 -  Scale Data ( Standard Scaler )

In [6]:
scalar =StandardScaler()

def scale_data(scalar,data: pd.DataFrame,is_test:bool =False) -> pd.DataFrame:
    if not is_test:
        scalar.fit(data)
        filename="../models/scalar.joblib" 
        dump(scalar ,filename)
    return pd.DataFrame(scalar.transform(data),columns = data.columns)

# Scale X_train 

X_train= scale_data(scalar,X_train,False)

# Scale X_validation
X_validation=scale_data(scalar,X_validation,True)

In [7]:
def data_preprocessing(data: pd.DataFrame,to_remove_columns,encoder,scalar) -> pd.DataFrame: 
    data= drop_unwanted_columns(data,to_remove_columns)
    
    data= encode_categorical_features(encoder,data,True)
    
    data= fill_features_nulls(data)
    
    data= scale_data(scalar,data,True)
    
    return data

In [8]:
# For Metrics and Exporting Data

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

## 7.0 - Model Training 

## 7.1- Model Training :  Linear Regression [ First Model ]

In [9]:
LR_model = LinearRegression()

X = X_train[list_of_features]
y = y_train

LR_model.fit(X, y)
y_pred=LR_model.predict(X)

y_pred=y_pred.ravel()
y_pred =abs(y_pred)
y= y.ravel()

print("Training Set ")
print("Root-Mean-Squared-Error :",compute_rmsle(y,y_pred,3))
print("R2 :",round(r2_score(y,y_pred),4))
filename="../models/model.joblib" 
dump(LR_model ,filename)
    

Training Set 
Root-Mean-Squared-Error : 0.164
R2 : 0.7931


['../models/model.joblib']

## 7.2- Model Training :  Model Validation on Validation Set  [ First Model ]

In [10]:
X = X_validation[list_of_features]
y = y_validation

y_pred=LR_model.predict(X)

y_pred=y_pred.ravel()
y_pred =abs(y_pred)
y= y.ravel()

print("Validation Set ")
print("Root-Mean-Squared-Error :",compute_rmsle(y,y_pred,3))
print("R2 :",round(r2_score(y,y_pred),4))

Validation Set 
Root-Mean-Squared-Error : 0.163
R2 : 0.8425


# II - Second Section :  Model Evaluation

## 1.0 - Preprocessing

In [11]:
# X_test Categories Encoding
X_test = drop_unwanted_columns(X_test,unwanted_columns)
X_test = encode_categorical_features(ordinal,X_test)
X_test = fill_features_nulls(X_test)

## 2 -  Scale Data ( Standard Scaler )

In [12]:
X_test= scale_data(scalar,X_test,True)

## 3.0 - Model Evaluation

In [13]:
X = X_test[list_of_features]
y = y_test

y_pred=LR_model.predict(X)

y_pred=y_pred.ravel()
y_pred =abs(y_pred)
y= y.ravel()

print("Validation Set ")
print("Root-Mean-Squared-Error :",compute_rmsle(y,y_pred,3))
print("R2 :",round(r2_score(y,y_pred),4))

Validation Set 
Root-Mean-Squared-Error : 0.186
R2 : 0.8231


# Model inference

## Reading File

In [14]:
# Load Data
test_master=pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')
test_data = test_master.copy()

In [15]:
data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Preprocessing 

In [16]:
# load the encoder 
filename="../models/encoder.joblib" 
ordinal_loaded =load(filename)

# load the scalar
filename="../models/scalar.joblib" 
scalar_loaded=load(filename)

# load the model
filename="../models/model.joblib" 

test_data= data_preprocessing(test_data,to_remove_columns=unwanted_columns,
                         encoder=ordinal_loaded,scalar=scalar_loaded)

## Predictions

In [17]:
X = test_data[list_of_features]

#y_pred=LR_model.predict(X)
filename="../models/model.joblib" 

# load the model from disk
loaded_model = load(filename)
y_pred= loaded_model.predict(X)

test_master["SalePrice"] =y_pred

In [18]:
test_master[["Id","SalePrice"]].head(5)

Unnamed: 0,Id,SalePrice
0,1461,103227.75485
1,1462,159607.358767
2,1463,181746.959687
3,1464,196601.597536
4,1465,207056.381022
