## Author : Ibrahim Sobh
### Kaggle House Prices - Advanced Regression Technique.
- Predict sales prices and practice feature engineering, RFs, and gradient boosting

# Model building

# I - First Section :  Model Training

## 1 - Importing Libraries & Loading Data

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Sklearn 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error,r2_score
from sklearn.preprocessing import OrdinalEncoder,StandardScaler


# Load Data

data_master=pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
data = data_master.copy()

list_of_features =["OverallQual","GrLivArea","GarageCars","TotalBsmtSF","1stFlrSF",
                  "FullBath","YearBuilt","YearRemodAdd","BsmtFinSF1","Foundation",
                  "LotFrontage","WoodDeckSF","MasVnrArea","Fireplaces",
                  "ExterQual","BsmtQual","KitchenQual","GarageFinish","GarageType","HeatingQC"]

## 2 - Display Raw Data

In [2]:
data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## 3 - Split Data into Train | Test | Validation

In [3]:
## Differentiate between Features & Target (Sale Price)

X = data.loc[:, data.columns != 'SalePrice']
y = data.SalePrice

# Decide test and validation size

test_size =0.20

validation_size=0.20

# Split Train / Test

X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=1-test_size,random_state = 42)

# Split Train / Validation

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, train_size=1-validation_size, random_state=42)

print('train:', len(X_train), 'test:', len(X_test),'validation:', len(X_val))

train: 934 test: 292 validation: 234


## 4 - Preprocessing


## 4.1 - Preprocessing:  Check up for columns with missing Data

In [4]:
# unwanted Columns 

unwanted_columns = ["PoolQC", "MiscFeature","Alley","Fence","FireplaceQu"]

def drop_unwanted_columns(data, columns_list):
    return data.drop(columns_list, axis = 1)

# Drop X_train unwanted columns
X_train=drop_unwanted_columns(X_train,unwanted_columns)

# Drop X_val unwanted columns
X_val=drop_unwanted_columns(X_val,unwanted_columns)

## 4.2 - Preprocessing:  Encode Categorical Features 

In [5]:
ordinal = OrdinalEncoder(handle_unknown="use_encoded_value",unknown_value=np.nan)

def encode_categorical_features(encoder, data,is_test=False):
    data_categorical = data.select_dtypes(include=['object']).columns
    if not is_test :
        encoder.fit(data[data_categorical])
    data[data_categorical]=encoder.transform(data[data_categorical])
    return data
    
# X_Train Categories Encoding
X_train = encode_categorical_features(ordinal,X_train,False)

# X_val Categories Encoding
X_val = encode_categorical_features(ordinal,X_val,True)

## 4.3 - Preprocessing:   Fill Features with Null /NA values

In [6]:
def fill_features_nulls(data):
    data_numerical= data.select_dtypes([np.int64,np.float64]).columns
    data_categorical = data.select_dtypes(include=['object']).columns
    data[data_numerical]=data[data_numerical].fillna(data[data_numerical].mean())
    for feature in data_categorical:
        data[feature].interpolate(method ='linear', limit_direction ='forward', inplace=True)
        data[feature].interpolate(method ='linear', limit_direction ='backward',inplace=True)
    return data

# X_Train Use 'fillna' & interpolate (forward /backward) fill missing values

X_train = fill_features_nulls(X_train)

# X_val Use 'fillna' & interpolate (forward /backward) fill missing values

X_val= fill_features_nulls(X_val)


## 5.0 -  Features Selection

In [7]:
X = X_train[list_of_features]
y = y_train

## 6 -  Scale Data ( Standard Scaler )

In [8]:
scalar =StandardScaler()

def scale_data(scalar,data):
    return pd.DataFrame(scalar.fit_transform(data),columns = data.columns)

# Scale X_train 

X_train= scale_data(scalar,X_train)

# Scale X_val
X_val=scale_data(scalar,X_val)

In [9]:
# For Metrics and Exporting Data

def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)

def export_Submission_file(predicition_dataframe,output_file_name):
    Output = pd.DataFrame()
    Output['Id'] = predicition_dataframe['Id']
    Output['SalePrice'] = predicition_dataframe['y_pred']
    Output.to_csv("../"+output_file_name+".txt", index=False)

## 7.0 - Model Training 

## 7.1- Model Training :  Linear Regression [ First Model ]

In [10]:
LR_model = LinearRegression()

X = X_train[list_of_features]
y = y_train

LR_model.fit(X, y)

y_pred=LR_model.predict(X)

y_pred=y_pred.ravel()
y_pred =abs(y_pred)
y= y.ravel()

print("Training Set ")
print("Root-Mean-Squared-Error :",compute_rmsle(y,y_pred,3))
print("R2 :",round(r2_score(y,y_pred),4))

Training Set 
Root-Mean-Squared-Error : 0.164
R2 : 0.7931


## 7.2- Model Training :  Model Validation on Validation Set  [ First Model ]

In [11]:
X = X_val[list_of_features]
y = y_val

y_pred=LR_model.predict(X)

y_pred=y_pred.ravel()
y_pred =abs(y_pred)
y= y.ravel()

print("Validation Set ")
print("Root-Mean-Squared-Error :",compute_rmsle(y,y_pred,3))
print("R2 :",round(r2_score(y,y_pred),4))

Validation Set 
Root-Mean-Squared-Error : 0.164
R2 : 0.8467


# II - Second Section :  Model Evaluation

## 1.0 - Preprocessing

In [12]:
# X_test Categories Encoding
X_test = drop_unwanted_columns(X_test,unwanted_columns)
X_test = encode_categorical_features(ordinal,X_test)
X_test = fill_features_nulls(X_test)

## 2 -  Scale Data ( Standard Scaler )

In [13]:
scalar =StandardScaler()
X_test= scale_data(scalar,X_test)

## 3.0 - Model Evaluation

## 3.1 - Model Evaluation :  Model Validation on Validation Set  [ First Model ]

In [14]:
X = X_test[list_of_features]
y = y_test

y_pred=LR_model.predict(X)

y_pred=y_pred.ravel()
y_pred =abs(y_pred)
y= y.ravel()

print("Validation Set ")
print("Root-Mean-Squared-Error :",compute_rmsle(y,y_pred,3))
print("R2 :",round(r2_score(y,y_pred),4))

Validation Set 
Root-Mean-Squared-Error : 0.185
R2 : 0.8218


## 3.2 - Model Evaluation : Random Forest Regressor [ Second Model ]

# Model inference

## Reading File

In [15]:
# Load Data
test_master=pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')
test_data = test_master.copy()

In [16]:
data.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Preprocessing 

In [17]:
test_data = drop_unwanted_columns(test_data,unwanted_columns)
test_data = encode_categorical_features(ordinal,test_data,True)
test_data = fill_features_nulls(test_data)

scalar =StandardScaler()
test_data= scale_data(scalar,test_data)

## Predictions

In [18]:
X = test_data[list_of_features]
y_pred=LR_model.predict(X)
test_master["SalePrice"] =y_pred

In [19]:
test_master[["Id","SalePrice"]].head(5)

Unnamed: 0,Id,SalePrice
0,1461,103930.704383
1,1462,161099.915619
2,1463,183829.371603
3,1464,198660.524909
4,1465,206494.032981
