## Author : Ibrahim Sobh
### Kaggle House Prices - Advanced Regression Techniques
- Predict sales prices and practice feature engineering, RFs, and gradient boosting

# Model building

# I - First Section :  Model Training

## 1 - Importing Libraries & Loading Data

In [46]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd


# Sklearn 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error,r2_score
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

# jobLib 
from joblib import dump,load


# Load Data
# Read the Kaggle training CSV; the path is relative to the notebook's working directory.

data_master=pd.read_csv('../data/house-prices-advanced-regression-techniques/train.csv')
# Work on a copy so the frame as read from disk stays available unchanged.
data = data_master.copy()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## 2 - Split Data into Train | Test | Validation

In [33]:
def data_split_test_train_validation(
    data: pd.DataFrame,
    test_size: float = 0.2,
    validation_size: float = 0.2,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, pd.Series]:
    """Split ``data`` into train, test and validation features and targets.

    The ``SalePrice`` column is used as the target; every other column is a
    feature. The test set is split off first, then the validation set is split
    off the remaining training rows, so ``validation_size`` is a fraction of
    what is left after the test split, not of the full data set.

    Args:
        data: Frame containing the features and a ``SalePrice`` column.
        test_size: Fraction of ``data`` held out as the test set.
        validation_size: Fraction of the post-test remainder held out as the
            validation set.
        random_state: Seed passed to both ``train_test_split`` calls so the
            splits are reproducible.

    Returns:
        ``(X_train, X_test, X_validation, y_train, y_test, y_validation)``.
    """
    X = data.loc[:, data.columns != 'SalePrice']
    y = data['SalePrice']

    # First split: train vs. test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        train_size=1 - test_size,
        random_state=random_state,
    )
    # Second split: train vs. validation, taken from the training part only.
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_train, y_train,
        train_size=1 - validation_size,
        random_state=random_state,
    )
    return X_train, X_test, X_validation, y_train, y_test, y_validation


## 3 - Preprocessing


## 3.1 - Preprocessing: Check for columns with missing data and drop unwanted ones

In [47]:
def drop_unwanted_columns(data: pd.DataFrame,
                          to_remove_columns: "list[str] | None" = None) -> pd.DataFrame:
    """Return a new frame without the columns listed in ``to_remove_columns``.

    Args:
        data: Input frame; it is not modified.
        to_remove_columns: Column labels to drop. ``None`` (the default) drops
            nothing.

    Returns:
        The frame produced by ``DataFrame.drop(..., axis=1)``.

    Raises:
        KeyError: If a listed column is not present in ``data`` (``drop`` is
            called with its default ``errors='raise'``).
    """
    # ``None`` instead of a mutable ``[]`` default, which would be shared across calls.
    columns = [] if to_remove_columns is None else to_remove_columns
    return data.drop(columns, axis=1)

## 3.2 - Preprocessing:  Encode Categorical Features 

In [35]:
def encode_categorical_features(encoder, data: pd.DataFrame, is_test: bool = False) -> pd.DataFrame:
    """Encode the object-dtype columns of ``data`` with ``encoder``.

    Args:
        encoder: Object exposing ``fit`` and ``transform`` (an sklearn-style
            encoder). It is fitted here only when ``is_test`` is false.
        data: Input frame. It is left untouched; the encoded values are
            written to a copy.
        is_test: When false, ``encoder`` is fitted on the object-dtype columns
            and dumped to ``../models/encoder.joblib``. When true, the encoder
            is assumed to be fitted already and is only applied.

    Returns:
        A copy of ``data`` whose object-dtype columns are replaced by the
        output of ``encoder.transform``.
    """
    categorical_columns = data.select_dtypes(include=['object']).columns
    if not is_test:
        filename = "../models/encoder.joblib"
        encoder.fit(data[categorical_columns])
        dump(encoder, filename)
    # Write into a copy: assigning into ``data`` itself would mutate the
    # caller's frame (and trigger SettingWithCopyWarning when it is a slice).
    encoded = data.copy()
    encoded[categorical_columns] = encoder.transform(data[categorical_columns])
    return encoded

## 3.3 - Preprocessing: Fill null / NA values in features

In [36]:
def fill_features_nulls(data: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``data`` with missing values filled.

    Numeric columns are filled with their own column mean (computed on the
    frame passed in). Object-dtype columns are filled from the nearest
    non-missing neighbour: forward fill first, then backward fill for any
    leading gaps.

    Linear interpolation has no meaning for non-numeric labels, and recent
    pandas versions deprecate ``interpolate`` on object dtype, so an explicit
    forward/backward fill is used instead.

    Args:
        data: Input frame; it is not modified.

    Returns:
        A new frame with the same columns and index as ``data``.
    """
    filled = data.copy()
    numeric_columns = filled.select_dtypes(include="number").columns
    categorical_columns = filled.select_dtypes(include=['object']).columns

    if len(numeric_columns) > 0:
        column_means = filled[numeric_columns].mean()
        filled[numeric_columns] = filled[numeric_columns].fillna(column_means)

    # Assign the result back instead of using ``inplace=True`` on a column
    # selection, which is a chained assignment and may act on a temporary.
    for feature in categorical_columns:
        filled[feature] = filled[feature].ffill().bfill()

    return filled

## 3.4 - Preprocessing:   Scale Data ( Standard Scaler )

In [37]:
def scale_data(scalar, data: pd.DataFrame, is_test: bool = False) -> pd.DataFrame:
    """Scale every column of ``data`` with ``scalar``.

    Args:
        scalar: Object exposing ``fit`` and ``transform`` (an sklearn-style
            scaler). It is fitted here only when ``is_test`` is false.
        data: Frame to scale; all columns are passed to the scaler.
        is_test: When false, ``scalar`` is fitted on ``data`` and dumped to
            ``../models/scalar.joblib``. When true, the scaler is assumed to
            be fitted already and is only applied.

    Returns:
        A new frame holding the transformed values, with the same column
        labels and the same index as ``data``.
    """
    if not is_test:
        scalar.fit(data)
        filename = "../models/scalar.joblib"
        dump(scalar, filename)
    # Keep the original index so the scaled rows still line up with the
    # target Series (which keeps its index after the train/test split).
    return pd.DataFrame(scalar.transform(data), columns=data.columns, index=data.index)

## All data Preprocessing 

In [48]:
def data_preprocessing(data: pd.DataFrame, encoder, scalar, is_test: bool = False) -> pd.DataFrame:
    """Run the full preprocessing pipeline on a feature frame.

    Steps, in order: drop the unwanted columns, keep only the selected
    features, encode the object-dtype columns, fill missing values, and scale.

    Args:
        data: Raw feature frame. It must contain every column named in the
            unwanted-column list and in the selected-feature list below.
        encoder: Encoder handed to ``encode_categorical_features``.
        scalar: Scaler handed to ``scale_data``.
        is_test: Forwarded to the encoding and scaling steps; when false the
            encoder and scaler are fitted (and saved) on ``data``, when true
            they are only applied.

    Returns:
        The preprocessed frame, restricted to the selected features.

    Raises:
        KeyError: If an unwanted or selected column is missing from ``data``.
    """
    # Carefully selected features (after analysis).
    list_of_features = ["OverallQual", "GrLivArea", "GarageCars", "TotalBsmtSF", "1stFlrSF",
                        "FullBath", "YearBuilt", "YearRemodAdd", "BsmtFinSF1", "Foundation",
                        "LotFrontage", "WoodDeckSF", "MasVnrArea", "Fireplaces",
                        "ExterQual", "BsmtQual", "KitchenQual", "GarageFinish",
                        "GarageType", "HeatingQC"]

    # Columns to remove.
    unwanted_columns = ["PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu"]

    data = drop_unwanted_columns(data, to_remove_columns=unwanted_columns)

    # Explicit copy: the later steps assign into columns, and doing that on a
    # bare column selection is a write to a slice (SettingWithCopyWarning).
    data = data[list_of_features].copy()

    data = encode_categorical_features(encoder, data, is_test)

    data = fill_features_nulls(data)

    data = scale_data(scalar, data, is_test)

    return data

In [39]:
# For Metrics and Exporting Data

def compute_rmsle(y_true: np.ndarray, y_pred: np.ndarray, precision: int = 3) -> float:
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    The value is the square root of ``mean_squared_log_error(y_true, y_pred)``,
    rounded to ``precision`` decimal places.
    """
    mean_squared_log = mean_squared_log_error(y_true, y_pred)
    root = np.sqrt(mean_squared_log)
    return round(root, precision)

def evaluate_performance(y_pred: np.ndarray, y_true: np.ndarray, precision: int = 2,
                         comment: str = "") -> dict[str, float]:
    """Score predictions against the true values and return a one-entry dict.

    Args:
        y_pred: Predicted values (array, Series or list); flattened to 1-D.
        y_true: True values (array, Series or list); flattened to 1-D.
        precision: Number of decimals the score is rounded to.
        comment: Prefix for the dictionary key, e.g. ``"Validation"``.

    Returns:
        ``{comment + "_rmse": score}``. The score is computed by
        ``compute_rmsle`` (a root mean squared *log* error); the ``_rmse``
        key suffix is kept as-is because callers read that key.
    """
    # Go through np.asarray so Series and lists work too; calling ``.ravel()``
    # directly on a pandas Series is deprecated since pandas 2.2.
    # Predictions are made non-negative with abs() because the log-based
    # metric rejects negative values.
    y_pred = np.abs(np.asarray(y_pred).ravel())
    y_true = np.asarray(y_true).ravel()

    score = compute_rmsle(y_true, y_pred, precision)
    return {comment + "_rmse": score}

# Model building & Evaluation

In [40]:
def build_model(data: pd.DataFrame) -> dict[str, str]:
    """Train a linear regression on ``data`` and score it on held-out splits.

    The data is split into train / test / validation parts. The encoder and
    scaler are fitted on the training part during preprocessing, a
    ``LinearRegression`` is fitted and dumped to ``../models/model.joblib``,
    and the model is then scored on the validation and test parts.

    Returns:
        A dictionary with one entry per evaluated split, keyed
        ``"Validation_rmse"`` and ``"Test_rmse"``; the values are the rounded
        scores produced by ``evaluate_performance``.
    """
    (features_fit, features_test, features_val,
     target_fit, target_test, target_val) = data_split_test_train_validation(data)

    # Fresh encoder and scaler; both are fitted while preprocessing the training split.
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=np.nan)
    scaler = StandardScaler()

    # Training split: fits (and saves) the encoder and scaler.
    features_fit = data_preprocessing(features_fit, encoder=encoder, scalar=scaler, is_test=False)

    # Validation split: only applies the already-fitted encoder and scaler.
    features_val = data_preprocessing(features_val, encoder=encoder, scalar=scaler, is_test=True)

    # Fit the regression model and persist it.
    regressor = LinearRegression()
    regressor.fit(features_fit, target_fit)
    dump(regressor, "../models/model.joblib")

    scores = {}

    # Score on the validation split.
    validation_predictions = regressor.predict(features_val)
    scores.update(
        evaluate_performance(y_pred=validation_predictions, y_true=target_val,
                             precision=3, comment="Validation")
    )

    # Score on the test split, preprocessed with the fitted encoder and scaler.
    features_test = data_preprocessing(features_test, encoder=encoder, scalar=scaler, is_test=True)
    test_predictions = regressor.predict(features_test)
    scores.update(
        evaluate_performance(y_pred=test_predictions, y_true=target_test,
                             precision=3, comment="Test")
    )

    return scores

In [49]:
# Build Model 
# Runs the whole training pipeline on `data`. As a side effect the fitted
# encoder, scaler and model are written under ../models/.
evaluations= build_model(data)
print(evaluations)

{'Validation_rmse': 0.163, 'Test_rmse': 0.186}


# Model inference

## Reading File

In [50]:
# Load Data
# Read the Kaggle test CSV; the path is relative to the notebook's working directory.
test_master=pd.read_csv('../data/house-prices-advanced-regression-techniques/test.csv')
# Copy of the raw test frame.
# NOTE(review): `test_data` is not used by the inference call further down — confirm whether it is still needed.
test_data = test_master.copy()

In [51]:
def make_predictions(input_data: pd.DataFrame) -> np.ndarray:
    """Predict with the saved pipeline for the rows of ``input_data``.

    Loads the encoder, scaler and model previously dumped under
    ``../models/``, preprocesses ``input_data`` in test mode (nothing is
    re-fitted), and returns the model's predictions.
    """
    # Restore the saved artifacts: encoder, then scaler, then model.
    encoder = load("../models/encoder.joblib")
    scaler = load("../models/scalar.joblib")
    regressor = load("../models/model.joblib")

    # Apply the same preprocessing as in training, without fitting anything.
    features = data_preprocessing(input_data, encoder=encoder, scalar=scaler, is_test=True)

    # Inference on the preprocessed features.
    return regressor.predict(features)

In [52]:
# Predict for the rows of the test CSV using the artifacts saved under ../models/.
# NOTE(review): `test_master` is passed here rather than the `test_data` copy — confirm this is intended.
predicitons =make_predictions(test_master)
print(predicitons)

[101484.43003764 157864.03395506 180538.79709244 ... 160823.63999707
 103964.05876926 236273.53828862]
