`Steps`

1 .  Explore and Clean the Data

2 . Split the data into training , validation and testing

3 . Fit the initial model and evaluate

4 .  Tune hyperparameters

5 . Evaluate on a validation set

6 . Finalize the model selection and evaluate on the test set

In [54]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import joblib
import warnings
# Filter out warnings
warnings.filterwarnings("ignore")



In [55]:
df = pd.read_csv("train.csv")
df.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


# **Data Cleaning**


# Cleaning Numeric Column 

In [56]:
# Ensure 'float64' is in lowercase and fix inplace usage
df_numeric = df.select_dtypes(include=["int64", "float64"])

# Fill missing values in numeric columns with the median
df_numeric = df_numeric.fillna(df_numeric.median())

# If you want to apply the changes back to your original DataFrame:
df.update(df_numeric)


# **Clean object Column by turning it into category and used .codes to turn it into numbers**

In [57]:
df_object = df.select_dtypes(include='object')

# Convert object columns to categorical, then into numeric codes
for key in df_object.columns:
    df[key] = pd.Categorical(df_object[key]).codes+1 # Adding 1 to avoid -1 for 'Unknown'

# Update the original dataframe with the encoded values 
df.update(df)
    

In [58]:
if df.isna().sum().any() > 0:
    print("There is still null values in the Dataset")
else:
    print("No null Values in the dataset")

No null Values in the dataset


# **Split the data into training , validation and testing**

In [59]:
# splitting data into x and y 
x = df.drop("SalePrice" , axis=1)
y = df.SalePrice


In [60]:
#splitting x and y in training and test set
x_train, x_test ,y_train , y_test = train_test_split(x , y, 
                                                      test_size= 0.4,
                                                      random_state = 42)

# splitting test set into test and validation
x_val, x_test ,y_val , y_test = train_test_split(x_test , y_test, 
                                                      test_size= 0.5,
                                                      random_state = 42)

In [61]:
x_train.dtypes.value_counts()

int8       43
int64      34
float64     3
Name: count, dtype: int64

# **Fit the initial model and evaluate**

In [62]:
#Fitting initial
from sklearn.ensemble import RandomForestRegressor

#Instantiating a model
rf_model = RandomForestRegressor(n_jobs = -1, 
                                 random_state= 42)

# Fitting the model
rf_model.fit(x_train , y_train)

In [95]:
# function with every regression metric for evaluatuion and model performance

from sklearn.metrics import mean_absolute_error , mean_squared_error , mean_squared_log_error , r2_score 

# Function that calculates root mean sqaured error
def rmse(y_train , y_preds):
    
    "calculates the root mean squared error "
    
    return np.sqrt(mean_squared_error(y_train, y_preds))


# Function that calculates root mean squared log error
def rmsle(y_train , y_preds):
    "calculates the root mean squared log error"
    
    return np.sqrt(mean_squared_log_error(y_train , y_preds))



# Function for all regression metrics
def all_metrics(model):
    y_train_preds = model.predict(x_train)
    y_val_preds = model.predict(x_val)
    
    scores = { 
             # Training Metrics
        "Training Metrics" : {
             "Trained MSE" : float(mean_absolute_error(y_train , y_train_preds)) ,
             "Trained MSE" : float(mean_squared_error(y_train , y_train_preds)) , 
             "Trained MSLE" : float(mean_squared_log_error(y_train , y_train_preds)) , 
             "Trained R^2" : round(float(r2_score(y_train , y_train_preds) * 100) , 2),
             "Trained RMSE" : float(rmse(y_train , y_train_preds)),
             "Trained RMSLE" : float(rmsle(y_train , y_train_preds)),
             
        } ,
          
              
# The presence of np.float64 in the output is due to the fact that numpy uses this specific type for numerical values. 
# However, when displaying the output, it makes the type explicit, which you don’t necessarily need.
# To remove this and display the results as regular floating-point numbers, 
# you can convert the np.float64 values to Python native floats using the float() function. 

          
             # Validation metrics
        "Validation Metrics" : {     
             "Validation MSE" : float(mean_absolute_error(y_val , y_val_preds)) ,
             "Validation MSE" : float(mean_squared_error(y_val , y_val_preds)) , 
             "Validation MSLE" : float(mean_squared_log_error(y_val , y_val_preds)) , 
             "Validation R^2" : round(float(r2_score(y_val , y_val_preds) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds)),
            }
    }
    
    # Display the metrics clearly
    print("Training Metrics:")
    for key, value in scores["Training Metrics"].items():
        print(f"{key}: {value}")
    
    print("\nValidation Metrics:")
    for key, value in scores["Validation Metrics"].items():
        print(f"{key}: {value}")
    
    
    
    
    

In [96]:
all_metrics(rf_model)

Training Metrics:
Trained MSE: 137234767.6708845
Trained MSLE: 0.003935487436800935
Trained R^2: 97.58
Trained RMSE: 11714.724395856887
Trained RMSLE: 0.06273346345293662

Validation Metrics:
Validation MSE: 1279279168.5928566
Validation MSLE: 0.024737738943255455
Validation R^2: 86.42
Validation RMSE: 35767.01229614876
Validation RMSLE: 0.15728235420178405


# **Tune hyperparameters**

## Choose the estimator / algorithm for the problem

## Boosting : Explore Boosting algorithms in Python

Import GradientBoostingRegressor and AdaBoostRegressor from sklearn and explore the hyperparameters

Import Boosting Algorithm for Regression

In [67]:
from sklearn.ensemble import GradientBoostingRegressor , AdaBoostRegressor

# GradientBoostingRegressor().get_params()

In [68]:
# AdaBoostRegressor().get_params()

In [69]:
from sklearn.model_selection import GridSearchCV , RandomizedSearchCV
np.random.seed(42)

gb = GradientBoostingRegressor()
parameters = {
    "n_estimators" : np.arange(50, 2000, 50) ,
    "max_depth" : np.arange(3,20, 3 ),
    "learning_rate" : [0.01, 0.1, 1 , 10]
}

cv = RandomizedSearchCV(gb , 
                        param_distributions=parameters , 
                        cv = 5 ,
                        n_iter = 1 ,
                        verbose = 1)

cv.fit(x_train , y_train)

In [70]:
# create a model with the best hyperparamaters

def best_parameters():
    # Get the best parameters from the search
    best_grid = cv.best_estimator_ 
    
    # Instantiate the model and Unpack the best_parameters dictionary into the RandomForestRegressor constructor (** means unpacking the best parameters)
    model = GradientBoostingRegressor(**best_grid)
    
    #Fitting the model
    model.fit(x_train,y_train)
    
    return model

gb_model = best_parameters()

In [71]:
gb_model

In [72]:
# save and load the model
from joblib import load , dump

dump(gb_model, filename='gradient.joblib')
gb_model = load("gradient.joblib")
gb_model

## Bagging : Implement a bagging model

In this section , we will fit and evaluate a simple Random Forest Model

In [73]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs= -1 , random_state=42 )
parameters = {
    "n_estimators" : np.arange(50, 2000, 50),
    "max_depth" : np.arange(3, 8 , 9 , 20, None) ,
    "max_features" : [None, "sqrt", "log2", 1, 1.0,] ,
    'min_samples_split'  : [15 , 20],
    "min_weight_fraction_leaf" : np.arange(0.0, 0.5, 11) ,
    "oob_score" : [False] ,
    "warm_start": [True]
}

cv = RandomizedSearchCV(rf ,
                  param_distributions = parameters ,
                  n_iter= 5
                  cv = 5 ,
                  verbose = 1)

cv.fit(x_train , y_train)

In [74]:
# create a model with the best hyperparamaters

def best_parameters():
    # Get the best parameters from the search
    best_grid = cv.best_params_ 
    
    # Instantiate the model and Unpack the best_parameters dictionary into the RandomForestRegressor constructor (** means unpacking the best parameters)
    model = RandomForestRegressor(**best_grid)
    
    #Fitting the model
    model.fit(x_train,y_train)
    
    return model

rf_model = best_parameters()

In [None]:
rf_model

In [75]:
# save and load the model
from joblib import load , dump

dump(rf_model , filename='randomforest.joblib')
rf_model = load("randomforest.joblib")
rf_model

## Stacking : Explore stacking algorithms in Python

Import `StackingClassifier` from `sklearn` and explore the hyperparameters

Import Stacking Algorithm For Regression

In [76]:
from sklearn.ensemble import StackingRegressor, GradientBoostingClassifier, RandomForestClassifier

estimators = [('gb', GradientBoostingClassifier()) , 
              ('rf' , RandomForestClassifier())]

sc = StackingRegressor(estimators = estimators)

In [None]:
parameters = {
    'gb_n_estimators' : [] ,
    'rf_n_estimators' : [],
    'final_estimator' : [],
    'passthrough' : []
}

cv = RandomizedSearchCV(sc , 
                        parameters , 
                        cv = 5)

cv.fit(x_train, y_train)

In [None]:
# create a model with the best hyperparamaters

def best_parameters():
    # Get the best parameters from the search
    best_grid = cv.best_params_ 
    
    # Instantiate the model and Unpack the best_parameters dictionary into the RandomForestRegressor constructor (** means unpacking the best parameters)
    model = RandomForestRegressor(**best_grid)
    
    #Fitting the model
    model.fit(x_train,y_train)
    
    return model

sk_model = best_parameters()

In [None]:
sk_model

In [None]:
# save and load the model
from joblib import load , dump

dump(sk_model , filename='stackingregressor.joblib')
sk_model = load("skackingregressor.joblib")
sk_model

# **Evaluate on the Validation set**



In [99]:
# function with every regression metric for evaluatuion and model performance

from sklearn.metrics import mean_absolute_error , mean_squared_error , mean_squared_log_error , r2_score 

# Function that calculates root mean sqaured error
def rmse(y_train , y_preds):
    
    "calculates the root mean squared error "
    
    return np.sqrt(mean_squared_error(y_train, y_preds))


# Function that calculates root mean squared log error
def rmsle(y_train , y_preds):
    "calculates the root mean squared log error"
    
    return np.sqrt(mean_squared_log_error(y_train , y_preds))



# Function for all regression metrics
def all_metrics(model):
    # y_train_preds = model.predict(x_train)
    y_val_preds_gb = gb_model.predict(x_val)
    y_val_preds_rf = rf_model.predict(x_val)
    y_val_preds_sk = sk_model.predict(x_val)

    
# The presence of np.float64 in the output is due to the fact that numpy uses this specific type for numerical values. 
# However, when displaying the output, it makes the type explicit, which you don’t necessarily need.
# To remove this and display the results as regular floating-point numbers, 
# you can convert the np.float64 values to Python native floats using the float() function.    
    
    scores = { 
             # Validation metrics
             "Validation Metrics (GradientBoostingRegressor)" : {     
             "Validation MSE" : float(mean_absolute_error(y_val ,y_val_preds_gb)) , 
             "Validation MSLE" : float(mean_squared_log_error(y_val , y_val_preds_gb)) , 
             "Validation R^2" : round(float(r2_score(y_val , y_val_preds_gb) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds_gb)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds_gb)),
            } ,
        
        
           # Validation metrics
            "Validation Metrics (RandomForestRegressor)" : {     
             "Validation MSE" : float(mean_absolute_error(y_val , y_val_preds_rf)) ,
             "Validation MSE" : float(mean_squared_error(y_val ,y_val_preds_rf)) , 
             "Validation MSLE" : float(mean_squared_log_error(y_val , y_val_preds_rf)) , 
             "Validation R^2" : round(float(r2_score(y_val , y_val_preds_rf) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds_rf)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds_rf)),
            } ,
        
           # Validation metrics
        "Validation Metrics(StackingRegressor)" : {     
             "Validation MSE" : float(mean_absolute_error(y_val , y_val_preds_sk)) ,
             "Validation MSE" : float(mean_squared_error(y_val , y_val_preds_sk)) , 
             "Validation MSLE" : float(mean_squared_log_error(y_val ,y_val_preds_sk)) , 
             "Validation R^2" : round(float(r2_score(y_val ,y_val_preds_sk) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds_sk)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds_sk)),
            }
    }
    
    # Display the metrics clearly
    # print("Training Metrics:")
    # for key, value in scores["Training Metrics"].items():
    #     print(f"{key}: {value}")
    
    print("\nValidation Metrics(GradientBoostingRegressor):")
    for key, value in scores["Validation Metrics (GradientBoostingRegressor)"].items():
        print(f"{key}: {value}")
    
    print("\nValidation Metrics(RandomForestRegressor):")
    for key, value in scores["Validation Metrics (RandomForestRegressor)"].items():
        print(f"{key}: {value}")
        
        
    print("\nValidation Metrics(StackingRegressor):")
    for key, value in scores["Validation Metrics(StackingRegressor)"].items():
        print(f"{key}: {value}")
    
    
    
    
    
    
    
    
       #      # Training Metrics
        # "Training Metrics" : {
        #      "Trained MSE" : float(mean_absolute_error(y_train , y_train_preds)) ,
        #      "Trained MSE" : float(mean_squared_error(y_train , y_train_preds)) , 
        #      "Trained MSLE" : float(mean_squared_log_error(y_train , y_train_preds)) , 
        #      "Trained R^2" : round(float(r2_score(y_train , y_train_preds) * 100) , 2),
        #      "Trained RMSE" : float(rmse(y_train , y_train_preds)),
        #      "Trained RMSLE" : float(rmsle(y_train , y_train_preds)),
             
        # } ,