`Steps`

1. Explore and clean the data

2. Split the data into training, validation, and test sets

3. Fit the initial model and evaluate

4. Tune hyperparameters

5. Evaluate on the validation set

6. Finalize the model selection and evaluate on the test set

In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import joblib


In [39]:
# Load the raw training data and preview its first two rows.
df = pd.read_csv(filepath_or_buffer="train.csv")
df.head(n=2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


# **Data Cleaning**


In [40]:
# Separate the regression target (y) from the predictor columns (x).
y = df["SalePrice"]
x = df.drop(columns=["SalePrice"])


In [41]:
# Encode the feature columns as one-hot indicator columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# NOTE(review): every column of x is handed to the encoder, including the
# numeric ones, so each distinct value becomes its own indicator column --
# confirm this is intended rather than encoding only the text columns.
categorical_features = list(x.columns)
one_hot = OneHotEncoder(sparse_output=False)
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",
)

x_transformed = transformer.fit_transform(x)
x_transformed

array([[1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [0., 0., 1., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.]])

In [42]:
# Wrap the encoded array in a DataFrame and preview the first five rows.
x_transformed = pd.DataFrame(data=x_transformed)
x_transformed.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9097,9098,9099,9100,9101,9102,9103,9104,9105,9106
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [43]:
# Count the missing values in each encoded column.
x_transformed.isnull().sum(axis=0)

0       0
1       0
2       0
3       0
4       0
       ..
9102    0
9103    0
9104    0
9105    0
9106    0
Length: 9107, dtype: int64

In [44]:
# Step 1: keep 60% of the rows for training and set 40% aside as a hold-out pool.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x_transformed, y, test_size=0.4, random_state=42
)

# Step 2: halve the hold-out pool into a validation set and a test set.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=42
)

In [45]:
# Make sure every feature split is held as a DataFrame.
x_train, x_test, x_val = (
    pd.DataFrame(split) for split in (x_train, x_test, x_val)
)
    

In [46]:
# Preview the first five rows of the validation features.
x_val.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9097,9098,9099,9100,9101,9102,9103,9104,9105,9106
1336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
619,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Choose the estimator / algorithm for the problem

# **Boosting : Explore Boosting algorithms in Python**

**Import GradientBoostingRegressor and AdaBoostRegressor from sklearn and explore the hyperparameters**

## **Import Boosting Algorithm for Regression**

In [47]:
from sklearn.ensemble import GradientBoostingRegressor , AdaBoostRegressor

# GradientBoostingRegressor().get_params()

In [48]:
# AdaBoostRegressor().get_params()

## Hyperparameter Tuning

In [49]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Seed NumPy's global generator before the search is built and run.
np.random.seed(42)

gb = GradientBoostingRegressor()

# Candidate values the randomized search draws from.
parameters = dict(
    n_estimators=np.arange(50, 2000, 50),
    max_depth=np.arange(3, 20, 3),
    learning_rate=[0.01, 0.1, 1, 10],
)

# 100 sampled candidates, each scored with 5-fold cross-validation.
cv = RandomizedSearchCV(
    estimator=gb,
    param_distributions=parameters,
    n_iter=100,
    cv=5,
    verbose=1,
)

cv.fit(x_train, y_train.to_numpy().ravel())

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Create a model with the best hyperparameters

def best_parameters():
    """Fit a GradientBoostingRegressor using the search's best hyperparameters.

    Reads the module-level ``cv`` (the fitted search object), ``x_train``
    and ``y_train``.

    Returns:
        A ``GradientBoostingRegressor`` constructed from ``cv.best_params_``
        and fitted on the training split.
    """
    # ``best_params_`` is the dict of winning settings. ``best_estimator_`` is
    # an estimator object, not a mapping, so it cannot be unpacked with ``**``.
    best_grid = cv.best_params_

    # ``**`` expands the dictionary into keyword arguments of the constructor.
    model = GradientBoostingRegressor(**best_grid)

    # Fit on the training split only.
    model.fit(x_train, y_train.values.ravel())

    return model


gb_model = best_parameters()

In [None]:
# Display the gradient-boosting model returned by best_parameters()
gb_model

In [None]:
# Persist the gradient-boosting model to disk, then read it back.
from joblib import dump, load

dump(gb_model, "gradient.joblib")
gb_model = load(filename="gradient.joblib")
gb_model

# **Bagging : Implement a bagging model**

**In this section, we will fit and evaluate a simple Random Forest model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Candidate values the randomized search draws from.
parameters = {
    "n_estimators": np.arange(50, 2000, 50),
    # A plain list of depths: np.arange takes (start, stop, step) and cannot
    # hold None, which here means "no depth limit".
    "max_depth": [3, 8, 9, 20, None],
    "max_features": [None, "sqrt", "log2", 1, 1.0],
    "min_samples_split": [15, 20],
    # 11 evenly spaced fractions from 0.0 to 0.5 inclusive. np.arange with a
    # step of 11 would yield only [0.0].
    "min_weight_fraction_leaf": np.linspace(0.0, 0.5, 11),
    "oob_score": [False],
    "warm_start": [True],
}

# 100 sampled candidates, each scored with 5-fold cross-validation.
cv = RandomizedSearchCV(
    rf,
    param_distributions=parameters,
    n_iter=100,
    cv=5,
)

cv.fit(x_train, y_train.values.ravel())

In [None]:
# Build a model from the best hyperparameters

def best_parameters():
    """Train a fresh random forest configured with the search's winning settings.

    Uses the module-level ``cv``, ``x_train`` and ``y_train`` and returns the
    fitted ``RandomForestRegressor``.
    """
    # cv.best_params_ is a dict; ** expands it into constructor keyword arguments.
    tuned_forest = RandomForestRegressor(**cv.best_params_)
    tuned_forest.fit(x_train, y_train.values.ravel())
    return tuned_forest

rf_model = best_parameters()

In [None]:
# Persist the random-forest model to disk, then read it back.
from joblib import dump, load

dump(rf_model, "randomforest.joblib")
rf_model = load(filename="randomforest.joblib")
rf_model

# **Stacking : Explore stacking algorithms in Python**

**Import `StackingRegressor` from `sklearn` and explore the hyperparameters**

**Import Stacking Algorithm For Regression**

In [None]:
# The classifier imports are kept only in case other cells rely on them;
# this cell itself needs the regressor variants.
from sklearn.ensemble import (
    GradientBoostingClassifier,
    GradientBoostingRegressor,
    RandomForestClassifier,
    RandomForestRegressor,
    StackingRegressor,
)

# A StackingRegressor combines regressors, so the base estimators must be the
# regressor variants of gradient boosting and random forest.
estimators = [('gb', GradientBoostingRegressor()), ('rf', RandomForestRegressor())]

# List every tunable hyperparameter of the stack, including the nested
# ``gb__*`` and ``rf__*`` parameters of the base estimators.
StackingRegressor(estimators=estimators).get_params()

# **Evaluation with all  Regression Metrics**

**Write a function that computes every metric for evaluation**

In [None]:
from sklearn.metrics import mean_absolute_error , mean_squared_error , mean_squared_log_error , r2_score 

# Function that calculates root mean squared error
# def rmse(y_train , y_preds):
    
#     "calculates the root mean squared error "
    
#     return np.sqrt(mean_squared_error(y_train, y_preds))


#Function that calculates root mean squared log error
# def rmsle(y_train , y_preds):
#     "calculates the root mean squared log error"
    
#     return np.sqrt(mean_squared_log_error(y_train , y_preds))



# Function for all regression metrics
def _split_metrics(prefix, y_true, y_pred):
    """Return the six regression metrics for one split, with keys prefixed by *prefix*."""
    mse = mean_squared_error(y_true, y_pred)
    msle = mean_squared_log_error(y_true, y_pred)
    return {
        f"{prefix} MAE": mean_absolute_error(y_true, y_pred),
        f"{prefix} MSE": mse,
        f"{prefix} MSLE": msle,
        f"{prefix} R^2": r2_score(y_true, y_pred),
        f"{prefix} RMSE": np.sqrt(mse),
        f"{prefix} RMSLE": np.sqrt(msle),
    }


def all_metrics(model):
    """Score a fitted regressor on the training and validation splits.

    Reads the module-level ``x_train``, ``y_train``, ``x_val`` and ``y_val``.

    Args:
        model: A fitted estimator exposing ``predict``.

    Returns:
        dict mapping metric names to values. For each of the "Trained" and
        "Validation" splits it holds MAE, MSE, MSLE, R^2, RMSE and RMSLE.

    Raises:
        ValueError: propagated from ``mean_squared_log_error`` when targets
            or predictions contain negative values (per the scikit-learn
            documentation).
    """
    y_train_preds = model.predict(x_train)
    # Predict from the validation *features*; y_val holds the targets.
    y_val_preds = model.predict(x_val)

    # MAE gets its own key so it is not overwritten by the MSE entry.
    scores = {
        **_split_metrics("Trained", y_train, y_train_preds),
        **_split_metrics("Validation", y_val, y_val_preds),
    }

    return scores
    
    
    