`Steps`

1 .  Explore and Clean the Data

2 . Split the data into training , validation and testing

3 .  Tune hyperparameters with RandomizedSearchCV

4 .  Use the best parameters 

5 . Evaluate on a validation set

6 . Finalize the model selection and evaluate on the test set

# **Explore and Clean Data**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [2]:
df = pd.read_csv("train.csv" , low_memory=False)
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [3]:
df.dtypes.value_counts()

object    9
int64     4
Name: count, dtype: int64

In [4]:
df.shape

(188533, 13)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB


In [6]:
df.isna().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

## **Clean numeric data first**

In [7]:
df_numeric = df.select_dtypes(include='int64')
df_numeric = df_numeric.fillna(df_numeric.median())
df.update(df_numeric)


In [8]:
df.isna().sum()

id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

# **Clean object Column by turning it into category and used .codes to turn it into numbers**

In [9]:
df_object = df.select_dtypes(include='object')


for key in df_object.columns:
    df[key] =  pd.Categorical(df_object[key]).codes + 1
    
    
df.update(df)    

In [10]:
df.isna().sum()

id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64

In [11]:
if df.isna().sum().any() > 0:
    print("There is still null values in the Dataset")
else:
    print("No null Values in the dataset")

No null Values in the dataset


## **Split the data into training , validation and testing**

In [12]:
x = df.drop("price" , axis = 1)
y= df.price

In [13]:
from sklearn.model_selection import train_test_split


x_train , x_test , y_train , y_test = train_test_split(x , y ,
                                                       test_size=0.4 ,
                                                       random_state=42)


x_val , x_test , y_val , y_test = train_test_split(x , y ,
                                                       test_size=0.5 ,
                                                       random_state=42)

In [14]:
len(x_train) , len(x_test) , len(x_val)

(113119, 94267, 94266)

# **Tune hyperparameters**

## GradientBoostingRegressor

In [15]:
#tuning diferent hyparameters
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

np.random.seed(42)

gb = GradientBoostingRegressor()

parameters = {
    "n_estimators" : np.arange(10, 2000, 50) ,
    "max_depth" : [None, 3,20, 3 ],
    "learning_rate" : [0.01, 0.1, 1 , 0.05] ,
    "subsample" : [0.6 , 0.7, 0.8 , 0.9 , 1.0]
}

cv = RandomizedSearchCV(gb , 
                        param_distributions=parameters , 
                        cv = 5,
                        n_iter = 5,
                        verbose = 1)

cv.fit(x_train , y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [34]:
# create a model with the best hyperparamaters

def best_parameters():
    # Get the best parameters from the search
    best_grid = cv.best_estimator_ 
    
    
    #Fitting the model
    best_grid.fit(x_train,y_train)
    
    return best_grid

gb_model = best_parameters()

In [35]:
gb_model

## Bagging (RandomForestRegressor)

In [37]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs= -1 , 
                           random_state=42 )
parameters = {
    "n_estimators" : np.arange(10, 2000, 50),
    "max_depth" : np.arange(3, 20 ,5) ,
    "bootstrap": [True, False], 
    "max_features" : [None, "sqrt", "log2", 1, 1.0,] ,
    'min_samples_split'  : [15 , 20],
    "min_weight_fraction_leaf" : np.arange(0.0, 0.5, 11) ,
    "oob_score" : [True ,False] ,
    "warm_start": [True,False]
}

cv = RandomizedSearchCV(rf ,
                  param_distributions = parameters ,
                  n_iter= 5 ,
                  cv = 5 ,
                  verbose = 1)

cv.fit(x_train , y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: 

In [None]:
# create a model with the best hyperparamaters

def best_parameters():
    # Get the best parameters from the search
    best_grid = cv.best_params_ 
    
    # Instantiate the model and Unpack the best_parameters dictionary into the RandomForestRegressor constructor (** means unpacking the best parameters)
    model = RandomForestRegressor(**best_grid)
    
    #Fitting the model
    model.fit(x_train,y_train)
    
    return model

rf_model = best_parameters()

In [None]:
rf_model

## StackingRegressor

In [38]:
from sklearn.ensemble import StackingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression 

# Define the base models (estimators) for stacking

estimators = [
    ('gb', GradientBoostingRegressor(random_state=42)),  # Use GradientBoostingRegressor for regression
    ('rf', RandomForestRegressor(random_state=42))       # Use RandomForestRegressor for regression
]

sc = StackingRegressor(estimators = estimators , final_estimator=LinearRegression())

In [39]:
parameters = {
    'gb__n_estimators' : np.arange(10, 2000, 50),
    'gb__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'rf__n_estimators' : np.arange(10, 2000, 50),
    'rf__bootstrap': [True, False],
    'passthrough' : [True , False],
}

cv = RandomizedSearchCV(sc , 
                        param_distributions = parameters , 
                        n_iter=5,
                        cv = 5 ,
                        random_state=42 ,
                        verbose = 1)

cv.fit(x_train, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


KeyboardInterrupt: 

In [None]:
# create a model with the best hyperparamaters

def best_parameters():
    
    # Get the best parameters from the search
    best_params = cv.best_params_ 
    
    # Extract base model parameters from best_params
    gb_n_estimators = best_params.get('gb__n_estimators', 100)
    gb_subsample = best_params.get('gb__subsample', 1.0)
    rf_n_estimators = best_params.get('rf__n_estimators', 100)
    rf_bootstrap = best_params.get('rf__bootstrap', True)
    passthrough = best_params.get('passthrough', False)
    
    
     # Reinstantiate StackingRegressor with the best parameters
    # Create and fit the final model with the best parameters
    model = StackingRegressor(
        estimators=[
            ('gb', GradientBoostingRegressor(n_estimators=gb_n_estimators,subsample=gb_subsample, random_state=42)),
            ('rf', RandomForestRegressor(n_estimators=rf_n_estimators,bootstrap=rf_bootstrap, random_state=42))
        ],
        final_estimator=LinearRegression(),
        passthrough=passthrough
    )
    
    
    
    
    #Fitting the model
    model.fit(x_train,y_train)
    
    return model

sk_model = best_parameters()

## XGBoostRegressor : BayesSearchCV

# **Evaluation on the dataset**

In [40]:
# function with every regression metric for evaluatuion and model performance

from sklearn.metrics import mean_absolute_error , mean_squared_error , mean_squared_log_error , r2_score 

# Function that calculates root mean sqaured error
def rmse(y_train , y_preds):
    
    "calculates the root mean squared error "
    
    return np.sqrt(mean_squared_error(y_train, y_preds))


# Function that calculates root mean squared log error
def rmsle(y_train , y_preds):
    "calculates the root mean squared log error"
    
    return np.sqrt(mean_squared_log_error(y_train , y_preds))



# Function for all regression metrics
def all_metrics():
    y_val_preds_gb = gb_model.predict(x_val)
    y_val_preds_rf = rf_model.predict(x_val)
    y_val_preds_sk = sk_model.predict(x_val)

    scores = { 
             "Validation Metrics (GradientBoostingRegressor)" : {     
             "Validation MSE" : float(mean_absolute_error(y_val ,y_val_preds_gb)) , 
             "Validation MSLE" : float(mean_squared_log_error(y_val , y_val_preds_gb)) , 
             "Validation R^2" : round(float(r2_score(y_val , y_val_preds_gb) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds_gb)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds_gb)),
            } ,
        
        
            "Validation Metrics (RandomForestRegressor)" : {     
             "Validation MSE" : float(mean_absolute_error(y_val , y_val_preds_rf)) ,
             "Validation MSLE" : float(mean_squared_log_error(y_val , y_val_preds_rf)) , 
             "Validation R^2" : round(float(r2_score(y_val , y_val_preds_rf) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds_rf)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds_rf)),
            } ,
        
        "Validation Metrics(StackingRegressor)" : {     
             "Validation MSE" : float(mean_absolute_error(y_val , y_val_preds_sk)) ,
             "Validation MSLE" : float(mean_squared_log_error(y_val ,y_val_preds_sk)) , 
             "Validation R^2" : round(float(r2_score(y_val ,y_val_preds_sk) * 100) , 2),
             "Validation RMSE" : float(rmse(y_val , y_val_preds_sk)),
             "Validation RMSLE" : float(rmsle(y_val , y_val_preds_sk)),
            } ,
        
    }
    

    
    print("\nValidation Metrics(GradientBoostingRegressor):")
    for key, value in scores["Validation Metrics (GradientBoostingRegressor)"].items():
        print(f"{key}: {value}")
    
    print("\nValidation Metrics(RandomForestRegressor):")
    for key, value in scores["Validation Metrics (RandomForestRegressor)"].items():
        print(f"{key}: {value}")
        
        
    print("\nValidation Metrics(StackingRegressor):")
    for key, value in scores["Validation Metrics(StackingRegressor)"].items():
        print(f"{key}: {value}")
          
    
    
all_metrics()
   
    

NameError: name 'rf_model' is not defined