# Selecting the Best Model with Best Hyperparameters

In [2]:
# Import all the necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# import the preprocessing modules 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# import all the models which we have to use in this noteBook 
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# import the cross validation module 
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

# import the metrics to evalute the models 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error

In [3]:
df = sns.load_dataset("tips")
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Task

In [5]:
# now we can select the feature and targets or labels from the dataset
# features
X = df.drop('tip', axis=1)

# Targets 
y = df['tip']

# now, we encode all the categorical variables by using labelencoder
for col in X.columns:
    le = LabelEncoder()
    if X[col].dtype =='category' or X[col].dtype == 'object':
        X[col] = le.fit_transform(X[col])

### Mean Absolute Error

In [6]:
# spliting the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create the list of models which we want to use 
models = [
    ("SVM", SVR()),
    ("XGBoost", XGBRegressor()),
    ("Decision Tree", DecisionTreeRegressor()),
    ("Liner Regression", LinearRegression()),
    ("KNeighbors", KNeighborsRegressor()),
    ("Random Forest", RandomForestRegressor()),
    ("Gradient Boosting", GradientBoostingRegressor())
]
model_score = []
# fitting , predicting and evaluating the each model by using for loop 
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # evaluating the model
    mar = mean_absolute_error(y_test, y_pred)
    model_score.append((name, mar))

short_models = sorted(model_score, key=lambda x: x[1], reverse=False)
for model in short_models:
    print(f"Mean Absolute Error for {model[0]} is {model[1]:.2f}")

Mean Absolute Error for SVM is 0.57
Mean Absolute Error for Liner Regression is 0.67
Mean Absolute Error for XGBoost is 0.67
Mean Absolute Error for KNeighbors is 0.73
Mean Absolute Error for Gradient Boosting is 0.73
Mean Absolute Error for Random Forest is 0.78
Mean Absolute Error for Decision Tree is 0.94


### Root Mean Squared Error

higher (high and low)
lower (high and low)

 


In [7]:
# spliting the data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# creating the list of models which we want to compare
models = {
    "SVM": SVR(),
    "XGBoost": XGBRegressor(), 
    "Decision Tree": DecisionTreeRegressor(),
    "LinearRegression": LinearRegression(),
    "Random Forest": RandomForestRegressor(),
    "KNeighbors Regressor": KNeighborsRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
}

model_score = []

for name, model in models.items():
    # fitting the model to the training data
    model.fit(X_train, y_train)

    # predicting the model
    y_pred = model.predict(X_test)

    # evaluating the model by using Root Mean Squared Error (RMSE)
    rmse = root_mean_squared_error(y_test , y_pred)
    model_score.append((name, rmse))

short_models = sorted(model_score, key=lambda x: x[1], reverse=False)
for model in short_models:
    print(f"Root Mean Squared Error for {model[0]} is {model[1]:.2f}")

Root Mean Squared Error for SVM is 0.73
Root Mean Squared Error for LinearRegression is 0.83
Root Mean Squared Error for XGBoost is 0.86
Root Mean Squared Error for Gradient Boosting is 0.89
Root Mean Squared Error for KNeighbors Regressor is 0.92
Root Mean Squared Error for Random Forest is 0.96
Root Mean Squared Error for Decision Tree is 1.04


In [8]:
models = {
    "SVM": (
        SVR(),      
        {
            'kernel': ['linear', 'poly', 'rbf'],
            # 'C': [1.0, 0.1, 0.01],
            # 'epsilon': [0.1, 0.01, 0.001],
            # 'gamma': ['auto'],
            # 'shrinking': [True, False],
            # 'cache_size': [50, 100, 200],
            # 'verbose': [True, False]
        }
    ),
    "Random Forest": (
        RandomForestRegressor(),
        {
            'n_estimators': [10, 100, 1000],
            # 'max_depth': [None, 5, 10],
            # 'min_samples_split': [2, 5, 10]
        }
    ),
    "KNN": (
        KNeighborsRegressor(),
        {
            'n_neighbors': [5, 10, 15],
            # 'algorithm': ['kd_tree', 'ball_tree'],
            # 'weights': ['uniform', 'distance'],
        }
    ),
    "Decision Tree": (
        DecisionTreeRegressor(), 
        {
            'max_depth': [None, 5, 10],
            # 'min_samples_split': [2, 5, 10],
            # 'max_features': ['sqrt', 'log2', None]
        }
    ),
    "Linear Regression": (
        LinearRegression(), {}
    ),
    "XGBoost": (
        XGBRegressor(),
        {
            'n_estimators': [10, 100]
        }
    )
}

model_score = []

# loop through each model

for name, (model, param) in models.items():
    # create a pipeline
    pipeline = GridSearchCV(model, param, cv=5)

    # fit the pipeline to the training data
    pipeline.fit(X_train, y_train)

    # make predictions on the test set
    y_pred = pipeline.predict(X_test)

    # evaluate the model
    mse = mean_squared_error(y_test, y_pred)
    model_score.append((name, mse))

short_models = sorted(model_score, key=lambda x: x[1], reverse=False)
for model in short_models:
    print(f"Mean Squared Error for {model[0]} is {model[1]:.2f}")

Mean Squared Error for XGBoost is 0.66
Mean Squared Error for Linear Regression is 0.69
Mean Squared Error for KNN is 0.69
Mean Squared Error for Decision Tree is 0.88
Mean Squared Error for Random Forest is 0.99
Mean Squared Error for SVM is 1.46


**Assignment:** Find the best model based on each metrics from above mentioned results? with Diamonds dataset

In [9]:
diamond_data = sns.load_dataset("diamonds")
diamond_data.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [10]:
diamond_data.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [11]:
diamond_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [12]:
# selecting the features and targets or labels from the dataset
# feature from the diamond dataset
X = diamond_data.drop("price", axis=1)

# target or labels from the diamond dataset
y = diamond_data['price']

# now, we encode all the categorical variables by using labelEncoder
for col in X.columns:
    le = LabelEncoder()
    if X[col].dtype == 'category':
        X[col] = le.fit_transform(X[col])        

In [13]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  int32  
 2   color    53940 non-null  int32  
 3   clarity  53940 non-null  int32  
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   x        53940 non-null  float64
 7   y        53940 non-null  float64
 8   z        53940 non-null  float64
dtypes: float64(6), int32(3)
memory usage: 3.1 MB


So, We know that there is no categorical or object column in our dataset. We will use all the Regression model easily . But, we need to select the best model for our dataset. For that, we will use the `GridSearchCV` function from the `sklearn library`. This function will help us to find the best model for

In [14]:
%%time
# spliting the data into training and test dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# defining all the models which we want to use
models = {

    'Linear Regression': (LinearRegression(),{}),
    'XGBoost': (XGBRegressor(),{}),

    'SVM' : (SVR(),
        {     
        "kernel":['linear', 'poly','rbf'],
        "C" : [0.1, 1, 10],
        }
    ),

    'KNN':(
        KNeighborsRegressor(),
        {
            'n_neighbors': [1, 5, 10],
        }
    ),
    'Random Forest': (
        RandomForestRegressor(),
        {
            'n_estimators': [10, 50],
        }
    ),
    'Decision Tree' : (
        DecisionTreeRegressor(),
        {
           'criterion': ['squared_error', 'friedman_mse'] 
        }
    ),

}
model_score = []
for name, (model, params) in models.items():
    pipeline = GridSearchCV(model, params, cv=3)
    pipeline.fit(X_train, y_train)

    # predict the model 
    y_pred = pipeline.predict(X_test)

    R2_score = r2_score(y_test, y_pred)
    model_score.append((name, R2_score))

    sort_models = sorted(model_score, key=lambda x: x[1], reverse=True)
    for model in sort_models:
        print(f"Model: {model[0]}, R2 Score: {model[1]:.2f}")    

Model: Linear Regression, R2 Score: 0.89
Model: XGBoost, R2 Score: 0.98
Model: Linear Regression, R2 Score: 0.89
Model: XGBoost, R2 Score: 0.98
Model: Linear Regression, R2 Score: 0.89
Model: SVM, R2 Score: 0.86
Model: XGBoost, R2 Score: 0.98
Model: KNN, R2 Score: 0.95
Model: Linear Regression, R2 Score: 0.89
Model: SVM, R2 Score: 0.86
Model: Random Forest, R2 Score: 0.98
Model: XGBoost, R2 Score: 0.98
Model: KNN, R2 Score: 0.95
Model: Linear Regression, R2 Score: 0.89
Model: SVM, R2 Score: 0.86
Model: Random Forest, R2 Score: 0.98
Model: XGBoost, R2 Score: 0.98
Model: Decision Tree, R2 Score: 0.97
Model: KNN, R2 Score: 0.95
Model: Linear Regression, R2 Score: 0.89
Model: SVM, R2 Score: 0.86
CPU times: total: 21min 19s
Wall time: 21min 24s
