# **Selecting the best model with best hyperparameters**

In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithims
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# import metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import Grid Search CV
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# import warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load dataset
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Task

In [5]:
# select features and variables
X = df.drop('tip', axis=1)
y = df['tip']

# label encode categorical variables
le = LabelEncoder()
X['sex'] = le.fit_transform(X['sex'])
X['smoker'] = le.fit_transform(X['smoker'])
X['day'] = le.fit_transform(X['day'])
X['time'] = le.fit_transform(X['time'])

In [8]:
%%time

# train test split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# create a dictionary of list of models to evaluate performance
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'SVR': SVR(),
    'KNN': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'XGBoost': XGBRegressor()
}

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

model_scores = []

for name, model in models.items():
    # fit each model from model on training data
    model.fit(X_train, y_train)

    # predict on test data
    y_pred = model.predict(X_test)

    # calculate evaluation metrics
    mse = mean_squared_error(y_test, y_pred)
    model_scores.append((name, mse))


# selecting the best model from all above models with evaluation metrics sorting
sorted_model = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_model:
    print(f'Mean Absolute Error for : ', f'{model[0]} is {model[1]: .2f}')



Mean Absolute Error for :  SVR is  0.54
Mean Absolute Error for :  Linear Regression is  0.69
Mean Absolute Error for :  XGBoost is  0.74
Mean Absolute Error for :  Gradient Boosting is  0.81
Mean Absolute Error for :  KNN is  0.84
Mean Absolute Error for :  Random Forest is  0.91
Mean Absolute Error for :  Decision Tree is  1.28
CPU times: user 805 ms, sys: 7.39 ms, total: 812 ms
Wall time: 866 ms


# **HyperParameter Tunning**

In [10]:
# create a dictionary of list of models to evaluate performance with hyperparameters
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [10, 50, 100]}),
    'Gradient Boosting': (GradientBoostingRegressor(), {'n_estimators': [10, 50,100]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
    'SVR': (SVR(), {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}),
    'XGBoost': (XGBRegressor(), {'n_estimators': [10, 50, 100]}),
}

# train and predict each model with evaluation metrics as well making as for loop to iterate over the models

for name, (model, param_grid) in models.items():

    # create a pipeline
    pipeline = GridSearchCV(model, param_grid, cv=5)

    # fit each model from model on training data
    pipeline.fit(X_train, y_train)

    # predict on test data
    y_pred = pipeline.predict(X_test)

    # print the following metrics
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2 Score: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

Linear Regression MSE:  0.694812968628771
Linear Regression R2 Score:  0.4441368826121932
Linear Regression MAE:  0.6703807496461157


Random Forest MSE:  0.9910478834693891
Random Forest R2 Score:  0.2071435179555292
Random Forest MAE:  0.7820591836734697


Gradient Boosting MSE:  0.8106801524004932
Gradient Boosting R2 Score:  0.35144101065487676
Gradient Boosting MAE:  0.7657809818712309


Decision Tree MSE:  0.8774153020453994
Decision Tree R2 Score:  0.2980516670532909
Decision Tree MAE:  0.718948162948163


SVR MSE:  0.6765002335154311
SVR R2 Score:  0.45878740654827843
SVR MAE:  0.6661912705361147


XGBoost MSE:  0.6624107100882575
XGBoost R2 Score:  0.4700592836840687
XGBoost MAE:  0.6549163442728472




In [15]:
# add preprocessor inside the pipeline
# make a pre processor
preprocessor = ColumnTransformer(
    transformers=[('numeric_scaling', StandardScaler(), ['total_bill', 'size'])], remainder='passthrough')

# create a dictionary of list of models to evaluate performance with hyperparameters
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Random Forest': (RandomForestRegressor(), {'n_estimators': [10, 50, 100]}),
    'Gradient Boosting': (GradientBoostingRegressor(), {'n_estimators': [10, 50,100]}),
    'Decision Tree': (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
    'SVR': (SVR(), {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}),
    'XGBoost': (XGBRegressor(), {'n_estimators': [10, 50, 100]}),
}

# train and predict each model with evaluation metrics as well making as for loop to iterate over the models

for name, (model, param_grid) in models.items():

    # create a pipeline with pre processor
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # make a grid search CV to tune the hyperparameter tunning
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)

    # fit each model from model on training data
    pipeline.fit(X_train, y_train)

    # predict on test data
    y_pred = pipeline.predict(X_test)

    # print the following metrics
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2 Score: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

Linear Regression MSE:  0.6948129686287685
Linear Regression R2 Score:  0.4441368826121952
Linear Regression MAE:  0.6703807496461148


Random Forest MSE:  0.9387738865306128
Random Forest R2 Score:  0.24896367418268472
Random Forest MAE:  0.771591836734694


Gradient Boosting MSE:  0.7967740097602491
Gradient Boosting R2 Score:  0.3625661797980215
Gradient Boosting MAE:  0.7234711607110227


Decision Tree MSE:  1.2047081632653063
Decision Tree R2 Score:  0.03621137570767141
Decision Tree MAE:  0.8157142857142858


SVR MSE:  0.6213485529050293
SVR R2 Score:  0.502909762487858
SVR MAE:  0.6122661870970085


XGBoost MSE:  0.7402754219220135
XGBoost R2 Score:  0.4077660862214003
XGBoost MAE:  0.6742331907700521




# **Classifiers**

In [18]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# do not show warnings
import warnings
warnings.filterwarnings('ignore')

# load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC()
}

# perform a K-cross validation and calculate the main accuracy
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, cv=kfold)
    accuracy = np.mean(scores)
    print('Classifier: ', name)
    print('Accuracy: ', accuracy)
    print('\n')

Classifier:  Logistic Regression
Accuracy:  0.9733333333333334


Classifier:  Decision Tree
Accuracy:  0.9600000000000002


Classifier:  Random Forest
Accuracy:  0.9600000000000002


Classifier:  KNN
Accuracy:  0.9733333333333334


Classifier:  SVM
Accuracy:  0.9666666666666668




---