## Select the best model with the best hyperparameters

In [22]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# train test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [23]:
# import the data: seaborn's 'tips' example dataset, returned as a pandas DataFrame
# NOTE(review): load_dataset presumably needs network access or a local cache on first use — confirm
df = sns.load_dataset('tips')

In [24]:
# preview the first rows to sanity-check that the data loaded as expected
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [25]:
# list the column names; 'tip' is used as the regression target below
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# Regression Task


In [26]:
# select the features (every column except the target) and the target variable
x = df.drop(columns='tip')
y = df['tip']

# Label-encode each categorical column with its OWN encoder. Refitting a single
# LabelEncoder per column keeps only the last column's classes_, so the integer
# codes of the other columns could no longer be mapped back to their labels.
# NOTE(review): integer codes impose an artificial order on nominal columns such
# as 'day', and scikit-learn documents LabelEncoder as a target (y) encoder.
# Consider OneHotEncoder/OrdinalEncoder for features; the encoding itself is
# left unchanged here so the model results stay comparable.
categorical_cols = ['sex', 'smoker', 'day', 'time']
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    x[col] = encoders[col].fit_transform(x[col])

# kept for backward compatibility: `le` was previously left fitted on 'time'
le = encoders['time']

# train test split: hold out 20% of the rows, with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [28]:
# Seed shared by every stochastic estimator so that Restart & Run All
# reproduces the same scores and ranking.
RANDOM_STATE = 42

# create a dictionary of candidate models (default hyperparameters, fixed seeds)
# NOTE(review): SVR and KNeighborsRegressor are distance/kernel based and the
# features are not scaled here — consider wrapping them in a Pipeline with a scaler.
models = {
    "LinearRegression": LinearRegression(),
    "SVR": SVR(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "GradientBoostingRegressor": GradientBoostingRegressor(random_state=RANDOM_STATE),
    "XGBRegressor": XGBRegressor(random_state=RANDOM_STATE),
    "RandomForestRegressor": RandomForestRegressor(random_state=RANDOM_STATE),
}

# train each model and evaluate it on the held-out test set
model_scores = []
for name, model in models.items():
    # fit on the training split, then predict the test split
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # one (name, mse, mae, r2) record per model
    model_scores.append((name, mse, mae, r2))


# select the best model from all of the above: rank by test MSE, lowest (best) first
sorted_models = sorted(model_scores, key=lambda score: score[1])
# NOTE: this loop rebinds `model` to a (name, mse, mae, r2) tuple
for model in sorted_models:
    print(f"Model: {model[0]} is {model[1]:.2f} mse, {model[2]:.2f} mae, {model[3]:.2f} r2")


Model: SVR is 0.54 mse, 0.57 mae, 0.57 r2
Model: LinearRegression is 0.69 mse, 0.67 mae, 0.44 r2
Model: XGBRegressor is 0.74 mse, 0.67 mae, 0.41 r2
Model: GradientBoostingRegressor is 0.82 mse, 0.73 mae, 0.35 r2
Model: KNeighborsRegressor is 0.84 mse, 0.73 mae, 0.33 r2
Model: RandomForestRegressor is 1.03 mse, 0.81 mae, 0.18 r2
Model: DecisionTreeRegressor is 1.17 mse, 0.84 mae, 0.06 r2
