In [23]:
# import librabries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import regression algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV
# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

In [24]:
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [25]:
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

# **Regression Task**

In [26]:
x=df.drop('tip',axis=1)
y=df['tip']
le=LabelEncoder()
x['sex']=le.fit_transform(x['sex'])
x['smoker']=le.fit_transform(x['smoker'])
x['day']=le.fit_transform(x['day'])
x['time']=le.fit_transform(x['time'])
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [27]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [None]:
%%time
models = {
    'Linear Regression': LinearRegression(),
    'Support Vector Regression': SVR(),
    'Decision Tree Regression': DecisionTreeRegressor(),
    'Random Forest Regression': RandomForestRegressor(),
    'K-Nearest Neighbors Regression': KNeighborsRegressor(),
    'Gradient Boosting Regression': GradientBoostingRegressor(),
    'XGBoost Regression': XGBRegressor()
}
model_scores = []
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    metric=mean_absolute_error(y_test,y_pred)
    model_scores.append((name, metric))

    # print the performing metrics
    # print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    # print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    # print(name, 'R2 Score: ', r2_score(y_test, y_pred))
    # print('\n')
sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}")

Mean Absolute error for Support Vector Regression is  0.57
Mean Absolute error for Linear Regression is  0.67
Mean Absolute error for XGBoost Regression is  0.67
Mean Absolute error for Gradient Boosting Regression is  0.73
Mean Absolute error for K-Nearest Neighbors Regression is  0.73
Mean Absolute error for Random Forest Regression is  0.76
Mean Absolute error for Decision Tree Regression is  0.90
CPU times: user 383 ms, sys: 3.71 ms, total: 387 ms
Wall time: 422 ms


In [None]:
%%time
# create a dictionaries of list of models to evaluate performance with hyperparameter
models = {
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),
          }
for name, (model, params) in models.items():
  pipeline = GridSearchCV(model, params, cv=5)
  pipeline.fit(x_train, y_train)
  y_pred = pipeline.predict(x_test)
  print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
  print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
  print(name, 'R2 Score: ', r2_score(y_test, y_pred))
  print('\n')

LinearRegression MSE:  0.694812968628771
LinearRegression MAE:  0.6703807496461157
LinearRegression R2 Score:  0.4441368826121932


SVR MSE:  1.460718141299992
SVR MAE:  0.8935334948775431
SVR R2 Score:  -0.1686013018011976


DecisionTreeRegressor MSE:  0.8774153020453993
DecisionTreeRegressor MAE:  0.7189481629481629
DecisionTreeRegressor R2 Score:  0.298051667053291


RandomForestRegressor MSE:  0.9572302885714302
RandomForestRegressor MAE:  0.7807387755102043
RandomForestRegressor R2 Score:  0.23419821406984564


KNeighborsRegressor MSE:  0.6640950568462677
KNeighborsRegressor MAE:  0.6203721488595437
KNeighborsRegressor R2 Score:  0.4687117753876745


GradientBoostingRegressor MSE:  0.8106801524004932
GradientBoostingRegressor MAE:  0.7657809818712309
GradientBoostingRegressor R2 Score:  0.35144101065487676


XGBRegressor MSE:  0.6624107100882575
XGBRegressor MAE:  0.6549163442728472
XGBRegressor R2 Score:  0.4700592836840687


CPU times: user 5.26 s, sys: 43.3 ms, total: 5.3 s
Wal