In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor

In [2]:
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error as mse

In [3]:
# Load the diabetes data as a (features, target) pair. These two globals are
# read directly by regression_model and grid_search further down.
X,y = load_diabetes(return_X_y= True)

In [4]:
# One shared 5-fold splitter with a fixed seed: every cross-validation and grid
# search below passes this object as `cv`, so all models are scored with the
# same splitting configuration.
kfold = KFold(n_splits= 5, shuffle= True, random_state= 2)

In [5]:
def regression_model(model):
    """Return the mean cross-validated RMSE of ``model``.

    The estimator is scored with ``cross_val_score`` on the module-level
    ``X`` and ``y`` using the shared ``kfold`` splitter.

    Parameters
    ----------
    model : estimator
        Regressor to evaluate; it is handed to ``cross_val_score`` as-is.

    Returns
    -------
    float
        Average over the folds of the per-fold root mean squared error.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        X,
        y,
        cv=kfold,
        scoring="neg_mean_squared_error",
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    rmse_per_fold = (-neg_mse_per_fold) ** 0.5
    return np.mean(rmse_per_fold)

In [6]:
regression_model(XGBRegressor(booster = "gblinear"))

55.51751412831203

In [7]:
regression_model(LinearRegression())

55.50936875436023

In [8]:
regression_model(Lasso())

62.64904114426351

In [9]:
regression_model(Ridge())

58.835292374356676

In [10]:
regression_model(XGBRegressor(booster = "gbtree"))

65.9125519300286

In [15]:
def grid_search(params, reg=None):
    """Run a cross-validated grid search and print the best result.

    The search is fitted on the module-level ``X`` and ``y`` with the shared
    ``kfold`` splitter and ``neg_mean_squared_error`` scoring.

    Parameters
    ----------
    params : dict
        Parameter grid passed straight to ``GridSearchCV``.
    reg : estimator, optional
        Estimator to tune. When omitted, a new
        ``XGBRegressor(booster="gblinear")`` is created for this call.

    Returns
    -------
    None
        The best parameters and the best score are printed, not returned.
        The printed "Best Score" is the square root of the negated best
        cross-validation score, i.e. an RMSE.
    """
    if reg is None:
        # Build the default estimator per call. A default written in the
        # signature is evaluated once, when the function is defined, and that
        # single instance would then be shared by every call.
        reg = XGBRegressor(booster="gblinear")

    grid_reg = GridSearchCV(reg, params, scoring="neg_mean_squared_error", cv=kfold)
    grid_reg.fit(X, y)

    best_params = grid_reg.best_params_
    print("Best Params : ", best_params)

    # best_score_ is a negated MSE under this scoring; convert it to RMSE.
    best_score = np.sqrt(-grid_reg.best_score_)
    print("Best Score : ", best_score)

In [16]:
grid_search(params= {"reg_alpha" : [0.001, 0.01, 0.1, 0.5, 1, 5]})

Best Params :  {'reg_alpha': 0.01}
Best Score :  55.49665686756466


In [17]:
grid_search(params= {"reg_lambda" : [0.001, 0.01, 0.1, 0.5, 1, 5]})

Best Params :  {'reg_lambda': 0.001}
Best Score :  56.171700694302324


In [19]:
grid_search(params= {'feature_selector':['shuffle']})

Best Params :  {'feature_selector': 'shuffle'}
Best Score :  55.5486899378168


In [20]:
grid_search(params= {'feature_selector' : ['random', 'greedy', 'thrifty'],
                    'updater' : ['coord_descent']})

Best Params :  {'feature_selector': 'thrifty', 'updater': 'coord_descent'}
Best Score :  55.488143951136536


In [21]:
grid_search(params= {'feature_selector' : ['random', 'greedy', 'thrifty'],
                    'updater' : ['coord_descent'],
                    'top_k' : [3, 5, 7, 9]})

Best Params :  {'feature_selector': 'thrifty', 'top_k': 3, 'updater': 'coord_descent'}
Best Score :  55.47871836076556


In [22]:
# Synthetic one-feature regression problem, built in a single step.
# NOTE: this rebinds the globals X and y that regression_model reads, so the
# evaluations that follow run on this data rather than the diabetes data.
np.random.seed(2)  # fixed seed so the random factors repeat from run to run

# Feature: the integers 1..99 as a (99, 1) column.
X = np.arange(1, 100).reshape(-1, 1)

# Target: each feature value scaled by its own factor drawn uniformly from
# [-0.2, 0.2). A single vectorised draw replaces the Python append loop; the
# draws are consumed in row order, so for the same seed the values match one
# scalar draw per element. The result already has shape (99, 1).
y = X * np.random.uniform(-0.2, 0.2, size=X.shape)

In [28]:
regression_model(XGBRegressor(booster = "gblinear", objective = "reg:squarederror"))

6.214946302686011

In [30]:
regression_model(XGBRegressor(booster = "gbtree", objective = "reg:squarederror"))

9.372359516507444

In [31]:
regression_model(LinearRegression())

6.214962315808842

In [45]:
# Restore the diabetes data: the synthetic experiment earlier in the notebook
# rebound the globals X and y, and regression_model / grid_search read those
# globals directly.
X,y = load_diabetes(return_X_y= True)

In [33]:
regression_model(XGBRegressor(booster = "dart", objective = "reg:squarederror"))

65.91255196051148

In [34]:
df_census = pd.read_csv("Data/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/Chapter08/census_cleaned.csv")

In [36]:
# Positional split of the census frame: every column except the last becomes
# the feature matrix, and the last column is used as the label.
X_census = df_census.iloc[:,:-1]
y_census = df_census.iloc[:,-1]

In [37]:
def classification_model(model):
    """Return the mean cross-validated accuracy of ``model``.

    The estimator is scored with ``cross_val_score`` on the module-level
    ``X_census`` and ``y_census`` using the shared ``kfold`` splitter.

    Parameters
    ----------
    model : estimator
        Classifier to evaluate; it is handed to ``cross_val_score`` as-is.

    Returns
    -------
    float
        Accuracy averaged over the folds.
    """
    return cross_val_score(
        model,
        X_census,
        y_census,
        cv=kfold,
        scoring="accuracy",
    ).mean()

In [38]:
classification_model(XGBClassifier(booster = "gbtree"))

0.8701208195968675

In [39]:
classification_model(XGBClassifier(booster = "dart"))

0.8701208195968675

In [40]:
classification_model(XGBClassifier(booster = "gblinear"))

0.850250415819278

In [41]:
classification_model(LogisticRegression(max_iter= 1000))

0.8008968643699182

In [42]:
classification_model(XGBClassifier(booster = "dart", one_drop = 1))

0.8719021746716356

In [46]:
regression_model(XGBRegressor(booster = "dart", objective = "reg:squarederror", sample_type = "weighted"))

65.91255196051148

In [47]:
regression_model(XGBRegressor(booster = "dart", objective = "reg:squarederror",normalize_type = "forest"))

65.91255196051148

In [48]:
regression_model(XGBRegressor(booster = "dart", objective = "reg:squarederror", one_drop = 1))

62.863317693761566

In [49]:
grid_search(params= {"rate_drop" : [0.01, 0.1, 0.2, 0.4]},
           reg= XGBRegressor(booster = "dart", objective = "reg:squarederror", one_drop = 1))

Best Params :  {'rate_drop': 0.01}
Best Score :  62.096735521735894


In [50]:
grid_search(params= {"skip_drop" : [0.01, 0.1, 0.2, 0.4]},
           reg= XGBRegressor(booster = "dart", objective = "reg:squarederror"))

Best Params :  {'skip_drop': 0.01}
Best Score :  65.97459497209029


In [51]:
regression_model(XGBRegressor(booster = "gbtree", objective = "reg:squarederror",
                             num_parallel_tree = 25))

65.95866495368594

In [52]:
regression_model(XGBRegressor(booster = "gbtree", objective = "reg:squarederror",
                             num_parallel_tree = 5))

65.95725434716181

In [53]:
regression_model(XGBRFRegressor(objective = "reg:squarederror"))

59.18988979010296

In [54]:
regression_model(RandomForestRegressor())

59.44526192030271

In [55]:
classification_model(XGBRFClassifier())

0.8554714187199218

In [56]:
classification_model(RandomForestClassifier())

0.8555328202034789