# Model 6: Boosting

## Import the libraries and cleaning the dataset

In [2]:
### Importing the libraries
import numpy as np
import pandas as pd
#from sklearnex import patch_sklearn

#patch_sklearn()

#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

#GradientBoostingClassifier and AdaboostClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier

#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn. preprocessing import StandardScaler


pd.set_option("display.max_columns",None)

In [3]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
#removing id for train_df
train_df.drop("id", axis = 1, inplace = True)

In [4]:
# Transforming object into categories 
# for train
for i in train_df.columns:
    if train_df[i].dtypes == "object":
        train_df[i] = train_df[i].astype("category")
# For test
for i in test_df.columns:
    if test_df[i].dtypes == "object":
        test_df[i] = test_df[i].astype("category")

In [5]:
X = train_df.drop("high_income", axis = 1)
y = train_df["high_income"]

In [6]:
# Grouping our features that are categories in one vector
# Same for numeric

categorical_features = [i for i in X.columns if X[i].dtype.name == "category"]
numerical_features = [i for i in X.columns if X[i].dtype.name != "category"]

## Boosting general comment and information

- Fitting small tree that are slowly learning to improve the f_hat in order to improve the errors that were made 
- 3 Tuning parameter : number of trees, Shrinkage parameter lambda (learning rate) and the max depth / splits of each tree d (1 or 2 works well)

# AdaBoost

## First try with AdaBoost 

In case of Adaptive Boosting or AdaBoost, it minimises the exponential loss function that can make the algorithm sensitive to the outliers. With Gradient Boosting, any differentiable loss function can be utilised. Gradient Boosting algorithm is more robust to outliers than AdaBoost. 

https://analyticsindiamag.com/adaboost-vs-gradient-boosting-a-comparison-of-leading-boosting-algorithms/#:~:text=outliers%20than%20AdaBoost.-,Flexibility,Boosting%20more%20flexible%20than%20AdaBoost

- The base estimator will be a classification tree with depth. = 1

In [7]:

numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", AdaBoostClassifier(random_state = 10))]
)


set_config(display="diagram")
clf

In [14]:
#Learning rate : Weight applied to each classifier at each boosting iteration. 
#A higher learning rate increases the contribution of each classifier. 
#There is a trade-off between the learning_rate and n_estimators parameters. 
#Values must be in the range (0.0, inf).

hyper_param = {"classifier__n_estimators":np.arange(50,250,50),
              "classifier__learning_rate":np.linspace(10**-4, 1,10)}
adaboost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [43]:
np.linspace(10**-4, 1,10)

array([1.000e-04, 1.112e-01, 2.223e-01, 3.334e-01, 4.445e-01, 5.556e-01,
       6.667e-01, 7.778e-01, 8.889e-01, 1.000e+00])

In [15]:
adaboost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


In [18]:
cv_res = pd.DataFrame(adaboost_pipe_cv.cv_results_)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,57.190209,2.849433,1.773595,0.122916,0.0001,50,"{'classifier__learning_rate': 0.0001, 'classif...",0.689251,0.689526,0.696548,0.690871,0.696456,0.69253,0.003289,37
1,133.30829,6.859897,3.102178,0.524109,0.0001,100,"{'classifier__learning_rate': 0.0001, 'classif...",0.689251,0.689526,0.696548,0.690871,0.696456,0.69253,0.003289,37
2,159.021076,21.719599,4.146591,0.453067,0.0001,150,"{'classifier__learning_rate': 0.0001, 'classif...",0.689251,0.689526,0.696548,0.690871,0.696456,0.69253,0.003289,37
3,229.994725,12.715532,6.786923,0.318177,0.0001,200,"{'classifier__learning_rate': 0.0001, 'classif...",0.689251,0.689526,0.696548,0.690871,0.696456,0.69253,0.003289,37
4,59.674941,1.875025,1.741645,0.120127,0.1112,50,{'classifier__learning_rate': 0.11120000000000...,0.827046,0.835012,0.830418,0.828496,0.832067,0.830608,0.002782,36
5,110.675882,6.475088,3.007465,0.409758,0.1112,100,{'classifier__learning_rate': 0.11120000000000...,0.84252,0.845358,0.838934,0.840491,0.84397,0.842255,0.002314,35
6,149.527171,12.175515,4.400306,0.592401,0.1112,150,{'classifier__learning_rate': 0.11120000000000...,0.849478,0.849936,0.845985,0.846534,0.849098,0.848206,0.001621,33
7,232.734885,6.335055,6.370712,0.222273,0.1112,200,{'classifier__learning_rate': 0.11120000000000...,0.853324,0.853232,0.848823,0.850746,0.851113,0.851448,0.001685,31
8,64.475891,4.794022,1.876331,0.161303,0.2223,50,"{'classifier__learning_rate': 0.2223, 'classif...",0.843161,0.846731,0.841681,0.842688,0.845527,0.843958,0.001875,34
9,120.730401,3.483841,4.358252,1.099456,0.2223,100,"{'classifier__learning_rate': 0.2223, 'classif...",0.853873,0.854697,0.849464,0.849739,0.851479,0.85185,0.002121,29


In [30]:
mean_scores = adaboost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = adaboost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[adaboost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]

#one_stand_error_data_frame = adaboost_pipe_cv.cv_results_[adaboost_pipe_cv.cv_results_["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,213.442339,10.960712,6.155756,0.755581,0.6667,200,{'classifier__learning_rate': 0.66670000000000...,0.864127,0.867973,0.858804,0.860635,0.859079,0.862124,0.003485,4


In [31]:
clf.set_params(classifier__learning_rate= 0.6667, classifier__n_estimators = 200)

In [32]:
clf.fit(X,y)

### Export the predictions to .csv

In [34]:
Adaboost_200_067 = test_df[["id"]]
y_test_pred = clf.predict(test_df.drop("id",axis = 1))
print(y_test_pred)
Adaboost_200_067["high_income"] = y_test_pred
Adaboost_200_067

[0 0 0 ... 0 0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Adaboost_200_067["high_income"] = y_test_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [35]:
Adaboost_200_067.to_csv("Predictions/Adaboost_200_067.csv",index = False, header=True)

## Second try with AdaBoost

In [39]:
hyper_param = {"classifier__n_estimators":np.arange(300,600,100),
              "classifier__learning_rate":[1]}
adaboost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [40]:
adaboost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [42]:
cv_res = pd.DataFrame(adaboost_pipe_cv.cv_results_)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,412.37832,25.07028,9.939529,0.223766,1,300,"{'classifier__learning_rate': 1, 'classifier__...",0.867973,0.869529,0.865214,0.863291,0.861643,0.86553,0.002905,3
1,470.399785,6.237278,12.762686,0.596042,1,400,"{'classifier__learning_rate': 1, 'classifier__...",0.869346,0.869896,0.865672,0.863016,0.863657,0.866317,0.002841,2
2,505.648733,71.390799,11.562076,3.19659,1,500,"{'classifier__learning_rate': 1, 'classifier__...",0.869987,0.870262,0.865214,0.863382,0.864756,0.86672,0.002845,1


In [45]:
#Since all of the models are getting better let's take the best one and not the one standard error 
clf.set_params(classifier__learning_rate= 1, classifier__n_estimators = 500)
clf.fit(X,y)

### Export the predictions to .csv

In [46]:
Adaboost_500_1 = test_df[["id"]]
y_test_pred = clf.predict(test_df.drop("id",axis = 1))
print(y_test_pred)
Adaboost_500_1["high_income"] = y_test_pred
Adaboost_500_1

[0 0 0 ... 0 0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Adaboost_500_1["high_income"] = y_test_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [47]:
Adaboost_500_1.to_csv("Predictions/Adaboost_500_1.csv",index = False, header=True)

## Third try with AdaBoost

In [8]:
hyper_param = {"classifier__n_estimators":np.arange(50,250,50),
              "classifier__learning_rate":[0.5]}
adaboost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [9]:
adaboost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [10]:
pd.DataFrame(adaboost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,79.411217,2.486568,2.40565,0.92177,0.5,50,"{'classifier__learning_rate': 0.5, 'classifier...",0.853415,0.854148,0.84919,0.851021,0.851479,0.85185,0.001768,4
1,144.888018,28.071966,3.324838,0.829401,0.5,100,"{'classifier__learning_rate': 0.5, 'classifier...",0.857993,0.860374,0.854684,0.858529,0.856057,0.857527,0.001978,3
2,162.587193,7.306207,4.739735,0.503395,0.5,150,"{'classifier__learning_rate': 0.5, 'classifier...",0.860282,0.863761,0.856973,0.860819,0.856881,0.859743,0.002587,2
3,173.102836,20.878265,4.032979,1.182651,0.5,200,"{'classifier__learning_rate': 0.5, 'classifier...",0.86193,0.865501,0.857248,0.861093,0.858255,0.860805,0.002918,1


**COMMENTS**

There is no improvement compared to the previous models, so we will not export any models.

# GradientBoostingClassifier

## First try with GradientBoostingClassifier

In [7]:
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier(max_depth = 2, n_estimators = 100, learning_rate = .1, random_state=1))]
)


set_config(display="diagram")
clf

In [9]:
hyper_param = {"classifier__n_estimators":[50,100,150],
              "classifier__learning_rate":[0.1, 0.5]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [10]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [11]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,105.724015,2.75115,0.348994,0.057167,0.1,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.843435,0.84664,0.843238,0.843329,0.848182,0.844965,0.002057,6
1,200.124028,8.346743,0.495751,0.069842,0.1,100,"{'classifier__learning_rate': 0.1, 'classifier...",0.853964,0.856345,0.85386,0.853585,0.857064,0.854964,0.001445,5
2,330.605717,3.401773,0.540317,0.043409,0.1,150,"{'classifier__learning_rate': 0.1, 'classifier...",0.859366,0.860099,0.858255,0.857797,0.85917,0.858937,0.000819,4
3,111.338329,1.624213,0.459856,0.037917,0.5,50,"{'classifier__learning_rate': 0.5, 'classifier...",0.864494,0.863578,0.861185,0.862101,0.858896,0.862051,0.00195,3
4,1416.778972,599.004222,0.474014,0.041373,0.5,100,"{'classifier__learning_rate': 0.5, 'classifier...",0.869804,0.868705,0.864848,0.863199,0.86384,0.866079,0.002668,2
5,274.81318,70.381091,0.426003,0.168658,0.5,150,"{'classifier__learning_rate': 0.5, 'classifier...",0.870811,0.869896,0.865672,0.866221,0.866862,0.867892,0.002065,1


In [13]:
#0.5 seems to be a good learning rate
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]

#one_stand_error_data_frame = adaboost_pipe_cv.cv_results_[adaboost_pipe_cv.cv_results_["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,274.81318,70.381091,0.426003,0.168658,0.5,150,"{'classifier__learning_rate': 0.5, 'classifier...",0.870811,0.869896,0.865672,0.866221,0.866862,0.867892,0.002065,1


In [14]:
hyper_param = {"classifier__n_estimators":[120,130,140],
              "classifier__learning_rate":[ 0.5, 0.6]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [15]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [16]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,245.181538,5.891427,0.468088,0.09577,0.5,120,"{'classifier__learning_rate': 0.5, 'classifier...",0.869621,0.869346,0.865397,0.865946,0.865489,0.86716,0.001908,6
1,274.644185,1.895358,0.657985,0.116856,0.5,130,"{'classifier__learning_rate': 0.5, 'classifier...",0.869621,0.869987,0.864664,0.866404,0.865397,0.867215,0.002188,5
2,302.477921,4.033478,0.678403,0.217983,0.5,140,"{'classifier__learning_rate': 0.5, 'classifier...",0.869346,0.869804,0.865672,0.866954,0.865763,0.867508,0.001754,4
3,235.549836,14.970818,0.421287,0.076563,0.6,120,"{'classifier__learning_rate': 0.6, 'classifier...",0.868797,0.871727,0.864939,0.865855,0.867045,0.867673,0.002403,2
4,263.265813,14.403825,0.560882,0.011308,0.6,130,"{'classifier__learning_rate': 0.6, 'classifier...",0.868064,0.872276,0.865214,0.865214,0.867137,0.867581,0.002595,3
5,260.758423,62.990416,0.416216,0.154734,0.6,140,"{'classifier__learning_rate': 0.6, 'classifier...",0.867973,0.872642,0.865122,0.866404,0.867228,0.867874,0.002565,1


In [18]:
clf.set_params(classifier__learning_rate = 0.5, classifier__n_estimators = 150)

In [19]:
clf.fit(X,y)

### Export the predictions to .csv

In [22]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [24]:
GB_tree_150_0_5_max_depth_2 = test_df[["id"]]
GB_tree_150_0_5_max_depth_2["high_income"] = y_pred
GB_tree_150_0_5_max_depth_2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_150_0_5_max_depth_2["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [26]:
GB_tree_150_0_5_max_depth_2.to_csv("Predictions/GB_tree_150_0_5_max_depth_2.csv",index = False, header=True)

## Second try with GradientBoostingClassifier

In [30]:
hyper_param = {"classifier__n_estimators":[600],
              "classifier__learning_rate":[0.05]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=10, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [31]:
boost_pipe_cv.fit(X,y)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [32]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,1354.983562,276.765642,0.464447,0.110556,0.05,600,"{'classifier__learning_rate': 0.05, 'classifie...",0.862663,0.865409,0.868888,0.863578,0.864127,0.860099,0.862846,0.863736,0.862271,0.863004,0.863662,0.002175,1


In [33]:
clf.set_params(classifier__learning_rate = 0.05, classifier__n_estimators = 600)

In [34]:
clf.fit(X,y)

### Export the predictions to .csv

In [35]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
y_pred

array([0, 0, 0, ..., 0, 0, 1])

In [37]:
GB_tree_600_0_05_max_depth_2 = test_df[["id"]]
GB_tree_600_0_05_max_depth_2["high_income"] = y_pred
GB_tree_600_0_05_max_depth_2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_600_0_05_max_depth_2["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [38]:
GB_tree_600_0_05_max_depth_2.to_csv("Predictions/GB_tree_600_0_05_max_depth_2.csv",index = False, header=True)
#0.861 % 

## Third try with GradientBoostingClassifier

In [None]:
# Could think of increasing the max depth of the tree, restricting the nb of features 21 for the RF
# generally low learning rate more tree but this is very expensive ! 0.1 is usually a good learning rate

In [None]:
# Trying smaller tree but with a higher depth --> Learning rate 0.1 and 30 to 50.

In [8]:
hyper_param = {"classifier__n_estimators":[30,40,50],
              "classifier__learning_rate":[0.1],
              "classifier__max_depth":[8]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [9]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [10]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,226.67957,7.807365,0.423266,0.078332,0.1,8,30,"{'classifier__learning_rate': 0.1, 'classifier...",0.862754,0.865226,0.859995,0.861734,0.863566,0.862655,0.001754,3
1,324.119853,1.077055,0.506118,0.056663,0.1,8,40,"{'classifier__learning_rate': 0.1, 'classifier...",0.864036,0.866416,0.863291,0.863016,0.86439,0.86423,0.0012,2
2,347.584139,47.128289,0.481702,0.117429,0.1,8,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.866325,0.867149,0.865122,0.863749,0.866862,0.865841,0.001255,1


In [11]:
#0.5 seems to be a good learning rate
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]

#one_stand_error_data_frame = adaboost_pipe_cv.cv_results_[adaboost_pipe_cv.cv_results_["mean_test_score"] >= (mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,347.584139,47.128289,0.481702,0.117429,0.1,8,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.866325,0.867149,0.865122,0.863749,0.866862,0.865841,0.001255,1


## Further trials for GradientBoostingClassifier

In [1]:
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", GradientBoostingClassifier(max_depth = 2, n_estimators = 100, learning_rate = .1, random_state=10, verbose=1))]
)


set_config(display="diagram")
clf

hyper_param = {"classifier__n_estimators":[50],
              "classifier__learning_rate":[0.1,0.09],
              "classifier__max_depth":[8,10,12]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

NameError: name 'Pipeline' is not defined

In [8]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
      Iter       Train Loss   Remaining Time 
         1           1.1388            3.10m
         2           1.0629            3.02m
         3           1.0012            3.28m
         4           0.9496            3.16m
         5           0.9062            3.08m
         6           0.8688            3.02m
         7           0.8364            2.93m
         8           0.8081            2.84m
         9           0.7833            2.76m
        10           0.7612            2.67m
        20           0.6281            1.89m
      Iter       Train Loss   Remaining Time 
         1           1.1378            6.44m
         2           1.0615            5.99m
         3           0.9993            5.94m
         4           0.9476            5.84m
         5           0.9037            5.82m
         6           0.8665            5.57m
         7           0.8341            5.35m
         8           0.8052           

        40           0.5314           38.81s
        50           0.5086            0.00s


      Iter       Train Loss   Remaining Time 
         1           1.1369            6.29m
         2           1.0612            5.74m
         3           0.9988            5.87m
         4           0.9466            5.84m
         5           0.9028            5.82m
         6           0.8647            5.50m
         7           0.8322            5.28m
         8           0.8039            5.10m
         9           0.7787            4.95m
        10           0.7563            4.81m
        20           0.6206            3.52m
        30           0.5583            2.35m
        40           0.5204            1.18m
        50           0.4957            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.1395            4.14m
         2           1.0637            4.45m
         3           1.0011            4.70m
         4           0.9492            4.74m
         5           0.9047            4.77m
         6           0.8667            4.76m
        

In [9]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,365.263924,1.796232,0.480564,0.110468,0.1,8,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.864219,0.867424,0.863566,0.866404,0.865946,0.865512,0.001421,1
1,431.070508,7.212868,0.553187,0.069492,0.1,10,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.864219,0.86724,0.864664,0.864207,0.865763,0.865219,0.001159,4
2,484.622157,3.54658,0.75884,0.196375,0.1,12,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.863303,0.867424,0.864115,0.865214,0.865672,0.865145,0.001409,5
3,357.204815,3.114612,0.636524,0.087125,0.09,8,50,"{'classifier__learning_rate': 0.09, 'classifie...",0.865318,0.868797,0.862467,0.865305,0.864939,0.865365,0.002018,3
4,446.592846,16.319818,0.717179,0.125965,0.09,10,50,"{'classifier__learning_rate': 0.09, 'classifie...",0.864585,0.867424,0.865672,0.864207,0.865031,0.865384,0.001131,2
5,384.663077,88.757744,0.431733,0.146597,0.09,12,50,"{'classifier__learning_rate': 0.09, 'classifie...",0.863395,0.8666,0.861093,0.862925,0.865214,0.863845,0.001902,6


In [None]:
# It seems that a max depth of 8 perform better. and with a learning rate of 0.1 on a model with only 50 estimators 

In [11]:
hyper_param = {"classifier__n_estimators":[50],
              "classifier__learning_rate":[0.1],
              "classifier__max_depth":[1,4,6]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [12]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
      Iter       Train Loss   Remaining Time 
         1           1.1461            2.49m
         2           1.0759            2.52m
         3           1.0185            2.50m
         4           0.9702            2.42m
         5           0.9299            2.35m
         6           0.8950            2.32m
         7           0.8653            2.28m
         8           0.8392            2.23m
         9           0.8168            2.18m
        10           0.7960            2.13m
        20           0.6764            1.60m
        30           0.6239            1.07m
        40           0.5931           31.49s
      Iter       Train Loss   Remaining Time 
         1           1.1987           52.48s
         2           1.1680           53.33s
         3           1.1424           52.65s
         4           1.1176           51.71s
         5           1.0970           50.38s
         6           1.0786           

      Iter       Train Loss   Remaining Time 
         1           1.1958           52.02s
         2           1.1650           53.31s
         3           1.1397           52.52s
         4           1.1152           51.40s
         5           1.0942           49.89s
         6           1.0760           48.68s
         7           1.0614           47.54s
         8           1.0453           46.33s
         9           1.0316           45.02s
        10           1.0198           43.92s
        20           0.9318           32.61s
        30           0.8763           21.94s
        40           0.8358           10.95s
        50           0.8051            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.1984           47.19s
         2           1.1675           52.47s
         3           1.1421           53.11s
         4           1.1175           53.62s
         5           1.0964           51.81s
         6           1.0782           50.94s
        

In [13]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,56.718018,1.403295,0.343789,0.062519,0.1,1,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.811939,0.816334,0.816409,0.817233,0.817782,0.815939,0.002071,3
1,223.950851,0.6012,0.676708,0.172143,0.1,4,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.858726,0.860557,0.859811,0.857888,0.857888,0.858974,0.001061,2
2,264.835851,71.35042,0.438767,0.189936,0.1,6,50,"{'classifier__learning_rate': 0.1, 'classifier...",0.864311,0.865684,0.863108,0.86146,0.864939,0.8639,0.001484,1


In [None]:
# It seems that a depth of 8 it the best for boosting --> now we need to find the best nb of tree.

In [14]:
hyper_param = {"classifier__n_estimators":[60,70,80,90,100],
              "classifier__learning_rate":[0.1],
              "classifier__max_depth":[8]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [15]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
      Iter       Train Loss   Remaining Time 
         1           1.1369            7.84m
         2           1.0612            7.54m
         3           0.9988            7.44m
         4           0.9466            7.28m
         5           0.9028            7.14m
         6           0.8647            6.96m
         7           0.8322            6.72m
         8           0.8039            6.65m
         9           0.7787            6.54m
        10           0.7563            6.43m
        20           0.6206            4.97m
        30           0.5583            3.74m
        40           0.5204            2.50m
        50           0.4957            1.24m
        60           0.4777            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.1369            5.87m
         2           1.0612            6.66m
         3           0.9988            7.18m
         4           0.9466           

         3           1.0012            6.45m
         4           0.9496            6.52m
         5           0.9062            6.40m
         6           0.8688            6.36m
         7           0.8364            6.29m
         8           0.8081            6.25m
         9           0.7833            6.21m
        10           0.7612            6.17m
      Iter       Train Loss   Remaining Time 
         1           1.1371            7.86m
         2           1.0610            7.70m
         3           0.9991            7.46m
         4           0.9474            7.32m
         5           0.9039            7.18m
         6           0.8664            7.03m
         7           0.8340            6.87m
         8           0.8059            6.79m
         9           0.7808            6.63m
        10           0.7589            6.50m
        20           0.6216            5.02m
        30           0.5599            3.77m
        40           0.5240            2.53m
        5

        70           0.4794            2.36m
        80           0.4680            1.57m
        90           0.4590           47.88s
       100           0.4517            0.00s


In [16]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,455.206674,1.011995,0.557819,0.111942,0.1,8,60,"{'classifier__learning_rate': 0.1, 'classifier...",0.865501,0.868339,0.865763,0.866313,0.86732,0.866647,0.001051,5
1,534.954638,4.306399,0.694264,0.055594,0.1,8,70,"{'classifier__learning_rate': 0.1, 'classifier...",0.865867,0.870445,0.866221,0.866496,0.866496,0.867105,0.001686,4
2,621.534629,15.885423,0.829861,0.130433,0.1,8,80,"{'classifier__learning_rate': 0.1, 'classifier...",0.866233,0.871452,0.867595,0.867228,0.866038,0.867709,0.001961,3
3,768.256876,18.679574,0.759287,0.090406,0.1,8,90,"{'classifier__learning_rate': 0.1, 'classifier...",0.866508,0.871269,0.86851,0.867869,0.866313,0.868094,0.001788,2
4,748.312546,162.911223,0.582443,0.177996,0.1,8,100,"{'classifier__learning_rate': 0.1, 'classifier...",0.866691,0.87191,0.86851,0.868327,0.866954,0.868478,0.001861,1


In [17]:
#0.1 LR
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,621.534629,15.885423,0.829861,0.130433,0.1,8,80,"{'classifier__learning_rate': 0.1, 'classifier...",0.866233,0.871452,0.867595,0.867228,0.866038,0.867709,0.001961,3


In [18]:
clf.set_params(classifier__learning_rate = 0.1, classifier__n_estimators = 80, classifier__max_depth = 8)

In [19]:
clf.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1           1.1388            6.05m
         2           1.0629            5.83m
         3           1.0012            5.74m
         4           0.9496            5.67m
         5           0.9062            5.55m
         6           0.8688            5.36m
         7           0.8364            5.07m
         8           0.8081            5.05m
         9           0.7833            5.04m
        10           0.7612            4.96m
        20           0.6281            4.34m
        30           0.5677            3.68m
        40           0.5314            2.91m
        50           0.5086            2.20m
        60           0.4929            1.47m
        70           0.4794           43.96s
        80           0.4680            0.00s


### Export the predictions to .csv

In [20]:
#one standard error rule 
y_pred = clf.predict(test_df.drop("id",axis = 1))
GB_tree_80_LR01_max_depth_8 = test_df[["id"]]
GB_tree_80_LR01_max_depth_8["high_income"] = y_pred
GB_tree_80_LR01_max_depth_8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_80_LR01_max_depth_8["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,1
6066,6067,0


In [21]:
GB_tree_80_LR01_max_depth_8.to_csv("Predictions/GB_tree_80_LR01_max_depth_8.csv",index = False, header=True)


In [7]:
clf.set_params(classifier__learning_rate = 0.1, classifier__n_estimators = 100, classifier__max_depth = 8)

In [8]:
clf.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1           1.1388            7.23m
         2           1.0629            7.04m
         3           1.0012            6.75m
         4           0.9496            6.64m
         5           0.9062            6.62m
         6           0.8688            6.57m
         7           0.8364            6.57m
         8           0.8081            6.48m
         9           0.7833            6.34m
        10           0.7612            6.09m
        20           0.6281            5.77m
        30           0.5677            5.18m
        40           0.5314            4.47m
        50           0.5086            3.82m
        60           0.4929            3.09m
        70           0.4794            2.31m
        80           0.4680            1.54m
        90           0.4590            1.23m
       100           0.4517            0.00s


In [9]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
GB_tree_100_LR01_max_depth_8 = test_df[["id"]]
GB_tree_100_LR01_max_depth_8["high_income"] = y_pred
GB_tree_100_LR01_max_depth_8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_100_LR01_max_depth_8["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,1
6066,6067,0


In [10]:
GB_tree_100_LR01_max_depth_8.to_csv("Predictions/GB_tree_100_LR01_max_depth_8.csv",index = False, header=True)


# Deeper trials

## First try with M1 Pro

In [10]:
hyper_param = {"classifier__n_estimators":[100, 150],
              "classifier__learning_rate":[0.15, 0.2],
              "classifier__max_depth":[2, 3]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [11]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [12]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,39.419275,0.133196,0.133319,0.012459,0.15,2,100,"{'classifier__learning_rate': 0.15, 'classifie...",0.860465,0.860374,0.857522,0.858438,0.858896,0.859139,0.001136,8
1,58.158957,0.325037,0.152689,0.012639,0.15,2,150,"{'classifier__learning_rate': 0.15, 'classifie...",0.863395,0.863853,0.861826,0.860452,0.862284,0.862362,0.001202,6
2,56.290325,0.099685,0.162464,0.035377,0.15,3,100,"{'classifier__learning_rate': 0.15, 'classifie...",0.864951,0.866691,0.864481,0.865122,0.862558,0.864761,0.001328,5
3,84.878327,0.340935,0.166893,0.030195,0.15,3,150,"{'classifier__learning_rate': 0.15, 'classifie...",0.868614,0.869713,0.864664,0.864848,0.864939,0.866555,0.002159,2
4,38.712002,0.284023,0.134711,0.010743,0.2,2,100,"{'classifier__learning_rate': 0.2, 'classifier...",0.86193,0.864768,0.860544,0.860269,0.861368,0.861776,0.001608,7
5,60.566726,0.349964,0.165536,0.034692,0.2,2,150,"{'classifier__learning_rate': 0.2, 'classifier...",0.865867,0.868248,0.863657,0.863199,0.863382,0.864871,0.001943,4
6,58.006089,2.223285,0.12977,0.010928,0.2,3,100,"{'classifier__learning_rate': 0.2, 'classifier...",0.867149,0.866783,0.865397,0.864115,0.863566,0.865402,0.001413,3
7,67.881209,2.609037,0.099727,0.01233,0.2,3,150,"{'classifier__learning_rate': 0.2, 'classifier...",0.868797,0.869896,0.866587,0.865397,0.865672,0.86727,0.001775,1


## Second try with M1 Pro

In [13]:
hyper_param = {"classifier__n_estimators":[100, 150, 200],
              "classifier__learning_rate":[0.3, 0.4, 0.5],
              "classifier__max_depth":[2, 3]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [14]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [15]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,39.077009,0.066949,0.120705,0.005661,0.3,2,100,"{'classifier__learning_rate': 0.3, 'classifier...",0.866416,0.866966,0.862192,0.862925,0.863291,0.864358,0.001945,18
1,58.09123,0.177817,0.136859,0.010335,0.3,2,150,"{'classifier__learning_rate': 0.3, 'classifier...",0.869438,0.867698,0.865031,0.864481,0.865031,0.866336,0.001914,15
2,76.772948,0.324281,0.147263,0.00821,0.3,2,200,"{'classifier__learning_rate': 0.3, 'classifier...",0.87017,0.867973,0.86439,0.867503,0.865031,0.867013,0.002095,13
3,56.501519,0.152837,0.139063,0.01124,0.3,3,100,"{'classifier__learning_rate': 0.3, 'classifier...",0.868064,0.870445,0.864573,0.865855,0.865397,0.866867,0.00213,14
4,85.919472,0.824001,0.158863,0.015478,0.3,3,150,"{'classifier__learning_rate': 0.3, 'classifier...",0.869987,0.872642,0.866221,0.86732,0.86677,0.868588,0.002405,5
5,118.381177,0.809498,0.187784,0.015779,0.3,3,200,"{'classifier__learning_rate': 0.3, 'classifier...",0.871177,0.872642,0.865763,0.866587,0.869151,0.869064,0.002619,3
6,41.305494,0.488099,0.135536,0.013497,0.4,2,100,"{'classifier__learning_rate': 0.4, 'classifier...",0.867515,0.868888,0.865672,0.863749,0.862284,0.865622,0.002405,17
7,59.9086,0.226885,0.15497,0.032697,0.4,2,150,"{'classifier__learning_rate': 0.4, 'classifier...",0.868797,0.870811,0.86732,0.865946,0.866129,0.867801,0.001818,8
8,79.209472,0.201033,0.146366,0.013671,0.4,2,200,"{'classifier__learning_rate': 0.4, 'classifier...",0.870628,0.872368,0.867137,0.866679,0.867137,0.86879,0.002285,4
9,57.690893,0.468243,0.139383,0.003349,0.4,3,100,"{'classifier__learning_rate': 0.4, 'classifier...",0.869163,0.872276,0.864481,0.867411,0.865031,0.867673,0.002851,10


## Third try with M1 Pro

In [16]:
hyper_param = {"classifier__n_estimators":[150, 200, 250, 300],
              "classifier__learning_rate":[0.4, 0.5],
              "classifier__max_depth":[2, 3]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [17]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [20]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,60.362338,0.20958,0.137881,0.015437,0.4,2,150,"{'classifier__learning_rate': 0.4, 'classifier...",0.868797,0.870811,0.86732,0.865946,0.866129,0.867801,0.001818,14
1,80.182985,0.416017,0.15131,0.011869,0.4,2,200,"{'classifier__learning_rate': 0.4, 'classifier...",0.870628,0.872368,0.867137,0.866679,0.867137,0.86879,0.002285,9
2,101.049779,0.420139,0.179223,0.011707,0.4,2,250,"{'classifier__learning_rate': 0.4, 'classifier...",0.870811,0.873558,0.867869,0.868327,0.868419,0.869797,0.002142,3
3,121.588579,0.489774,0.188961,0.020416,0.4,2,300,"{'classifier__learning_rate': 0.4, 'classifier...",0.871452,0.873009,0.868327,0.868052,0.870342,0.870236,0.001876,1
4,90.283707,0.449566,0.162812,0.00648,0.4,3,150,"{'classifier__learning_rate': 0.4, 'classifier...",0.869987,0.873741,0.864939,0.866862,0.867228,0.868552,0.003054,10
5,121.664599,0.557375,0.190112,0.014766,0.4,3,200,"{'classifier__learning_rate': 0.4, 'classifier...",0.871177,0.875023,0.866129,0.867411,0.867686,0.869485,0.003235,8
6,151.87741,0.514,0.208406,0.020232,0.4,3,250,"{'classifier__learning_rate': 0.4, 'classifier...",0.870628,0.874931,0.866404,0.868785,0.867411,0.869632,0.003005,5
7,179.903068,0.224888,0.225624,0.022092,0.4,3,300,"{'classifier__learning_rate': 0.4, 'classifier...",0.871818,0.874657,0.867228,0.867961,0.866404,0.869614,0.003131,6
8,60.140477,0.670912,0.146992,0.021426,0.5,2,150,"{'classifier__learning_rate': 0.5, 'classifier...",0.870811,0.869896,0.865672,0.866221,0.866862,0.867892,0.002065,13
9,80.790621,0.329842,0.163348,0.009247,0.5,2,200,"{'classifier__learning_rate': 0.5, 'classifier...",0.872001,0.871361,0.86732,0.868327,0.86851,0.869504,0.001835,7


In [21]:
# One se-rule
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,121.664599,0.557375,0.190112,0.014766,0.4,3,200,"{'classifier__learning_rate': 0.4, 'classifier...",0.871177,0.875023,0.866129,0.867411,0.867686,0.869485,0.003235,8


### Exporting the predictions to .csv

In [22]:
clf.set_params(classifier__learning_rate = 0.4, classifier__n_estimators = 200, classifier__max_depth = 3)

In [23]:
clf.fit(X,y)

In [24]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
GB_tree_200_LR04_max_depth_3 = test_df[["id"]]
GB_tree_200_LR04_max_depth_3["high_income"] = y_pred
GB_tree_200_LR04_max_depth_3

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_200_LR04_max_depth_3["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [25]:
GB_tree_200_LR04_max_depth_3.to_csv("Predictions/GB_tree_200_LR04_max_depth_3.csv",index = False, header=True)

# Fourth Try

In [7]:
hyper_param = {"classifier__n_estimators":[100],
              "classifier__learning_rate":[0.4],
              "classifier__max_depth":[2],
              "classifier__max_features":[21]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [10]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
      Iter       Train Loss   Remaining Time 
         1           1.1837            6.21s
         2           1.1022            5.88s
         3           1.0721            5.82s
         4           1.0114            5.75s
         5           0.9575            5.78s
         6           0.9060            5.78s
         7           0.8815            5.70s
         8           0.8516            5.69s
         9           0.8407            5.63s
        10           0.8273            5.58s
        20           0.7164            5.10s
        30           0.6775            4.49s
        40           0.6571            3.83s
        50           0.6443            3.17s
        60           0.6319            2.52s
        70           0.6230            1.89s
        80           0.6179            1.25s
        90           0.6112            0.62s
       100           0.6065            0.00s


In [12]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__max_features,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,11.597692,2.185763,0.423105,0.085905,0.4,2,21,100,"{'classifier__learning_rate': 0.4, 'classifier...",0.861198,0.864494,0.854684,0.858346,0.859262,0.859597,0.003236,1


In [14]:
clf.set_params(classifier__learning_rate = 0.4, classifier__n_estimators = 100, classifier__max_depth = 2, classifier__max_features = 21)

In [15]:
clf.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1           1.1837            6.56s
         2           1.1022            6.54s
         3           1.0721            6.33s
         4           1.0114            6.69s
         5           0.9575            6.73s
         6           0.9060            6.60s
         7           0.8815            6.47s
         8           0.8516            6.58s
         9           0.8407            6.45s
        10           0.8273            6.44s
        20           0.7164            5.42s
        30           0.6775            4.73s
        40           0.6571            4.02s
        50           0.6443            3.33s
        60           0.6319            2.65s
        70           0.6230            1.99s
        80           0.6179            1.33s
        90           0.6112            0.66s
       100           0.6065            0.00s


In [18]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
y_pred[:10]

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1])

      Iter       Train Loss   Remaining Time 
         1           1.1461           10.65s
         2           1.1111            9.87s
         3           1.0266            9.73s
         4           0.9913            9.65s
         5           0.9442            9.73s
         6           0.9117            9.81s
         7           0.8949            9.53s
         8           0.8823            9.32s
         9           0.8702            9.14s
        10           0.8338            9.05s
        20           0.7254            8.10s
        30           0.6822            7.02s
        40           0.6560            6.03s
        50           0.6380            5.03s
        60           0.6284            3.98s
        70           0.6214            2.96s
        80           0.6144            1.96s
        90           0.6082            0.99s
       100           0.6031            0.00s
      Iter       Train Loss   Remaining Time 
         1           1.1468           12.02s
        

In [20]:
GB_tree_100_LR04_max_depth_2_max_feat_21 = test_df[["id"]]
GB_tree_100_LR04_max_depth_2_max_feat_21["high_income"] = y_pred
GB_tree_100_LR04_max_depth_2_max_feat_21

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_100_LR04_max_depth_2_max_feat_21["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [21]:
GB_tree_100_LR04_max_depth_2_max_feat_21.to_csv("Predictions/GB_tree_100_LR04_max_depth_2_max_feat_21.csv",index = False, header=True)

In [30]:
hyper_param = {"classifier__n_estimators":[300],
              "classifier__learning_rate":[0.1],
              "classifier__max_depth":[2],
              "classifier__max_features":[21]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [31]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
      Iter       Train Loss   Remaining Time 
         1           1.2193           18.08s
         2           1.1924           17.46s
         3           1.1806           17.26s
         4           1.1477           17.31s
         5           1.1179           17.89s
         6           1.0943           17.89s
         7           1.0778           17.83s
         8           1.0592           18.18s
         9           1.0438           18.10s
        10           1.0289           17.94s
        20           0.9075           17.41s
        30           0.8340           16.87s
        40           0.7875           16.38s
        50           0.7571           16.45s
        60           0.7372           15.78s
        70           0.7187           15.13s
        80           0.7042           14.52s
        90           0.6919           13.88s
       100           0.6816           13.07s
       200           0.6339            6

In [32]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__max_features,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,25.216032,5.451491,0.450497,0.105791,0.1,2,21,300,"{'classifier__learning_rate': 0.1, 'classifier...",0.862022,0.863761,0.857431,0.860452,0.857248,0.860183,0.002548,1


## It seems that for a tree to be more robust it needs a lower learning rate, this is what we will try ! 

In [8]:
hyper_param = {"classifier__n_estimators":[100, 150],
              "classifier__learning_rate":[0.05, 0.01],
              "classifier__max_depth":[2]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [9]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [10]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,40.528996,0.124869,0.162459,0.007154,0.05,2,100,"{'classifier__learning_rate': 0.05, 'classifie...",0.844351,0.846548,0.844703,0.843879,0.847541,0.845404,0.0014,2
1,58.607755,1.215186,0.147949,0.018515,0.05,2,150,"{'classifier__learning_rate': 0.05, 'classifie...",0.850485,0.855063,0.850105,0.850105,0.853768,0.851905,0.002095,1
2,38.036877,0.157853,0.12241,0.002203,0.01,2,100,"{'classifier__learning_rate': 0.01, 'classifie...",0.807361,0.813587,0.812105,0.808442,0.809999,0.810299,0.002291,4
3,45.829656,4.600819,0.101942,0.016656,0.01,2,150,"{'classifier__learning_rate': 0.01, 'classifie...",0.814045,0.817982,0.816775,0.808534,0.815676,0.814602,0.003299,3


In [13]:
hyper_param = {"classifier__n_estimators":[200, 250, 300,350,400,450,500],
              "classifier__learning_rate":[0.05],
              "classifier__max_depth":[2]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [14]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 7 candidates, totalling 35 fits
      Iter       Train Loss   Remaining Time 
         1           1.2019            7.19m
         2           1.1742            7.14m
         3           1.1462            7.00m
         4           1.1211            6.93m
         5           1.1004            6.67m
         6           1.0795            6.44m
         7           1.0616            6.50m
         8           1.0453            6.49m
         9           1.0283            6.48m
        10           1.0147            6.46m
        20           0.9093            6.02m
        30           0.8481            5.59m
        40           0.8076            5.21m
        50           0.7781            4.99m
        60           0.7562            4.68m
        70           0.7388            4.46m
        80           0.7245            4.22m
        90           0.7136            3.91m
       100           0.7033            3.63m
       200           0.6524            

         3           1.1469           10.59m
         4           1.1216           10.86m
         5           1.1007           10.66m
         6           1.0796           10.36m
         7           1.0619           10.16m
         8           1.0458            9.98m
         9           1.0286            9.84m
        10           1.0151            9.73m
        20           0.9097            9.55m
        30           0.8464            9.40m
        40           0.8071            9.05m
        50           0.7775            8.90m
        60           0.7564            8.88m
        70           0.7382            8.88m
        80           0.7245            8.81m
      Iter       Train Loss   Remaining Time 
         1           1.2024            7.18m
         2           1.1745            7.10m
         3           1.1465            6.99m
         4           1.1212            6.94m
         5           1.1004            6.71m
         6           1.0815            6.43m
         

        90           0.7132            8.62m
       100           0.7025            8.36m
      Iter       Train Loss   Remaining Time 
         1           1.2015            7.12m
         2           1.1735            6.99m
         3           1.1453            6.89m
         4           1.1199            6.83m
         5           1.0989            6.62m
         6           1.0778            6.40m
         7           1.0601            6.43m
         8           1.0441            6.43m
         9           1.0269            6.42m
        10           1.0135            6.40m
        20           0.9079            6.00m
        30           0.8447            5.58m
        40           0.8042            5.20m
        50           0.7757            4.98m
        60           0.7539            4.68m
        70           0.7372            4.45m
        80           0.7229            4.20m
        90           0.7118            3.89m
       100           0.7016            3.61m
       20

       200           0.6512            6.32m
      Iter       Train Loss   Remaining Time 
         1           1.2046            7.31m
         2           1.1767            7.18m
         3           1.1484            7.00m
         4           1.1230            6.93m
         5           1.1000            6.67m
         6           1.0809            6.44m
         7           1.0632            6.49m
         8           1.0471            6.49m
         9           1.0298            6.45m
        10           1.0162            6.42m
        20           0.9106            6.00m
        30           0.8471            5.56m
        40           0.8074            5.19m
        50           0.7786            4.96m
        60           0.7563            4.66m
        70           0.7387            4.44m
        80           0.7246            4.21m
        90           0.7126            3.90m
       100           0.7026            3.61m
       200           0.6506            0.00s
      Ite

       300           0.6292            4.29m
       400           0.6151            2.12m
       500           0.6058            0.00s


In [15]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,439.011922,5.684243,0.544244,0.087496,0.05,2,200,"{'classifier__learning_rate': 0.05, 'classifie...",0.855063,0.857627,0.854775,0.853585,0.856149,0.85544,0.001365,7
1,515.2141,8.080203,0.656658,0.045148,0.05,2,250,"{'classifier__learning_rate': 0.05, 'classifie...",0.856803,0.859458,0.85679,0.856698,0.856515,0.857253,0.001107,6
2,605.466142,6.854841,0.722937,0.037536,0.05,2,300,"{'classifier__learning_rate': 0.05, 'classifie...",0.859458,0.860099,0.858072,0.857797,0.858621,0.858809,0.000859,5
3,2474.484029,887.433849,0.869005,0.098129,0.05,2,350,"{'classifier__learning_rate': 0.05, 'classifie...",0.860374,0.861198,0.859811,0.859995,0.859537,0.860183,0.000576,4
4,876.132262,7.018124,0.91209,0.107991,0.05,2,400,"{'classifier__learning_rate': 0.05, 'classifie...",0.861655,0.862388,0.859903,0.860819,0.860178,0.860988,0.000924,3
5,979.451638,6.315458,1.108599,0.273493,0.05,2,450,"{'classifier__learning_rate': 0.05, 'classifie...",0.863578,0.862846,0.860727,0.861185,0.861734,0.862014,0.001055,2
6,925.680881,145.205593,0.82411,0.416121,0.05,2,500,"{'classifier__learning_rate': 0.05, 'classifie...",0.864036,0.863761,0.861734,0.862009,0.861643,0.862637,0.001041,1


In [16]:
# One se-rule
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
6,925.680881,145.205593,0.82411,0.416121,0.05,2,500,"{'classifier__learning_rate': 0.05, 'classifie...",0.864036,0.863761,0.861734,0.862009,0.861643,0.862637,0.001041,1


In [17]:
# Very interesting CV score of 0.862637 --> Let's try it 
clf.set_params(classifier__learning_rate = 0.05, classifier__n_estimators = 500, classifier__max_depth = 2)
clf.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1           1.2030           10.94m
         2           1.1750           10.70m
         3           1.1469           10.53m
         4           1.1216           10.63m
         5           1.1007           10.57m
         6           1.0796           10.51m
         7           1.0619           10.48m
         8           1.0458           10.43m
         9           1.0286           10.39m
        10           1.0151           10.32m
        20           0.9097            9.40m
        30           0.8464            8.95m
        40           0.8071            8.61m
        50           0.7775            8.41m
        60           0.7564            8.32m
        70           0.7382            8.21m
        80           0.7245            8.20m
        90           0.7132            8.17m
       100           0.7025            8.11m
       200           0.6512            6.44m
       300           0.6292            4.39m
       40

In [18]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
y_pred[:10]

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 1])

In [19]:
GB_tree_500_LR0_05_max_depth_2 = test_df[["id"]]
GB_tree_500_LR0_05_max_depth_2["high_income"] = y_pred
GB_tree_500_LR0_05_max_depth_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_500_LR0_05_max_depth_2["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [20]:
GB_tree_500_LR0_05_max_depth_2.to_csv("Predictions/GB_tree_500_LR0_05_max_depth_2.csv",index = False, header=True)
# Worse score than when using 600 tree --> need to increase the nb of tree had a cv of 0.863662

# Final try : 

In [22]:
hyper_param = {"classifier__n_estimators":[700,800,900,1000],
              "classifier__learning_rate":[0.05],
              "classifier__max_depth":[2]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [23]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
      Iter       Train Loss   Remaining Time 
         1           1.2019           30.95m
         2           1.1742           28.91m
         3           1.1462           28.59m
         4           1.1211           28.18m
         5           1.1004           28.20m
         6           1.0795           28.15m
         7           1.0616           28.09m
         8           1.0453           27.96m
         9           1.0283           27.88m
        10           1.0147           27.77m
        20           0.9093           27.27m
        30           0.8481           26.13m
        40           0.8076           25.59m
        50           0.7781           24.99m
        60           0.7562           24.46m
        70           0.7388           23.92m
        80           0.7245           23.41m
        90           0.7136           23.01m
       100           0.7033           22.71m
       200           0.6524           1

        80           0.7245           19.46m
        90           0.7132           19.17m
       100           0.7025           18.91m
      Iter       Train Loss   Remaining Time 
         1           1.2046           30.98m
         2           1.1767           29.00m
         3           1.1484           28.72m
         4           1.1230           28.41m
         5           1.1000           28.33m
         6           1.0809           28.23m
         7           1.0632           28.22m
         8           1.0471           28.04m
         9           1.0298           27.94m
        10           1.0162           27.81m
        20           0.9106           27.34m
        30           0.8471           26.23m
        40           0.8074           25.70m
        50           0.7786           25.15m
        60           0.7563           24.58m
        70           0.7387           24.05m
        80           0.7246           23.55m
        90           0.7126           23.16m
       10

       300           0.6292           14.51m
       400           0.6151           12.54m
       500           0.6058           10.45m
       600           0.5988            8.34m
       700           0.5931            6.25m
       800           0.5887           16.56m
       900           0.5850            7.58m
      1000           0.5812            0.00s


In [24]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,2887.199962,719.174211,1.134479,0.137871,0.05,2,700,"{'classifier__learning_rate': 0.05, 'classifie...",0.865959,0.867424,0.863016,0.864023,0.863657,0.864816,0.001632,4
1,1671.037972,15.835501,1.278044,0.095429,0.05,2,800,"{'classifier__learning_rate': 0.05, 'classifie...",0.867515,0.867607,0.863016,0.864573,0.863566,0.865255,0.001948,3
2,2201.491298,250.485545,1.33444,0.196536,0.05,2,900,"{'classifier__learning_rate': 0.05, 'classifie...",0.869255,0.868705,0.864481,0.864756,0.863016,0.866043,0.002476,2
3,2096.720058,276.138373,0.99538,0.301023,0.05,2,1000,"{'classifier__learning_rate': 0.05, 'classifie...",0.869529,0.869621,0.864115,0.865397,0.863932,0.866519,0.002546,1


In [25]:
# One se-rule
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,2201.491298,250.485545,1.33444,0.196536,0.05,2,900,"{'classifier__learning_rate': 0.05, 'classifie...",0.869255,0.868705,0.864481,0.864756,0.863016,0.866043,0.002476,2


In [26]:
# Very interesting CV score of 0.866043 --> Let's try it 
clf.set_params(classifier__learning_rate = 0.05, classifier__n_estimators = 900, classifier__max_depth = 2)
clf.fit(X,y)

      Iter       Train Loss   Remaining Time 
         1           1.2030           23.60m
         2           1.1750           22.76m
         3           1.1469           22.94m
         4           1.1216           23.16m
         5           1.1007           22.84m
         6           1.0796           22.43m
         7           1.0619           22.06m
         8           1.0458           21.73m
         9           1.0286           21.61m
        10           1.0151           21.53m
        20           0.9097           19.96m
        30           0.8464           19.34m
        40           0.8071           18.86m
        50           0.7775           18.66m
        60           0.7564           18.25m
        70           0.7382           17.94m
        80           0.7245           17.73m
        90           0.7132           17.45m
       100           0.7025           17.18m
       200           0.6512           15.61m
       300           0.6292           13.17m
       40

In [27]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
y_pred[:10]

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1])

In [28]:
GB_tree_900_LR0_05_max_depth_2 = test_df[["id"]]
GB_tree_900_LR0_05_max_depth_2["high_income"] = y_pred
GB_tree_900_LR0_05_max_depth_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_900_LR0_05_max_depth_2["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [29]:
GB_tree_900_LR0_05_max_depth_2.to_csv("Predictions/GB_tree_900_LR0_05_max_depth_2.csv",index = False, header=True)
# cv score = 0.866043

# It takes to much time using a lot of tree and might overfit ! for this reason I will try low amount of tree :


In [11]:
hyper_param = {"classifier__n_estimators":[80,90,100],
              "classifier__learning_rate":[0.6,0.7,0.8,0.9],
              "classifier__max_depth":[2]}
boost_pipe_cv = GridSearchCV(estimator = clf,param_grid=hyper_param, scoring="accuracy",
                           cv = KFold(n_splits=5, shuffle=True, random_state=1), n_jobs=-1, verbose=1)

In [12]:
boost_pipe_cv.fit(X,y)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [14]:
pd.DataFrame(boost_pipe_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,172.952985,6.598915,0.412912,0.081954,0.6,2,80,"{'classifier__learning_rate': 0.6, 'classifier...",0.866691,0.869529,0.863566,0.864664,0.862192,0.865329,0.002565,10
1,209.123173,3.603136,0.49461,0.102585,0.6,2,90,"{'classifier__learning_rate': 0.6, 'classifier...",0.867607,0.871269,0.864023,0.863932,0.863382,0.866043,0.003012,4
2,219.928437,10.968266,0.552994,0.073124,0.6,2,100,"{'classifier__learning_rate': 0.6, 'classifier...",0.867149,0.871544,0.863382,0.865672,0.863016,0.866153,0.003092,3
3,165.632725,3.401566,0.513707,0.025718,0.7,2,80,"{'classifier__learning_rate': 0.7, 'classifier...",0.865226,0.869346,0.865855,0.863749,0.860544,0.864944,0.002866,11
4,181.463003,7.026478,0.514709,0.009378,0.7,2,90,"{'classifier__learning_rate': 0.7, 'classifier...",0.866233,0.87072,0.865672,0.865031,0.862192,0.865969,0.002752,5
5,208.312418,10.615847,0.508773,0.039877,0.7,2,100,"{'classifier__learning_rate': 0.7, 'classifier...",0.868431,0.869346,0.86677,0.864023,0.862558,0.866226,0.002577,2
6,159.05384,1.265936,0.491989,0.053776,0.8,2,80,"{'classifier__learning_rate': 0.8, 'classifier...",0.867698,0.86779,0.864298,0.862375,0.862558,0.864944,0.002383,12
7,184.796974,2.157964,0.495202,0.038192,0.8,2,90,"{'classifier__learning_rate': 0.8, 'classifier...",0.866874,0.867424,0.864481,0.865031,0.865397,0.865841,0.00112,6
8,203.920793,2.993018,0.501579,0.032405,0.8,2,100,"{'classifier__learning_rate': 0.8, 'classifier...",0.868248,0.868156,0.864939,0.864481,0.866587,0.866482,0.001569,1
9,159.647033,1.856604,0.497041,0.004276,0.9,2,80,"{'classifier__learning_rate': 0.9, 'classifier...",0.865135,0.869804,0.863108,0.86558,0.865397,0.865805,0.002188,7


In [15]:
# One se-rule
cv_res = pd.DataFrame(boost_pipe_cv.cv_results_)
mean_scores = boost_pipe_cv.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)
se_scores = boost_pipe_cv.cv_results_["std_test_score"] / np.sqrt(5)

one_stand_error_data_frame = cv_res[boost_pipe_cv.cv_results_["mean_test_score"] >=(mean_scores[best_index] - se_scores[best_index])]
one_stand_error_data_frame[one_stand_error_data_frame["mean_test_score"] == one_stand_error_data_frame["mean_test_score"].min()]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__learning_rate,param_classifier__max_depth,param_classifier__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
9,159.647033,1.856604,0.497041,0.004276,0.9,2,80,"{'classifier__learning_rate': 0.9, 'classifier...",0.865135,0.869804,0.863108,0.86558,0.865397,0.865805,0.002188,7


In [16]:
# Very interesting CV score of 0.865805 --> Let's try it 
clf.set_params(classifier__learning_rate = 0.9, classifier__n_estimators = 80, classifier__max_depth = 2)
clf.fit(X,y)

In [17]:
y_pred = clf.predict(test_df.drop("id",axis = 1))
y_pred[:10]

array([0, 0, 0, 1, 0, 0, 1, 0, 0, 1])

In [18]:
GB_tree_80_LR09_max_depth_2 = test_df[["id"]]
GB_tree_80_LR09_max_depth_2["high_income"] = y_pred
GB_tree_80_LR09_max_depth_2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  GB_tree_80_LR09_max_depth_2["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [19]:
GB_tree_80_LR09_max_depth_2.to_csv("Predictions/GB_tree_80_LR09_max_depth_2.csv",index = False, header=True)
