In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

In [2]:
# Load scikit-learn's bundled iris dataset; later cells read its 'data',
# 'target' and 'feature_names' entries.
iris = datasets.load_iris()

In [3]:
# Build one table holding the iris feature columns followed by the class label.
iris_columns = iris['feature_names'] + ['target']
iris_table = np.c_[iris['data'], iris['target']]
df = pd.DataFrame(iris_table, columns=iris_columns)

In [4]:
# Preview the first rows of the iris table (features plus 'target').
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Hold out part of the iris samples for evaluation; random_state pins the shuffle.
features, labels = iris['data'], iris['target']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=2,
)

In [7]:
from xgboost import XGBClassifier

In [8]:
from sklearn.metrics import accuracy_score

In [9]:
# Tree-booster classifier for the iris split; hyperparameters are collected in
# one mapping so they can be read (and tuned) at a glance.
classifier_params = dict(
    booster='gbtree',
    objective='multi:softprob',
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)
xgb = XGBClassifier(**classifier_params)

In [10]:
# Train the classifier on the training portion of the iris split.
xgb.fit(X_train, y_train)

In [11]:
# Predict labels for the held-out samples; compared against y_test below.
y_pred = xgb.predict(X_test)

In [12]:
# accuracy_score is declared as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first. Accuracy is symmetric, so the value is
# unchanged, but the conventional order keeps this call correct if the metric
# is ever swapped for an asymmetric one (precision, recall, ...).
score = accuracy_score(y_test, y_pred)

In [13]:
# Report the hold-out accuracy computed above.
print(f"Score : {score}")

Score : 0.9736842105263158


In [14]:
# Load the diabetes data as a (features, target) pair for the regression example.
# Note: X and y are rebound to the Higgs data further down the notebook.
X,y = datasets.load_diabetes(return_X_y=True)

In [15]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [17]:
# Tree-booster regressor with a squared-error objective; this rebinds `xgb`,
# replacing the classifier defined earlier.
regressor_params = dict(
    booster="gbtree",
    objective="reg:squarederror",
    max_depth=6,
    learning_rate=0.1,
    n_estimators=100,
    random_state=2,
    n_jobs=-1,
)
xgb = XGBRegressor(**regressor_params)

In [18]:
# 5-fold cross-validation; scikit-learn reports MSE negated, so the values in
# `scores` are negative and are sign-flipped before taking the square root.
scores = cross_val_score(
    estimator=xgb,
    X=X,
    y=y,
    scoring="neg_mean_squared_error",
    cv=5,
)

In [19]:
# Flip the sign of the negated-MSE scores, then take the root to get per-fold RMSE.
mse_per_fold = -scores
rmse = np.sqrt(mse_per_fold)

In [20]:
# Show the per-fold RMSE rounded to three decimals.
rounded_rmse = np.round(rmse, 3)
print("RMSE : ", rounded_rmse)

RMSE :  [63.011 59.705 64.538 63.706 64.588]


In [21]:
# Average RMSE across the folds, formatted to three decimals.
print(f"Mean RMSE : {rmse.mean():.3f}")

Mean RMSE : 63.109


In [23]:
# Summary statistics of the regression target, laid out as a single row so the
# RMSE above can be read against the target's spread.
target_summary = pd.DataFrame(y).describe()
target_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
0,442.0,152.133484,77.093005,25.0,87.0,140.5,211.5,346.0


In [24]:
# Read the first 250,000 rows of the gzip-compressed Higgs challenge CSV.
# This rebinds `df`, replacing the iris table built earlier.
higgs_csv = "Data/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn/Chapter05/atlas-higgs-challenge-2014-v2.csv.gz"
df = pd.read_csv(higgs_csv, compression="gzip", nrows=250000)

In [25]:
# List column names, non-null counts and dtypes of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 35 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [26]:
# Preview the first rows of the Higgs data as loaded, before any column changes.
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label,KaggleSet,KaggleWeight
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2.15,0.444,46.062,1.24,-2.475,113.497,0.000814,s,t,0.002653
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,0.725,1.158,-999.0,-999.0,-999.0,46.226,0.681042,b,t,2.233584
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,2.053,-2.028,-999.0,-999.0,-999.0,44.251,0.715742,b,t,2.347389
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,1.660654,b,t,5.446378
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,1.904263,b,t,6.245333


In [29]:
# Remove the original 'Weight' column and the 'KaggleSet' column; the
# 'KaggleWeight' column stays and takes over the 'Weight' name in the next cell.
df = df.drop(columns=["Weight", "KaggleSet"])

In [30]:
# Give the remaining Kaggle weight column the plain name 'Weight'.
df = df.rename(columns={'KaggleWeight': 'Weight'})

In [31]:
# Move 'Label' to the last position: remember the column, remove it from the
# frame, then assign it back so it is appended after every other column.
label_col = df['Label']
del df['Label']
df['Label'] = label_col

In [32]:
# Confirm the new layout: 'Weight' and 'Label' are now the last two columns.
df.head()

Unnamed: 0,EventId,DER_mass_MMC,DER_mass_transverse_met_lep,DER_mass_vis,DER_pt_h,DER_deltaeta_jet_jet,DER_mass_jet_jet,DER_prodeta_jet_jet,DER_deltar_tau_lep,DER_pt_tot,...,PRI_jet_num,PRI_jet_leading_pt,PRI_jet_leading_eta,PRI_jet_leading_phi,PRI_jet_subleading_pt,PRI_jet_subleading_eta,PRI_jet_subleading_phi,PRI_jet_all_pt,Weight,Label
0,100000,138.47,51.655,97.827,27.98,0.91,124.711,2.666,3.064,41.928,...,2,67.435,2.15,0.444,46.062,1.24,-2.475,113.497,0.002653,s
1,100001,160.937,68.768,103.235,48.146,-999.0,-999.0,-999.0,3.473,2.078,...,1,46.226,0.725,1.158,-999.0,-999.0,-999.0,46.226,2.233584,b
2,100002,-999.0,162.172,125.953,35.635,-999.0,-999.0,-999.0,3.148,9.336,...,1,44.251,2.053,-2.028,-999.0,-999.0,-999.0,44.251,2.347389,b
3,100003,143.905,81.417,80.943,0.414,-999.0,-999.0,-999.0,3.31,0.414,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-0.0,5.446378,b
4,100004,175.864,16.915,134.805,16.405,-999.0,-999.0,-999.0,3.891,16.405,...,0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,0.0,6.245333,b


In [33]:
# Re-check column count and dtypes after dropping and renaming columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 33 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   EventId                      250000 non-null  int64  
 1   DER_mass_MMC                 250000 non-null  float64
 2   DER_mass_transverse_met_lep  250000 non-null  float64
 3   DER_mass_vis                 250000 non-null  float64
 4   DER_pt_h                     250000 non-null  float64
 5   DER_deltaeta_jet_jet         250000 non-null  float64
 6   DER_mass_jet_jet             250000 non-null  float64
 7   DER_prodeta_jet_jet          250000 non-null  float64
 8   DER_deltar_tau_lep           250000 non-null  float64
 9   DER_pt_tot                   250000 non-null  float64
 10  DER_sum_pt                   250000 non-null  float64
 11  DER_pt_ratio_lep_tau         250000 non-null  float64
 12  DER_met_phi_centrality       250000 non-null  float64
 13 

In [34]:
# Encode the label as integers: 's' -> 1, 'b' -> 0.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on the df['Label'] selection: with pandas
# Copy-on-Write that selection is a temporary object, so an in-place update
# on it never reaches df. The explicit cast pins an integer dtype regardless
# of how replace() infers the result, and fails loudly on any unexpected value.
df['Label'] = df['Label'].replace({'s': 1, 'b': 0}).astype('int64')

In [35]:
# Select features and target by column name rather than by position, so the
# cell still picks the right data when re-run after 'test_weight' has been
# appended to df (df.iloc[:, -1] would then return the weights, not the label).
non_feature_cols = ['EventId', 'Weight', 'Label', 'test_weight']
X = df[[col for col in df.columns if col not in non_feature_cols]]
y = df['Label']
# Same 30 columns the positional slice 1:31 selected.
assert X.shape[1] == 30, "expected 30 feature columns"

In [36]:
# Rescale each event weight by 550000 / (number of loaded rows).
# NOTE(review): 550000 is presumably the size of the challenge test set — confirm.
rescale_numerator = 550000
df['test_weight'] = (df['Weight'] * rescale_numerator) / len(y)

In [37]:
# Total rescaled weight of the signal (Label == 1) and background (Label == 0) events.
is_signal = df['Label'] == 1
is_background = df['Label'] == 0
s = np.sum(df['test_weight'][is_signal])
b = np.sum(df['test_weight'][is_background])

In [40]:
# Ratio of total background weight to total signal weight; the same ratio is
# used below as the booster's scale_pos_weight.
b/s

593.9401931492318

In [41]:
import xgboost as xgb

In [52]:
# Pack features, labels and per-event weights into a DMatrix; entries equal
# to -999.0 are declared as missing values. Here `xgb` is the xgboost module.
xgmat = xgb.DMatrix(
    data=X,
    label=y,
    missing=-999.0,
    weight=df['test_weight'],
)

In [43]:
# Booster parameters, gathered in a single literal.
param = {
    "objective": "binary:logitraw",
    "scale_pos_weight": b/s,  # background/signal weight ratio computed above
    "eta": 0.1,
    "max_depth": 6,
    "eval_metric": "auc",
}

# A dict cannot hold 'eval_metric' twice, so the second metric is appended to
# the list-of-pairs form of the parameters that is handed to xgb.train.
plst = [*param.items(), ('eval_metric', 'ams@0.15')]

In [55]:
# Datasets evaluated after each boosting round: only the training matrix,
# reported under the name 'train'.
watchlist = [(xgmat, 'train')]

In [51]:
# Number of boosting rounds handed to xgb.train.
num_round = 120

In [56]:
print("loading data end, start to boost trees")
# Arguments after num_boost_round are keyword-only in current XGBoost
# releases; passing the watchlist as `evals=` avoids the deprecated
# positional form and makes the role of each argument explicit.
bst = xgb.train(
    params=plst,
    dtrain=xgmat,
    num_boost_round=num_round,
    evals=watchlist,
)
# NOTE(review): newer XGBoost versions prefer a .json/.ubj extension for saved
# models — confirm before renaming, since other code may load "higgs.model".
bst.save_model("higgs.model")
print("finished training")

loading data end, start to boost trees




[0]	train-auc:0.91091	train-ams@0.15:3.69846
[1]	train-auc:0.91535	train-ams@0.15:3.99148
[2]	train-auc:0.91800	train-ams@0.15:4.09947
[3]	train-auc:0.91953	train-ams@0.15:4.25035
[4]	train-auc:0.92050	train-ams@0.15:4.24004
[5]	train-auc:0.92133	train-ams@0.15:4.24742
[6]	train-auc:0.92226	train-ams@0.15:4.33231
[7]	train-auc:0.92338	train-ams@0.15:4.35929
[8]	train-auc:0.92389	train-ams@0.15:4.37476
[9]	train-auc:0.92427	train-ams@0.15:4.36366
[10]	train-auc:0.92484	train-ams@0.15:4.36513
[11]	train-auc:0.92543	train-ams@0.15:4.40817
[12]	train-auc:0.92584	train-ams@0.15:4.41476
[13]	train-auc:0.92645	train-ams@0.15:4.46194
[14]	train-auc:0.92688	train-ams@0.15:4.43823
[15]	train-auc:0.92738	train-ams@0.15:4.47224
[16]	train-auc:0.92805	train-ams@0.15:4.52076
[17]	train-auc:0.92848	train-ams@0.15:4.57117
[18]	train-auc:0.92903	train-ams@0.15:4.60247
[19]	train-auc:0.92937	train-ams@0.15:4.65436
[20]	train-auc:0.92989	train-ams@0.15:4.69182
[21]	train-auc:0.93018	train-ams@0.15:4.7138