 # Age Prediction for Scenario 1

In [1]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
#Importing all necessary libraries
import pandas as pd
import numpy as np
import s3fs
from matplotlib import pyplot as plt 
%matplotlib inline
import seaborn as sns

In [3]:
#loading datasets
scenario_1=pd.read_parquet("s3://capstonemlc/Final_data/scenario_1.parquet")

# Data preprocessing

In [4]:
#shape of dataframe
scenario_1.shape

(23144, 18)

In [5]:
#basic info of dataframe
scenario_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23144 entries, 0 to 23143
Data columns (total 18 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   device_id             23144 non-null  int64  
 1   brand                 23144 non-null  object 
 2   model                 23144 non-null  object 
 3   midnight_counts       23144 non-null  float64
 4   daytime_counts        23144 non-null  float64
 5   early_morning_counts  23144 non-null  float64
 6   night_counts          23144 non-null  float64
 7   monday_counts         23144 non-null  float64
 8   tuesday_counts        23144 non-null  float64
 9   wednesday_counts      23144 non-null  float64
 10  thursday_counts       23144 non-null  float64
 11  friday_counts         23144 non-null  float64
 12  saturday_counts       23144 non-null  float64
 13  sunday_counts         23144 non-null  float64
 14  category              23144 non-null  object 
 15  cluster            

In [6]:
scenario_1.head()

Unnamed: 0,device_id,brand,model,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,sunday_counts,category,cluster,gender,age
0,-7548291590301750000,Huawei,è£è€€3C,73.0,166.0,31.0,22.0,62.0,86.0,93.0,6.0,0.0,0.0,45.0,"industry tag,nature ,personal effectiveness ,i...",0,M,33
1,-1819925713085810000,OPPO,N1 Mini,13.0,16.0,7.0,4.0,12.0,5.0,6.0,4.0,3.0,4.0,6.0,"industry tag,im,relatives ,notes,personal effe...",0,F,23
2,3670076507269740000,Meizu,menote1 2,28.0,56.0,15.0,12.0,21.0,18.0,19.0,17.0,7.0,16.0,13.0,"radio,industry tag,cozy ,services",1,M,33
3,5333872006968810000,Xiaomi,xnote,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,5.0,"debit and credit,others,im,industry tag,relati...",2,M,34
4,5263633571423510000,Huawei,hu1 Plus,10.0,27.0,0.0,0.0,3.0,8.0,6.0,4.0,9.0,7.0,0.0,"im,industry tag,relatives ,services ,weibo,not...",0,M,27


In [7]:
#splitting the category column
scenario_1.category = scenario_1.category.str.split(',')

In [8]:
#first 5 rows
scenario_1.head()

Unnamed: 0,device_id,brand,model,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,sunday_counts,category,cluster,gender,age
0,-7548291590301750000,Huawei,è£è€€3C,73.0,166.0,31.0,22.0,62.0,86.0,93.0,6.0,0.0,0.0,45.0,"[industry tag, nature , personal effectiveness...",0,M,33
1,-1819925713085810000,OPPO,N1 Mini,13.0,16.0,7.0,4.0,12.0,5.0,6.0,4.0,3.0,4.0,6.0,"[industry tag, im, relatives , notes, personal...",0,F,23
2,3670076507269740000,Meizu,menote1 2,28.0,56.0,15.0,12.0,21.0,18.0,19.0,17.0,7.0,16.0,13.0,"[radio, industry tag, cozy , services ]",1,M,33
3,5333872006968810000,Xiaomi,xnote,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,5.0,"[debit and credit, others, im, industry tag, r...",2,M,34
4,5263633571423510000,Huawei,hu1 Plus,10.0,27.0,0.0,0.0,3.0,8.0,6.0,4.0,9.0,7.0,0.0,"[im, industry tag, relatives , services , weib...",0,M,27


In [9]:
#changing null categories into null lists
scenario_1.category.loc[scenario_1.category.isnull()] = scenario_1.category.loc[scenario_1.category.isnull()].apply(lambda x: [])

In [10]:
#one-hot encoding for categories
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
scenario_1 = scenario_1.join(pd.DataFrame(mlb.fit_transform(scenario_1.pop('category')),
                          columns=mlb.classes_,
                          index=scenario_1.index))

In [11]:
# converting type of columns to 'category'
scenario_1['brand'] = scenario_1['brand'].astype('category')
scenario_1['model'] = scenario_1['model'].astype('category')
scenario_1['cluster'] = scenario_1['cluster'].astype('category')
scenario_1['gender'] = scenario_1['gender'].astype('category')

In [12]:
#Basic info dataframe
scenario_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23144 entries, 0 to 23143
Data columns (total 68 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   device_id                23144 non-null  int64   
 1   brand                    23144 non-null  category
 2   model                    23144 non-null  category
 3   midnight_counts          23144 non-null  float64 
 4   daytime_counts           23144 non-null  float64 
 5   early_morning_counts     23144 non-null  float64 
 6   night_counts             23144 non-null  float64 
 7   monday_counts            23144 non-null  float64 
 8   tuesday_counts           23144 non-null  float64 
 9   wednesday_counts         23144 non-null  float64 
 10  thursday_counts          23144 non-null  float64 
 11  friday_counts            23144 non-null  float64 
 12  saturday_counts          23144 non-null  float64 
 13  sunday_counts            23144 non-null  float64 
 14  cluste

In [13]:
scenario_1.head()

Unnamed: 0,device_id,brand,model,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,...,smart shopping,sports and gym,taxi,technology information,tencent,third party payment,total cost,video,wealth management,weibo
0,-7548291590301750000,Huawei,è£è€€3C,73.0,166.0,31.0,22.0,62.0,86.0,93.0,...,0,0,0,1,0,0,0,0,0,0
1,-1819925713085810000,OPPO,N1 Mini,13.0,16.0,7.0,4.0,12.0,5.0,6.0,...,0,0,0,0,0,1,0,0,0,0
2,3670076507269740000,Meizu,menote1 2,28.0,56.0,15.0,12.0,21.0,18.0,19.0,...,0,0,0,0,0,0,0,0,0,0
3,5333872006968810000,Xiaomi,xnote,6.0,9.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5263633571423510000,Huawei,hu1 Plus,10.0,27.0,0.0,0.0,3.0,8.0,6.0,...,0,0,0,1,0,1,1,0,0,1


# Encoding

In [14]:
# generate binary values using get_dummies
scenario_1= pd.get_dummies(scenario_1, columns=["brand"], prefix=["brand"] )
scenario_1= pd.get_dummies(scenario_1, columns=["model"], prefix=["model"] )
scenario_1= pd.get_dummies(scenario_1, columns=["cluster"], prefix=["cluster"] )
scenario_1= pd.get_dummies(scenario_1, columns=["gender"], prefix=["gender"] )

In [15]:
#first 5 rows
scenario_1.head()

Unnamed: 0,device_id,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
0,-7548291590301750000,73.0,166.0,31.0,22.0,62.0,86.0,93.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,-1819925713085810000,13.0,16.0,7.0,4.0,12.0,5.0,6.0,4.0,3.0,...,0,0,0,0,0,0,0,0,1,0
2,3670076507269740000,28.0,56.0,15.0,12.0,21.0,18.0,19.0,17.0,7.0,...,0,0,0,0,0,0,0,0,0,1
3,5333872006968810000,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,5263633571423510000,10.0,27.0,0.0,0.0,3.0,8.0,6.0,4.0,9.0,...,0,0,0,0,0,0,0,0,0,1


# Split data

In [16]:
#Loading dataset
train_test_split=pd.read_csv("s3://capstonemlc/train_test_split.csv")

### Train data

In [17]:
#extracting the train dataset
train_split = train_test_split[train_test_split['train_test_flag'] == "train"]

In [18]:
#frist 5 rows
train_split.head()

Unnamed: 0,device_id,gender,age,group,train_test_flag
0,-7548291590301750000,M,33,M32+,train
1,6943568600617760000,M,37,M32+,train
2,5441349705980020000,M,40,M32+,train
3,-5393876656119450000,M,33,M32+,train
4,4543988487649880000,M,53,M32+,train


In [19]:
#making a list of train device ids
train_devideId_values = train_split.device_id.values

### Test data

In [20]:
#extracting test data
test_split = train_test_split[train_test_split['train_test_flag'] == "test"]

In [21]:
#first 5 rows
test_split.head()

Unnamed: 0,device_id,gender,age,group,train_test_flag
17481,2948104315232910000,F,65,F32+,test
17482,8231243155939480000,F,47,F32+,test
17483,-3994292212856080000,F,31,F25-32,test
17484,7217910398487470000,M,29,M25-32,test
17485,8642523170587800000,F,31,F25-32,test


In [22]:
#making a list of teat device ids 
test_deviveID_values = test_split.device_id.values

### Train-test datasets

In [23]:
#splitting scenario_1 into train and test sets
scenario1_train = scenario_1.loc[scenario_1['device_id'].isin(train_devideId_values)]
scenario1_test = scenario_1.loc[scenario_1['device_id'].isin(test_deviveID_values)]

In [24]:
#Print the size (rows and columns) in all the data frames
print("scenario1_train = ",scenario1_train.shape," Rows = ",scenario1_train.shape[0]," Columns = ",scenario1_train.shape[1])
print("scenario1_test = ",scenario1_test.shape," Rows = ",scenario1_test.shape[0]," Columns = ",scenario1_test.shape[1])

scenario1_train =  (17357, 320)  Rows =  17357  Columns =  320
scenario1_test =  (5787, 320)  Rows =  5787  Columns =  320


In [25]:
#first 5 rows
scenario1_train.head()

Unnamed: 0,device_id,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
0,-7548291590301750000,73.0,166.0,31.0,22.0,62.0,86.0,93.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,-1819925713085810000,13.0,16.0,7.0,4.0,12.0,5.0,6.0,4.0,3.0,...,0,0,0,0,0,0,0,0,1,0
2,3670076507269740000,28.0,56.0,15.0,12.0,21.0,18.0,19.0,17.0,7.0,...,0,0,0,0,0,0,0,0,0,1
3,5333872006968810000,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,5263633571423510000,10.0,27.0,0.0,0.0,3.0,8.0,6.0,4.0,9.0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
#first 5 rows
scenario1_test.head()

Unnamed: 0,device_id,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
5417,-3994292212856080000,0.0,5.0,0.0,3.0,3.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,1,0
5418,8836571545583980000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
5419,-727204878593826000,0.0,5.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,...,0,0,0,0,0,0,0,0,0,1
5420,8256066307270200000,45.0,115.0,22.0,25.0,16.0,30.0,29.0,47.0,33.0,...,0,0,0,0,0,0,0,0,1,0
5421,7850241797817570000,6.0,35.0,3.0,8.0,0.0,7.0,10.0,3.0,12.0,...,0,0,0,0,0,0,0,0,0,1


In [27]:
#Train dataset
X_train= scenario1_train.drop(['age'], axis=1)
y_train = scenario1_train[['age']]

In [28]:
#frist 5 rows
X_train.head()

Unnamed: 0,device_id,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
0,-7548291590301750000,73.0,166.0,31.0,22.0,62.0,86.0,93.0,6.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,-1819925713085810000,13.0,16.0,7.0,4.0,12.0,5.0,6.0,4.0,3.0,...,0,0,0,0,0,0,0,0,1,0
2,3670076507269740000,28.0,56.0,15.0,12.0,21.0,18.0,19.0,17.0,7.0,...,0,0,0,0,0,0,0,0,0,1
3,5333872006968810000,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
4,5263633571423510000,10.0,27.0,0.0,0.0,3.0,8.0,6.0,4.0,9.0,...,0,0,0,0,0,0,0,0,0,1


In [29]:
X_train= X_train.drop(X_train.iloc[:,0:1], axis=1)

In [30]:
#frist 5 rows
X_train.head()

Unnamed: 0,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
0,73.0,166.0,31.0,22.0,62.0,86.0,93.0,6.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,13.0,16.0,7.0,4.0,12.0,5.0,6.0,4.0,3.0,4.0,...,0,0,0,0,0,0,0,0,1,0
2,28.0,56.0,15.0,12.0,21.0,18.0,19.0,17.0,7.0,16.0,...,0,0,0,0,0,0,0,0,0,1
3,6.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,...,0,0,0,0,0,0,0,0,0,1
4,10.0,27.0,0.0,0.0,3.0,8.0,6.0,4.0,9.0,7.0,...,0,0,0,0,0,0,0,0,0,1


In [31]:
#first 5 rows
y_train.head()

Unnamed: 0,age
0,33
1,23
2,33
3,34
4,27


In [32]:
#Test dataset
X_test= scenario1_test.drop(['age'], axis=1)
y_test = scenario1_test[['age']]

In [33]:
#frist 5 rows
X_test.head()

Unnamed: 0,device_id,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
5417,-3994292212856080000,0.0,5.0,0.0,3.0,3.0,1.0,1.0,0.0,1.0,...,0,0,0,0,0,0,0,0,1,0
5418,8836571545583980000,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
5419,-727204878593826000,0.0,5.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,...,0,0,0,0,0,0,0,0,0,1
5420,8256066307270200000,45.0,115.0,22.0,25.0,16.0,30.0,29.0,47.0,33.0,...,0,0,0,0,0,0,0,0,1,0
5421,7850241797817570000,6.0,35.0,3.0,8.0,0.0,7.0,10.0,3.0,12.0,...,0,0,0,0,0,0,0,0,0,1


In [34]:
X_test= X_test.drop(X_test.iloc[:,0:1], axis=1)

In [35]:
#frist 5 rows
X_test.head()

Unnamed: 0,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
5417,0.0,5.0,0.0,3.0,3.0,1.0,1.0,0.0,1.0,2.0,...,0,0,0,0,0,0,0,0,1,0
5418,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,1,0
5419,0.0,5.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0,0,0,0,0,0,0,0,0,1
5420,45.0,115.0,22.0,25.0,16.0,30.0,29.0,47.0,33.0,31.0,...,0,0,0,0,0,0,0,0,1,0
5421,6.0,35.0,3.0,8.0,0.0,7.0,10.0,3.0,12.0,7.0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
#first 5 rows
y_test.head()

Unnamed: 0,age
5417,31
5418,33
5419,22
5420,28
5421,26


### Exporting Test data

In [37]:
#Saving scenario 1 test data as csv
scenario1_test.to_csv('test_data.csv')

# Feature Scaling

In [38]:
#importing library
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train[['midnight_counts', 'daytime_counts', 'early_morning_counts', 'night_counts', 'monday_counts', 'tuesday_counts', 'wednesday_counts', 'thursday_counts', 'friday_counts', 'saturday_counts', 'sunday_counts']] = scaler.fit_transform(X_train[['midnight_counts', 'daytime_counts', 'early_morning_counts', 'night_counts', 'monday_counts', 'tuesday_counts', 'wednesday_counts', 'thursday_counts', 'friday_counts', 'saturday_counts', 'sunday_counts']])

X_train.head()

Unnamed: 0,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
0,0.090458,0.071768,0.04212,0.021675,0.022669,0.086,0.077824,0.007557,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,0.016109,0.006917,0.009511,0.003941,0.004388,0.005,0.005021,0.005038,0.003778,0.007737,...,0,0,0,0,0,0,0,0,1,0
2,0.034696,0.024211,0.02038,0.011823,0.007678,0.018,0.0159,0.021411,0.008816,0.030948,...,0,0,0,0,0,0,0,0,0,1
3,0.007435,0.003891,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019342,...,0,0,0,0,0,0,0,0,0,1
4,0.012392,0.011673,0.0,0.0,0.001097,0.008,0.005021,0.005038,0.011335,0.01354,...,0,0,0,0,0,0,0,0,0,1


# Model Building 

## Linear regression

##### The target variable "age" is a continuous variable,  so as a starting point and for simplicity, linear regression algorithm is considered. The modelling can also be done using multiclass logistic regression algorithm. Since most of the variables are categorical, logistic regression may get more complicated to execute. So to avoid the complexity and ease of understanding the metrics and execution the basic classic linear regression algorithm is opted for modelling.

### Training model

In [39]:
#list of X_train columns
col=list(X_train.columns)

In [40]:
#importing libraray
import statsmodels.api as sm

In [41]:
# initializing Linear regression object and fitting model
lr1 = sm.GLS(y_train,(sm.add_constant(X_train)))
lr1.fit().summary()

0,1,2,3
Dep. Variable:,age,R-squared:,0.14
Model:,GLS,Adj. R-squared:,0.125
Method:,Least Squares,F-statistic:,9.405
Date:,"Sun, 16 Jan 2022",Prob (F-statistic):,0.0
Time:,20:56:43,Log-Likelihood:,-62933.0
No. Observations:,17357,AIC:,126500.0
Df Residuals:,17061,BIC:,128800.0
Df Model:,295,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-9.464e+12,1.51e+13,-0.627,0.531,-3.91e+13,2.01e+13
midnight_counts,-5.046e+11,8.05e+11,-0.627,0.531,-2.08e+12,1.07e+12
daytime_counts,-1.446e+12,2.31e+12,-0.627,0.531,-5.97e+12,3.08e+12
early_morning_counts,-4.602e+11,7.35e+11,-0.627,0.531,-1.9e+12,9.8e+11
night_counts,-6.347e+11,1.01e+12,-0.627,0.531,-2.62e+12,1.35e+12
monday_counts,1.71e+12,2.73e+12,0.627,0.531,-3.64e+12,7.06e+12
tuesday_counts,6.253e+11,9.98e+11,0.627,0.531,-1.33e+12,2.58e+12
wednesday_counts,7.472e+11,1.19e+12,0.627,0.531,-1.59e+12,3.09e+12
thursday_counts,4.965e+11,7.92e+11,0.627,0.531,-1.06e+12,2.05e+12

0,1,2,3
Omnibus:,3716.067,Durbin-Watson:,1.988
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8447.431
Skew:,1.219,Prob(JB):,0.0
Kurtosis:,5.396,Cond. No.,3.63e+16


In [42]:
#Recursive feature elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()
rfe = RFE(linreg)
rfe = rfe.fit(X_train, y_train.values.ravel())
print(rfe.support_)
print(rfe.ranking_)

[ True  True  True  True  True  True  True  True  True  True False False
  True  True False False False False False False False False  True  True
  True False False False False False False False False False  True False
 False False False False False False False False  True False False False
 False False False False False False False  True False  True False  True
 False  True False False False False  True  True False False False False
  True False  True  True False  True  True False False False False  True
 False False False False  True  True  True False  True False  True False
  True False False False False  True  True  True  True  True  True  True
  True False  True False False  True False  True  True  True False  True
 False  True  True False  True  True  True  True False False False  True
  True False  True  True  True  True False False  True False False  True
 False  True False False  True  True  True False  True  True  True  True
 False False  True False False  True False  True Fa

In [43]:
#list of  X_tain columns with rfe support and rankings 
list(zip(X_train.columns, rfe.support_, rfe.ranking_))

[('midnight_counts', True, 1),
 ('daytime_counts', True, 1),
 ('early_morning_counts', True, 1),
 ('night_counts', True, 1),
 ('monday_counts', True, 1),
 ('tuesday_counts', True, 1),
 ('wednesday_counts', True, 1),
 ('thursday_counts', True, 1),
 ('friday_counts', True, 1),
 ('saturday_counts', True, 1),
 ('sunday_counts', False, 14),
 ('bank financing', False, 66),
 ('community', True, 1),
 ('consumer finance', True, 1),
 ('convenience services', False, 72),
 ('cozy ', False, 24),
 ('debit and credit', False, 16),
 ('direct bank', False, 140),
 ('direct banking', False, 71),
 ('ds_p2p net loan', False, 131),
 ('fashion', False, 110),
 ('finance', False, 148),
 ('financial', True, 1),
 ('financial services', True, 1),
 ('health', True, 1),
 ('higher income', False, 7),
 ('im', False, 57),
 ('imf', False, 26),
 ('industry tag', False, 87),
 ('internet banking', False, 93),
 ('liquid medium', False, 92),
 ('low liquidity', False, 69),
 ('low profitability', False, 65),
 ('mobile bank', 

In [44]:
#X_train columns with rfe support
col_rfe = X_train.columns[rfe.support_]

In [45]:
#X_train columns without rfe support
X_train.columns[~rfe.support_]

Index(['sunday_counts', 'bank financing', 'convenience services', 'cozy ',
       'debit and credit', 'direct bank', 'direct banking', 'ds_p2p net loan',
       'fashion', 'finance',
       ...
       'cluster_79', 'cluster_81', 'cluster_82', 'cluster_87', 'cluster_88',
       'cluster_89', 'cluster_90', 'cluster_91', 'gender_F', 'gender_M'],
      dtype='object', length=159)

In [46]:
#Retraining model with new initialized linear regression object and fitting model
X_train_sm = sm.add_constant(X_train[col_rfe])
lr_final = sm.GLS(y_train,X_train_sm)
res = lr_final.fit()
res.summary()

0,1,2,3
Dep. Variable:,age,R-squared:,0.124
Model:,GLS,Adj. R-squared:,0.116
Method:,Least Squares,F-statistic:,15.34
Date:,"Sun, 16 Jan 2022",Prob (F-statistic):,0.0
Time:,20:59:00,Log-Likelihood:,-63090.0
No. Observations:,17357,AIC:,126500.0
Df Residuals:,17197,BIC:,127700.0
Df Model:,159,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,33.7648,0.143,236.539,0.000,33.485,34.045
midnight_counts,-13.9808,4.517,-3.095,0.002,-22.834,-5.128
daytime_counts,-1.4361,10.677,-0.134,0.893,-22.365,19.493
early_morning_counts,32.3747,4.779,6.774,0.000,23.007,41.742
night_counts,-2.3029,7.090,-0.325,0.745,-16.200,11.594
monday_counts,-19.0845,17.536,-1.088,0.276,-53.457,15.288
tuesday_counts,7.2299,7.084,1.021,0.307,-6.656,21.116
wednesday_counts,7.4638,8.308,0.898,0.369,-8.821,23.749
thursday_counts,4.6999,5.588,0.841,0.400,-6.254,15.654

0,1,2,3
Omnibus:,3755.196,Durbin-Watson:,1.987
Prob(Omnibus):,0.0,Jarque-Bera (JB):,8508.447
Skew:,1.233,Prob(JB):,0.0
Kurtosis:,5.385,Cond. No.,388.0


In [245]:
# Getting the predicted values on the train set
y_train_pred =res.predict(X_train_sm)
y_train_pred[:10]

0    35.942793
1    28.536455
2    30.059112
3    33.594904
4    32.962685
5    32.632218
6    29.802952
7    33.661697
8    29.587672
9    32.601177
dtype: float64

In [246]:
#Making dataframe with predictions array and y_train
y_train_pred_final = pd.DataFrame({'age':y_train.values.reshape(-1), 'age_pred':y_train_pred})
y_train_pred_final.head()

Unnamed: 0,age,age_pred
0,33,35.942793
1,23,28.536455
2,33,30.059112
3,34,33.594904
4,27,32.962685


In [247]:
#changing data type
y_train_pred_final['age_pred']=y_train_pred_final['age_pred'].astype('int')

In [248]:
#First 5 rows
y_train_pred_final.head()

Unnamed: 0,age,age_pred
0,33,35
1,23,28
2,33,30
3,34,33
4,27,32


In [250]:
#Taking a 25% sample from y_train data
y_train_lr=y_train_pred_final.sample(frac =.25)

In [251]:
#Calculating the percentage population distribution (PPD)
y_train_lr['percent_population_dist']=((y_train_lr['age']-y_train_lr['age_pred'])/y_train_lr['age'])*100

In [253]:
# importing r2_score module
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# predicting the accuracy score
score=r2_score(y_train,y_train_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_train,y_train_pred)))
print('percentage population distribution is ', y_train_lr['percent_population_dist'].mean())

r2 socre is 0.09906516559343148
rmse is 9.30048851425341
percentage population distribution is  -5.283601229542258


### Test data

In [136]:
#first 5 rows
X_test.head()

Unnamed: 0,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
5417,0.0,5.0,0.0,3.0,3.0,1.0,1.0,0.0,1.0,2.0,...,0,0,0,0,0,0,0,0,1,0
5418,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0,0,0,0,0,0,0,0,1,0
5419,0.0,5.0,0.0,6.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0,0,0,0,0,0,0,0,0,1
5420,45.0,115.0,22.0,25.0,16.0,30.0,29.0,47.0,33.0,31.0,...,0,0,0,0,0,0,0,0,1,0
5421,6.0,35.0,3.0,8.0,0.0,7.0,10.0,3.0,12.0,7.0,...,0,0,0,0,0,0,0,0,0,1


In [137]:
#Feature Scaling
X_test[['midnight_counts', 'daytime_counts', 'early_morning_counts', 'night_counts', 'monday_counts', 'tuesday_counts', 'wednesday_counts', 'thursday_counts', 'friday_counts', 'saturday_counts', 'sunday_counts']] = scaler.fit_transform(X_test[['midnight_counts', 'daytime_counts', 'early_morning_counts', 'night_counts', 'monday_counts', 'tuesday_counts', 'wednesday_counts', 'thursday_counts', 'friday_counts', 'saturday_counts', 'sunday_counts']])

In [138]:
X_test.head()

Unnamed: 0,midnight_counts,daytime_counts,early_morning_counts,night_counts,monday_counts,tuesday_counts,wednesday_counts,thursday_counts,friday_counts,saturday_counts,...,cluster_84,cluster_85,cluster_86,cluster_87,cluster_88,cluster_89,cluster_90,cluster_91,gender_F,gender_M
5417,0.0,0.002643,0.0,0.004104,0.004412,0.001351,0.001038,0.0,0.002825,0.001538,...,0,0,0,0,0,0,0,0,1,0
5418,0.001036,0.000529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001538,...,0,0,0,0,0,0,0,0,1,0
5419,0.0,0.002643,0.0,0.008208,0.0,0.0,0.0,0.0,0.014124,0.0,...,0,0,0,0,0,0,0,0,0,1
5420,0.046632,0.060782,0.045082,0.0342,0.023529,0.040541,0.030114,0.095142,0.09322,0.023846,...,0,0,0,0,0,0,0,0,1,0
5421,0.006218,0.018499,0.006148,0.010944,0.0,0.009459,0.010384,0.006073,0.033898,0.005385,...,0,0,0,0,0,0,0,0,0,1


In [139]:
#Initializing Linear regression object and fitting the final model
X_test_sm = sm.add_constant(X_test[col_rfe])
lr_final = sm.GLS(y_test,X_test_sm)
res = lr_final.fit()
res.summary()

0,1,2,3
Dep. Variable:,age,R-squared:,0.126
Model:,GLS,Adj. R-squared:,0.108
Method:,Least Squares,F-statistic:,6.872
Date:,"Sun, 16 Jan 2022",Prob (F-statistic):,3.64e-96
Time:,20:24:25,Log-Likelihood:,-21134.0
No. Observations:,5787,AIC:,42510.0
Df Residuals:,5667,BIC:,43310.0
Df Model:,119,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,33.6727,0.253,132.949,0.000,33.176,34.169
midnight_counts,-13.7659,10.311,-1.335,0.182,-33.980,6.448
daytime_counts,5.0549,16.555,0.305,0.760,-27.399,37.508
early_morning_counts,22.9169,5.725,4.003,0.000,11.694,34.140
night_counts,-2.7736,9.793,-0.283,0.777,-21.972,16.425
monday_counts,-3.0513,10.371,-0.294,0.769,-23.382,17.280
tuesday_counts,4.7637,8.851,0.538,0.590,-12.588,22.116
wednesday_counts,-2.0392,11.480,-0.178,0.859,-24.544,20.466
thursday_counts,4.7824,6.933,0.690,0.490,-8.808,18.373

0,1,2,3
Omnibus:,1296.632,Durbin-Watson:,1.977
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2969.575
Skew:,1.263,Prob(JB):,0.0
Kurtosis:,5.436,Cond. No.,4.13e+16


In [254]:
#predictions on test data
y_test_pred = res.predict(X_test_sm)

In [255]:
#predictions array
y_test_pred[:10]

5417    31.976142
5418    32.589490
5419    24.523680
5420    34.190454
5421    25.477380
5422    35.320379
5423    34.873836
5424    30.920611
5425    33.678461
5426    31.014300
dtype: float64

In [256]:
#Making dataframe with predictions and y_test data
y_pred_final = pd.DataFrame({'age':y_test.values.reshape(-1), 'age_pred':y_test_pred})
y_pred_final.head()

Unnamed: 0,age,age_pred
5417,31,31.976142
5418,33,32.58949
5419,22,24.52368
5420,28,34.190454
5421,26,25.47738


In [257]:
#Changing data type
y_pred_final['age_pred']=y_pred_final['age_pred'].astype('int')

In [258]:
#first 5 rows
y_pred_final.head()

Unnamed: 0,age,age_pred
5417,31,31
5418,33,32
5419,22,24
5420,28,34
5421,26,25


In [259]:
#Taking a 25% sample from y_test data
y_test_lr=y_pred_final.sample(frac =.25)

In [260]:
#Calculating the percentage population distribution (PPD)
y_test_lr['percent_population_dist']=((y_test_lr['age']-y_test_lr['age_pred'])/y_test_lr['age'])*100

In [261]:
# predicting the accuracy score
score=r2_score(y_test,y_test_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_test,y_test_pred)))
print('percentage population distribution is ', y_test_lr['percent_population_dist'].mean())

r2 socre is 0.1260991895074598
rmse is 9.327838430608452
percentage population distribution is  -6.3243654869924475


##### The results of Linear regression is pretty poor considering the low r squared value and less rmse value.
##### We can increase the model performance with XGBoost Regressor

## XGBoost regression

In [67]:
#Import required libraries
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [68]:
#get X_train, y_train
X_train= scenario1_train.drop(['age'], axis=1)
X_train= X_train.drop(X_train.iloc[:,0:1], axis=1)
y_train = scenario1_train[['age']]

In [69]:
#get X_test, y_test
X_test= scenario1_test.drop(['age'], axis=1)
X_test= X_test.drop(X_test.iloc[:,0:1], axis=1)
y_test = scenario1_test[['age']]

In [70]:
# A parameter grid for XGBoost
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        'n_estimators': range(60, 360, 40),
        'learning_rate': [0.1, 0.01, 0.05]
 }

##### Spliting this parameter grid into two for ease of execution of GridSearchCV. 

In [71]:
# A parameter grid for XGBoost
params1 = {
        'min_child_weight': [1, 5],
        'gamma': [0.5, 1, 1.5],
        'subsample': [0.6, 0.8],
        'colsample_bytree': [0.6, 0.8],
        'max_depth': [3, 4],
        'n_estimators': range(60, 140, 40),
        'learning_rate': [0.1, 0.01]
 }

In [72]:
#initializing XGB Regressor object
estimator = XGBRegressor(objective= 'reg:squarederror',eval_metric='rmse', seed=42)

In [73]:
#Initializing GrisSearchCV object for XGBoost regressor
reg = GridSearchCV(estimator=estimator, 
                   param_grid=params1,
                   scoring='r2',
                   n_jobs = 1,
                   verbose=50000)

In [74]:
#fitting the model
reg.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.112, total=   7.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.9s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.110, total=   7.3s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.2s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, ga

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.117, total=   5.6s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  2.9min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.125, total=   5.6s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  3.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8, score=0.113, total=   5.2s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  3.1min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.119, total=   6.5s
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  5.8min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.121, total=   6.5s
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed:  5.9min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.134, total=   6.4s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:  6.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.115, total=  11.3s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed:  9.3min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.123, total=  11.4s
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed:  9.5min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.125, total=  11.3s
[Parallel(n_jobs=1)]: Done  74 out of  74 | elapsed:  9.7min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.396, total=   8.4s
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 12.5min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.351, total=   8.4s
[Parallel(n_jobs=1)]: Done  97 out of  97 | elapsed: 12.7min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.344, total=   8.3s
[Parallel(n_jobs=1)]: Done  98 out of  98 | elapsed: 12.8min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.8, score=-1.306, total=   8.3s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 15.4min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.180, total=   6.8s
[Parallel(n_jobs=1)]: Done 121 out of 121 | elapsed: 15.6min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.103, total=   6.8s
[Parallel(n_jobs=1)]: Done 122 out of 122 | elapsed: 15.7min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01,

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.104, total=   6.9s
[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.018, total=   6.9s
[Parallel(n_jobs=1)]: Done 145 out of 145 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8, score=-3.180, total=   6.5s
[Parallel(n_jobs=1)]: Done 146 out of 146 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.01, 

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.112, total=   5.2s
[Parallel(n_jobs=1)]: Done 168 out of 168 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.113, total=   5.2s
[Parallel(n_jobs=1)]: Done 169 out of 169 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.124, total=   5.2s
[Parallel(n_jobs=1)]: Done 170 out of 170 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_chil

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.115, total=   9.1s
[Parallel(n_jobs=1)]: Done 192 out of 192 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.121, total=   9.1s
[Parallel(n_jobs=1)]: Done 193 out of 193 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.126, total=   9.0s
[Parallel(n_jobs=1)]: Done 194 out of 194 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=3, min

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.124, total=  10.5s
[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.114, total=  10.5s
[Parallel(n_jobs=1)]: Done 217 out of 217 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.122, total=  10.5s
[Parallel(n_jobs=1)]: Done 218 out of 218 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.139, total=  10.8s
[Parallel(n_jobs=1)]: Done 240 out of 240 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.184, total=   5.8s
[Parallel(n_jobs=1)]: Done 241 out of 241 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.108, total=   5.7s
[Parallel(n_jobs=1)]: Done 242 out of 242 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, 

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.107, total=   5.6s
[Parallel(n_jobs=1)]: Done 264 out of 264 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.023, total=   5.6s
[Parallel(n_jobs=1)]: Done 265 out of 265 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8, score=-3.184, total=   5.2s
[Parallel(n_jobs=1)]: Done 266 out of 266 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=3,

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.076, total=   6.5s
[Parallel(n_jobs=1)]: Done 288 out of 288 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.100, total=   6.4s
[Parallel(n_jobs=1)]: Done 289 out of 289 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.016, total=   6.4s
[Parallel(n_jobs=1)]: Done 290 out of 290 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4,

[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.343, total=  11.2s
[Parallel(n_jobs=1)]: Done 312 out of 312 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.337, total=  11.1s
[Parallel(n_jobs=1)]: Done 313 out of 313 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.337, total=  11.7s
[Parallel(n_jobs=1)]: Done 314 out of 314 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1, learning_rate=0.01, max_dep

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.120, total=   9.0s
[Parallel(n_jobs=1)]: Done 336 out of 336 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.116, total=   8.5s
[Parallel(n_jobs=1)]: Done 337 out of 337 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.118, total=   8.5s
[Parallel(n_jobs=1)]: Done 338 out of 338 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.131, total=   8.4s
[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.119, total=   6.9s
[Parallel(n_jobs=1)]: Done 361 out of 361 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.112, total=   6.8s
[Parallel(n_jobs=1)]: Done 362 out of 362 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_dept

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.122, total=   6.9s
[Parallel(n_jobs=1)]: Done 384 out of 384 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.139, total=   6.9s
[Parallel(n_jobs=1)]: Done 385 out of 385 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8, score=0.118, total=   6.5s
[Parallel(n_jobs=1)]: Done 386 out of 386 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.1, max_depth

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.081, total=   5.3s
[Parallel(n_jobs=1)]: Done 408 out of 408 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.105, total=   5.2s
[Parallel(n_jobs=1)]: Done 409 out of 409 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.021, total=   5.2s
[Parallel(n_jobs=1)]: Done 410 out of 410 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, 

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.351, total=   8.9s
[Parallel(n_jobs=1)]: Done 432 out of 432 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.344, total=   8.9s
[Parallel(n_jobs=1)]: Done 433 out of 433 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.343, total=   9.0s
[Parallel(n_jobs=1)]: Done 434 out of 434 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.389, total=  10.4s
[Parallel(n_jobs=1)]: Done 456 out of 456 | elapsed:    1.2s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.343, total=  10.9s
[Parallel(n_jobs=1)]: Done 457 out of 457 | elapsed:   12.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.337, total=  10.6s
[Parallel(n_jobs=1)]: Done 458 out of 458 | elapsed:   22.7s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=1.5, learning_rate=0

[CV]  colsample_bytree=0.6, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=-1.297, total=  10.4s
[Parallel(n_jobs=1)]: Done 480 out of 480 | elapsed:  3.6min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.112, total=   6.7s
[Parallel(n_jobs=1)]: Done 481 out of 481 | elapsed:  3.7min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.108, total=   6.7s
[Parallel(n_jobs=1)]: Done 482 out of 482 | elapsed:  3.9min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_de

[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.117, total=   6.9s
[Parallel(n_jobs=1)]: Done 504 out of 504 | elapsed:  7.0min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.123, total=   6.9s
[Parallel(n_jobs=1)]: Done 505 out of 505 | elapsed:  7.1min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8, score=0.113, total=   6.5s
[Parallel(n_jobs=1)]: Done 506 out of 506 | elapsed:  7.2min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth

[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.118, total=   7.9s
[Parallel(n_jobs=1)]: Done 528 out of 528 | elapsed: 10.5min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.121, total=   7.9s
[Parallel(n_jobs=1)]: Done 529 out of 529 | elapsed: 10.7min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.131, total=   7.9s
[Parallel(n_jobs=1)]: Done 530 out of 530 | elapsed: 10.8min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth

[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.116, total=  13.8s
[Parallel(n_jobs=1)]: Done 552 out of 552 | elapsed: 14.9min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.126, total=  13.9s
[Parallel(n_jobs=1)]: Done 553 out of 553 | elapsed: 15.1min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.125, total=  13.8s
[Parallel(n_jobs=1)]: Done 554 out of 554 | elapsed: 15.3min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.1, max_

[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.396, total=  10.2s
[Parallel(n_jobs=1)]: Done 576 out of 576 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.349, total=  10.6s
[Parallel(n_jobs=1)]: Done 577 out of 577 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.342, total=  10.4s
[Parallel(n_jobs=1)]: Done 578 out of 578 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0

[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.8, score=-1.305, total=  10.2s
[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.180, total=   8.4s
[Parallel(n_jobs=1)]: Done 601 out of 601 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.102, total=   8.5s
[Parallel(n_jobs=1)]: Done 602 out of 602 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01,

[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.099, total=   8.4s
[Parallel(n_jobs=1)]: Done 624 out of 624 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.016, total=   8.4s
[Parallel(n_jobs=1)]: Done 625 out of 625 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8, score=-3.180, total=   7.8s
[Parallel(n_jobs=1)]: Done 626 out of 626 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=0.5, learning_rate=0.01, 

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.113, total=   6.3s
[Parallel(n_jobs=1)]: Done 648 out of 648 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.115, total=   6.3s
[Parallel(n_jobs=1)]: Done 649 out of 649 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=0.126, total=   6.3s
[Parallel(n_jobs=1)]: Done 650 out of 650 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_chil

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.116, total=  10.9s
[Parallel(n_jobs=1)]: Done 672 out of 672 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.121, total=  10.9s
[Parallel(n_jobs=1)]: Done 673 out of 673 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=0.124, total=  11.0s
[Parallel(n_jobs=1)]: Done 674 out of 674 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=3, min

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.119, total=  12.7s
[Parallel(n_jobs=1)]: Done 696 out of 696 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.114, total=  12.7s
[Parallel(n_jobs=1)]: Done 697 out of 697 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.121, total=  12.7s
[Parallel(n_jobs=1)]: Done 698 out of 698 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.137, total=  12.9s
[Parallel(n_jobs=1)]: Done 720 out of 720 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.184, total=   6.7s
[Parallel(n_jobs=1)]: Done 721 out of 721 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.6, score=-3.106, total=   6.7s
[Parallel(n_jobs=1)]: Done 722 out of 722 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, 

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.104, total=   6.7s
[Parallel(n_jobs=1)]: Done 744 out of 744 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.6, score=-3.021, total=   6.8s
[Parallel(n_jobs=1)]: Done 745 out of 745 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=60, subsample=0.8, score=-3.184, total=   6.3s
[Parallel(n_jobs=1)]: Done 746 out of 746 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=3,

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.073, total=   8.1s
[Parallel(n_jobs=1)]: Done 768 out of 768 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.097, total=   7.9s
[Parallel(n_jobs=1)]: Done 769 out of 769 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.014, total=   7.9s
[Parallel(n_jobs=1)]: Done 770 out of 770 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4,

[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.342, total=  14.3s
[Parallel(n_jobs=1)]: Done 792 out of 792 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.336, total=  13.7s
[Parallel(n_jobs=1)]: Done 793 out of 793 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.334, total=  13.9s
[Parallel(n_jobs=1)]: Done 794 out of 794 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1, learning_rate=0.01, max_dep

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.117, total=  10.1s
[Parallel(n_jobs=1)]: Done 816 out of 816 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.113, total=  10.2s
[Parallel(n_jobs=1)]: Done 817 out of 817 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=1, n_estimators=100, subsample=0.8, score=0.117, total=  10.2s
[Parallel(n_jobs=1)]: Done 818 out of 818 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.135, total=  10.2s
[Parallel(n_jobs=1)]: Done 840 out of 840 | elapsed:  1.1min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.119, total=   8.4s
[Parallel(n_jobs=1)]: Done 841 out of 841 | elapsed:  1.3min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=1, n_estimators=60, subsample=0.6, score=0.111, total=   8.4s
[Parallel(n_jobs=1)]: Done 842 out of 842 | elapsed:  1.4min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_dept

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.121, total=   8.5s
[Parallel(n_jobs=1)]: Done 864 out of 864 | elapsed:  5.3min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.6, score=0.134, total=   8.4s
[Parallel(n_jobs=1)]: Done 865 out of 865 | elapsed:  5.5min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=60, subsample=0.8, score=0.118, total=   7.9s
[Parallel(n_jobs=1)]: Done 866 out of 866 | elapsed:  5.6min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.1, max_depth

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.079, total=   6.3s
[Parallel(n_jobs=1)]: Done 888 out of 888 | elapsed:  9.2min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.101, total=   6.3s
[Parallel(n_jobs=1)]: Done 889 out of 889 | elapsed:  9.3min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=1, n_estimators=60, subsample=0.8, score=-3.020, total=   6.3s
[Parallel(n_jobs=1)]: Done 890 out of 890 | elapsed:  9.4min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, 

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.350, total=  11.0s
[Parallel(n_jobs=1)]: Done 912 out of 912 | elapsed: 12.7min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.343, total=  10.9s
[Parallel(n_jobs=1)]: Done 913 out of 913 | elapsed: 12.9min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=3, min_child_weight=5, n_estimators=100, subsample=0.6, score=-1.341, total=  10.9s
[Parallel(n_jobs=1)]: Done 914 out of 914 | elapsed: 13.0min remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.388, total=  12.8s
[Parallel(n_jobs=1)]: Done 936 out of 936 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.342, total=  12.9s
[Parallel(n_jobs=1)]: Done 937 out of 937 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8, score=-1.334, total=  12.8s
[Parallel(n_jobs=1)]: Done 938 out of 938 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.8, gamma=1.5, learning_rate=0

[CV]  colsample_bytree=0.8, gamma=1.5, learning_rate=0.01, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=-1.297, total=  13.1s
[Parallel(n_jobs=1)]: Done 960 out of 960 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 960 out of 960 | elapsed:    0.0s finished


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False,
                                    eval_metric='rmse', gamma=None, gpu_id=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_cons...
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    seed=42, subsample=None, tree_method=None,
                                    validate

In [75]:
print("Best parameters:", reg.best_params_)
print("Highest r2: ", reg.best_score_)

Best parameters: {'colsample_bytree': 0.6, 'gamma': 0.5, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 5, 'n_estimators': 100, 'subsample': 0.8}
Highest r2:  0.1254870730190733


In [76]:
# A parameter grid for XGBoost
params2 = {
        'min_child_weight': [10],
        'gamma': [ 2, 5],
        'subsample': [ 1.0],
        'colsample_bytree': [ 1.0],
        'max_depth': [ 5],
        'n_estimators': range(140, 360, 40),
        'learning_rate': [ 0.05]
 }

In [77]:
#Initializing GrisSearchCV object for XGBoost regressor
reg2 = GridSearchCV(estimator=estimator, 
                   param_grid=params2,
                   scoring='r2', 
                   n_jobs = 1 ,
                   verbose=50000)

In [78]:
#fitting the model
reg2.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.121, total=  22.8s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   22.8s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.117, total=  22.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   44.9s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0,

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=300, subsample=1.0, score=0.126, total=  46.8s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 13.8min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=300, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=300, subsample=1.0, score=0.133, total=  47.4s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed: 14.5min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=340, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=340, subsample=1.0, score=0.122, total=  53.7s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed: 15.4min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_d

[CV]  colsample_bytree=1.0, gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=260, subsample=1.0, score=0.121, total=  41.5s
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 28.3min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=260, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=260, subsample=1.0, score=0.126, total=  41.4s
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed: 29.0min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=260, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=260, subsample=1.0, score=0.133, total=  41.7s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 29.7min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=5, learning_rate=0.05, max_d

GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False,
                                    eval_metric='rmse', gamma=None, gpu_id=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_cons...
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    seed=42, subsample=None, tree_method=None,
                                    validate

In [79]:
print("Best parameters:", reg2.best_params_)
print("Highest r2: ", reg2.best_score_)

Best parameters: {'colsample_bytree': 1.0, 'gamma': 2, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 10, 'n_estimators': 180, 'subsample': 1.0}
Highest r2:  0.12403820678876025


##### Rerunning the Gridsearch for getting the best of the two GridSearches done above

In [81]:
# A parameter grid for XGBoost
params3 = {
        'min_child_weight': [5,10],
        'gamma': [0.5, 2],
        'subsample': [ 0.8,1.0],
        'colsample_bytree': [ 0.6, 1.0],
        'max_depth': [4, 5],
        'n_estimators': range(100, 180,40),
        'learning_rate': [0.1, 0.05]
 }

In [82]:
#Initializing GrisSearchCV object for XGBoost regressor
reg3 = GridSearchCV(estimator=estimator, 
                   param_grid=params3,
                   scoring='r2', 
                   n_jobs = 1 ,
                   verbose=50000)

In [83]:
#Fitting the model
reg3.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.123, total=   9.2s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.1s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.115, total=   9.1s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.2s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.129, total=  11.1s
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:  5.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.137, total=   9.3s
[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  5.1min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.123, total=   8.4s
[Parallel(n_jobs=1)]: Done  26 out of  26 | elapsed:  5.3min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1,

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.124, total=  10.6s
[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed: 10.2min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.125, total=  10.8s
[Parallel(n_jobs=1)]: Done  49 out of  49 | elapsed: 10.4min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.138, total=  10.9s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed: 10.6min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.109, total=  16.9s
[Parallel(n_jobs=1)]: Done  72 out of  72 | elapsed: 16.7min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.122, total=  17.9s
[Parallel(n_jobs=1)]: Done  73 out of  73 | elapsed: 17.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.125, total=  17.0s
[Parallel(n_jobs=1)]: Done  74 out of  74 | elapsed: 17.3min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.1,

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.119, total=  11.6s
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed: 22.8min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.115, total=  11.9s
[Parallel(n_jobs=1)]: Done  97 out of  97 | elapsed: 23.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.120, total=  12.1s
[Parallel(n_jobs=1)]: Done  98 out of  98 | elapsed: 23.2min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.124, total=  20.7s
[Parallel(n_jobs=1)]: Done 119 out of 119 | elapsed: 28.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.132, total=  23.3s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed: 28.4min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.126, total=  17.3s
[Parallel(n_jobs=1)]: Done 121 out of 121 | elapsed: 28.7min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0

[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.115, total=  13.8s
[Parallel(n_jobs=1)]: Done 142 out of 142 | elapsed: 34.1min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.121, total=  12.4s
[Parallel(n_jobs=1)]: Done 143 out of 143 | elapsed: 34.3min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.126, total=  12.6s
[Parallel(n_jobs=1)]: Done 144 out of 144 | elapsed: 34.5min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=0.5, learning_rate

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.139, total=  11.8s
[Parallel(n_jobs=1)]: Done 165 out of 165 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.122, total=  10.2s
[Parallel(n_jobs=1)]: Done 166 out of 166 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.116, total=   8.9s
[Parallel(n_jobs=1)]: Done 167 out of 167 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.123, total=  12.0s
[Parallel(n_jobs=1)]: Done 189 out of 189 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.133, total=  10.3s
[Parallel(n_jobs=1)]: Done 190 out of 190 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.124, total=  24.7s
[Parallel(n_jobs=1)]: Done 191 out of 191 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=4

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.119, total=  17.2s
[Parallel(n_jobs=1)]: Done 213 out of 213 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.123, total=  16.1s
[Parallel(n_jobs=1)]: Done 214 out of 214 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.135, total=  16.6s
[Parallel(n_jobs=1)]: Done 215 out of 215 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.111, total=  17.7s
[Parallel(n_jobs=1)]: Done 237 out of 237 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.123, total=  25.9s
[Parallel(n_jobs=1)]: Done 238 out of 238 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.122, total=  27.8s
[Parallel(n_jobs=1)]: Done 239 out of 239 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.1, max_depth=5

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.118, total=  10.1s
[Parallel(n_jobs=1)]: Done 261 out of 261 | elapsed:  3.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.112, total=  10.1s
[Parallel(n_jobs=1)]: Done 262 out of 262 | elapsed:  3.1min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.116, total=  10.2s
[Parallel(n_jobs=1)]: Done 263 out of 263 | elapsed:  3.3min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_d

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.132, total=  12.3s
[Parallel(n_jobs=1)]: Done 285 out of 285 | elapsed:  7.7min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.118, total=  10.8s
[Parallel(n_jobs=1)]: Done 286 out of 286 | elapsed:  7.9min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.116, total=  10.7s
[Parallel(n_jobs=1)]: Done 287 out of 287 | elapsed:  8.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=

[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.122, total=  10.7s
[Parallel(n_jobs=1)]: Done 309 out of 309 | elapsed: 13.0min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.133, total=  10.6s
[Parallel(n_jobs=1)]: Done 310 out of 310 | elapsed: 13.1min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.129, total=  16.9s
[Parallel(n_jobs=1)]: Done 311 out of 311 | elapsed: 13.4min remaining:    0.0s
[CV] colsample_bytree=0.6, gamma=2, learning_rate=0.05, max_d

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.120, total=  20.7s
[Parallel(n_jobs=1)]: Done 333 out of 333 | elapsed: 19.2min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.131, total=  20.7s
[Parallel(n_jobs=1)]: Done 334 out of 334 | elapsed: 19.5min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.136, total=  20.7s
[Parallel(n_jobs=1)]: Done 335 out of 335 | elapsed: 19.9min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.115, total=  18.3s
[Parallel(n_jobs=1)]: Done 357 out of 357 | elapsed: 26.0min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.125, total=  17.8s
[Parallel(n_jobs=1)]: Done 358 out of 358 | elapsed: 26.3min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.123, total=  17.7s
[Parallel(n_jobs=1)]: Done 359 out of 359 | elapsed: 26.6min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1,

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.117, total=  18.2s
[Parallel(n_jobs=1)]: Done 381 out of 381 | elapsed: 33.9min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.114, total=  18.6s
[Parallel(n_jobs=1)]: Done 382 out of 382 | elapsed: 34.2min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=0.8, score=0.119, total=  18.0s
[Parallel(n_jobs=1)]: Done 383 out of 383 | elapsed: 34.5min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.1,

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.128, total=  15.0s
[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.114, total=  13.0s
[Parallel(n_jobs=1)]: Done 406 out of 406 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.112, total=  13.4s
[Parallel(n_jobs=1)]: Done 407 out of 407 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.116, total=  13.0s
[Parallel(n_jobs=1)]: Done 428 out of 428 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.120, total=  13.0s
[Parallel(n_jobs=1)]: Done 429 out of 429 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.125, total=  13.0s
[Parallel(n_jobs=1)]: Done 430 out of 430 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.120, total=  25.3s
[Parallel(n_jobs=1)]: Done 451 out of 451 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.116, total=  25.2s
[Parallel(n_jobs=1)]: Done 452 out of 452 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.126, total=  25.6s
[Parallel(n_jobs=1)]: Done 453 out of 453 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05

[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.128, total=  25.4s
[Parallel(n_jobs=1)]: Done 474 out of 474 | elapsed:  2.9min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.138, total=  25.9s
[Parallel(n_jobs=1)]: Done 475 out of 475 | elapsed:  3.3min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=0.5, learning_rate=0.05, max_depth=5, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.121, total=  22.2s
[Parallel(n_jobs=1)]: Done 476 out of 476 | elapsed:  3.7min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=0.5, learning_rate

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.121, total=  17.7s
[Parallel(n_jobs=1)]: Done 498 out of 498 | elapsed: 10.1min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.125, total=  17.7s
[Parallel(n_jobs=1)]: Done 499 out of 499 | elapsed: 10.4min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=4, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.133, total=  17.7s
[Parallel(n_jobs=1)]: Done 500 out of 500 | elapsed: 10.7min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=4, min

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.113, total=  18.6s
[Parallel(n_jobs=1)]: Done 522 out of 522 | elapsed: 16.8min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.115, total=  18.5s
[Parallel(n_jobs=1)]: Done 523 out of 523 | elapsed: 17.1min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=5, n_estimators=100, subsample=0.8, score=0.129, total=  18.8s
[Parallel(n_jobs=1)]: Done 524 out of 524 | elapsed: 17.5min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.121, total=  16.0s
[Parallel(n_jobs=1)]: Done 546 out of 546 | elapsed: 24.8min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.115, total=  15.6s
[Parallel(n_jobs=1)]: Done 547 out of 547 | elapsed: 25.0min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5, min_child_weight=10, n_estimators=100, subsample=1.0, score=0.122, total=  15.7s
[Parallel(n_jobs=1)]: Done 548 out of 548 | elapsed: 25.3min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.1, max_depth=5

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=100, subsample=1.0, score=0.126, total=  13.0s
[Parallel(n_jobs=1)]: Done 570 out of 570 | elapsed: 32.1min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.120, total=  20.8s
[Parallel(n_jobs=1)]: Done 571 out of 571 | elapsed: 32.4min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=5, n_estimators=140, subsample=0.8, score=0.115, total=  21.3s
[Parallel(n_jobs=1)]: Done 572 out of 572 | elapsed: 32.8min remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.128, total=  20.7s
[Parallel(n_jobs=1)]: Done 594 out of 594 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=0.8 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=0.8, score=0.134, total=  20.7s
[Parallel(n_jobs=1)]: Done 595 out of 595 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=4, min_child_weight=10, n_estimators=140, subsample=1.0, score=0.121, total=  18.0s
[Parallel(n_jobs=1)]: Done 596 out of 596 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_d

[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.120, total=  21.8s
[Parallel(n_jobs=1)]: Done 618 out of 618 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.126, total=  22.2s
[Parallel(n_jobs=1)]: Done 619 out of 619 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=1.0 
[CV]  colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=140, subsample=1.0, score=0.133, total=  21.7s
[Parallel(n_jobs=1)]: Done 620 out of 620 | elapsed:    0.0s remaining:    0.0s
[CV] colsample_bytree=1.0, gamma=2, learning_rate=0.05, max_depth=

GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False,
                                    eval_metric='rmse', gamma=None, gpu_id=None,
                                    importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_cons...
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                    seed=42, subsample=None, tree_method=None,
                                    validate

In [84]:
print("Best parameters:", reg3.best_params_)
print("Highest r2: ", reg3.best_score_)

Best parameters: {'colsample_bytree': 0.6, 'gamma': 2, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 10, 'n_estimators': 140, 'subsample': 0.8}
Highest r2:  0.1270604834759216


In [85]:
#initialiazing the XGBoost regressor object with the best parameters
xgb = XGBRegressor(n_estimators=140,subsample=0.8,min_child_weight=10,\
                          max_depth=5,learning_rate=0.05,gamma=2,colsample_bytree=0.6,\
                          objective= 'reg:squarederror',eval_metric='rmse', seed=42)

In [336]:
#fitting the model
xgb.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, enable_categorical=False,
             eval_metric='rmse', gamma=2, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=5, min_child_weight=10, missing=nan,
             monotone_constraints='()', n_estimators=140, n_jobs=2,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
             subsample=0.8, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [337]:
# make predictions for train data
y_train_pred = xgb.predict(X_train)

In [338]:
#prediction array
y_train_pred[:10]

array([35.59073 , 31.050735, 31.635977, 29.683365, 30.349846, 28.545547,
       30.099665, 32.79462 , 29.412031, 30.597694], dtype=float32)

In [339]:
#predictions on train data as a dataframe
y_pred_final = pd.DataFrame({'age':y_train.values.reshape(-1), 'age_pred':y_train_pred})
y_pred_final.head()

Unnamed: 0,age,age_pred
0,33,35.590729
1,23,31.050735
2,33,31.635977
3,34,29.683365
4,27,30.349846


In [340]:
#changiging data type
y_pred_final['age_pred']=y_train_pred_final['age_pred'].astype('int')

In [341]:
#first 5 rows
y_pred_final.head()

Unnamed: 0,age,age_pred
0,33,35
1,23,31
2,33,31
3,34,29
4,27,30


In [342]:
#Taking a 25% sample data of train data
y_train_xgb=y_pred_final.sample(frac =.25)

In [343]:
#Calaculating PPD
y_train_xgb['percent_population_dist']=((y_train_xgb['age']-y_train_xgb['age_pred'])/y_train_xgb['age'])*100

In [344]:
# predicting the scores
score=r2_score(y_train,y_train_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_train,y_train_pred)))
print('percentage population distribution is ', y_train_xgb['percent_population_dist'].mean())

r2 socre is 0.2037705956192255
rmse is 8.743356309133395
percentage population distribution is  -4.910964826805002


In [345]:
# make predictions for test data
y_pred = xgb.predict(X_test)

In [346]:
#predictions array
y_pred[:10]

array([32.595417, 34.27413 , 27.383434, 32.70584 , 26.71593 , 32.944824,
       30.528963, 32.394268, 32.432907, 29.70813 ], dtype=float32)

In [347]:
#Making a dataframe with predictions and y_test data
y_prad_final = pd.DataFrame({'age':y_test.values.reshape(-1), 'age_pred':y_pred})

In [348]:
#Changing data type
y_pred_final['age_pred']=y_pred_final['age_pred'].astype('int')

In [349]:
#First 5 rows
y_pred_final.head()

Unnamed: 0,age,age_pred
0,33,35
1,23,31
2,33,31
3,34,29
4,27,30


In [350]:
#Taking 25% sample of test data
y_test_xgb=y_pred_final.sample(frac =.25)

In [351]:
#Calculating PPD
y_test_xgb['percent_population_dist']=((y_test_xgb['age']-y_test_xgb['age_pred'])/y_test_xgb['age'])*100

In [354]:
# predicting the accuracy score
score=r2_score(y_pred_final.age,y_pred_final.age_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_pred_final.age,y_pred_final.age_pred)))
print('percentage population distribution is ', y_test_xgb['percent_population_dist'].mean())

r2 socre is 0.19983953374460695
rmse is 8.76491313434956
percentage population distribution is  -5.478761628439787


 ##### This model is better compared to linear regression and is still very poor values of r squared and rmse.

# Stacking Model

In [103]:
#get X_train, y_train
X_train= scenario1_train.drop(['age'], axis=1)
X_train= X_train.drop(X_train.iloc[:,0:1], axis=1)
y_train = scenario1_train[['age']]

In [104]:
#get X_test, y_test
X_test= scenario1_test.drop(['age'], axis=1)
X_test= X_test.drop(X_test.iloc[:,0:1], axis=1)
y_test = scenario1_test[['age']]

Let's find the best parameters for random forest

In [105]:
#importing libraraies
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [106]:
#initializing Random Forest Regression object
rfr=RandomForestRegressor(random_state=42)

In [107]:
#Parameter grid for random forest
param_grid = { 
    'n_estimators': [60,100,140,180,220],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
}

In [108]:
#Running GridSearchCV to find the best potamaters for RF REgressor
CV_rfr = GridSearchCV(estimator=rfr, param_grid=param_grid, scoring='r2', cv= 5)
CV_rfr.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [60, 100, 140, 180, 220]},
             scoring='r2')

In [109]:
print("Best parameters:", CV_rfr.best_params_)
print("Highest r2: ", CV_rfr.best_score_)

Best parameters: {'max_depth': 8, 'max_features': 'auto', 'n_estimators': 180}
Highest r2:  0.10148334825957364


In [110]:
#Build random forest model with best parameters
rfr1=RandomForestRegressor(random_state=42, max_features='auto', n_estimators= 180, max_depth=8)

In [111]:
#fitting the model
rfr1.fit(X_train, y_train)

RandomForestRegressor(max_depth=8, n_estimators=180, random_state=42)

In [112]:
#making predictions
pred=rfr1.predict(X_test)

In [113]:
# predicting the accuracy score
score=r2_score(y_test,pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_test,pred)))

r2 socre is 0.10270474605130542
rmse is 9.451867594546037


In [114]:
#importing libraries
from sklearn import preprocessing
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import LinearRegression
from sklearn import model_selection

In [115]:
# x,y to np 
X_train_np = X_train.values
y_train_np = y_train.values

In [116]:
#initializing linear regression object for  stacking 
lr1 = LinearRegression()

#Initializing StackingCVRegressor object
stack = StackingCVRegressor(regressors=[lr1, rfr1],meta_regressor= xgb, use_features_in_secondary=True)

In [117]:
#running each model and finding its accuracy scores
for lr, label in zip([lr1, rfr1, xgb], 
                      ['lr', 
                       'Random Forest', 
                       'xgb']):

    scores = model_selection.cross_val_score(lr, X_train, y_train, cv=3, scoring='r2')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.11 (+/- 0.01) [lr]
Accuracy: 0.10 (+/- 0.00) [Random Forest]
Accuracy: 0.12 (+/- 0.00) [xgb]


In [171]:
# Fit on train data 
slr_fit = stack.fit(X_train_np, y_train_np.reshape(-1))

In [355]:
#prediction on train data
train_pred = slr_fit.predict(X_train)

In [356]:
#predictions on train data as a dataframe
y_pred_final= pd.DataFrame({'age':y_train.values.reshape(-1), 'age_pred':train_pred})
y_pred_final.head()

Unnamed: 0,age,age_pred
0,33,36.812634
1,23,29.736115
2,33,32.343845
3,34,28.62125
4,27,29.694384


In [357]:
#changiging data type
y_pred_final['age_pred']=y_pred_final['age_pred'].astype('int')

In [358]:
#Taking a 25% train data sample
y_train_slr=y_pred_final.sample(frac =.25)

In [359]:
#Calcualting PPD
y_train_slr['percent_population_dist']=((y_train_slr['age']-y_train_slr['age_pred'])/y_train_slr['age'])*100

In [360]:
# predicting the accuracy score
score=r2_score(y_train,train_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_train, train_pred)))
print('percentage population distribution is ', y_train_slr['percent_population_dist'].mean())

r2 socre is 0.22469761929794607
rmse is 8.627691943188788
percentage population distribution is  -4.965283053625107


In [361]:
#Prediction on test data
test_pred = slr_fit.predict(X_test)

In [362]:
#Predictions array
test_pred[:10]

array([36.872097, 28.210339, 30.00628 , 36.212715, 28.407251, 27.427729,
       37.54959 , 35.59576 , 35.371136, 32.293037], dtype=float32)

In [363]:
#predictions on test data as a dataframe
y_prad_final = pd.DataFrame({'age':y_test.values.reshape(-1), 'age_pred':test_pred})

In [364]:
#changiging data type
y_pred_final['age_pred']=y_pred_final['age_pred'].astype('int')

In [365]:
#Taking 25% of test data
y_test_slr=y_pred_final.sample(frac =.25)

In [366]:
#Calculating PPD
y_test_slr['percent_population_dist']=((y_test_slr['age']-y_test_slr['age_pred'])/y_test_slr['age'])*100

In [367]:
# predicting the accuracy score
score=r2_score(y_pred_final.age,y_pred_final.age_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_pred_final.age,y_pred_final.age_pred)))
print('percentage population distribution is ', y_test_slr['percent_population_dist'].mean())

r2 socre is 0.22168654779256913
rmse is 8.644429556381711
percentage population distribution is  -4.487688691930884


##### Stacking model has given best results as compared all . No results are good and acceptable, but out of three, stacking model is opted for deployment. 

# Model Pickling

In [368]:
#importing library
import pickle

In [373]:
# save the model to disk
filename = 'age_model.pkl'
pickle.dump(slr_fit, open(filename, 'wb'))

In [374]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.predict(X_test)

#predictions on test data as a dataframe
y_prad_final = pd.DataFrame({'age':y_test.values.reshape(-1), 'age_pred':result})

#changiging data type
y_pred_final['age_pred']=y_pred_final['age_pred'].astype('int')

#Calculating PPD
y_pred_final['percent_population_dist']=((y_pred_final['age']-y_pred_final['age_pred'])/y_pred_final['age'])*100

y_pred_final.head()

Unnamed: 0,age,age_pred,percent_population_dist
0,33,36,-9.090909
1,23,29,-26.086957
2,33,32,3.030303
3,34,28,17.647059
4,27,29,-7.407407


In [375]:
# predicting the accuracy score
score=r2_score(y_pred_final.age,y_pred_final.age_pred)
print('r2 socre is',score)
print('rmse is',np.sqrt(mean_squared_error(y_pred_final.age,y_pred_final.age_pred)))

y_test_slr=y_pred_final.sample(frac =.25)
print('percentage population distribution is ',y_test_slr['percent_population_dist'].mean())

r2 socre is 0.22168654779256913
rmse is 8.644429556381711
percentage population distribution is  -4.597885759445563
