In [31]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime, date
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import StratifiedShuffleSplit

import argparse

In [32]:
print('Loading raw data...')
train_users_path='train_users_2.csv'
test_users_path='test_users.csv'
sessions_path='sessions.csv'

#Note: age_gender_bkts.csv and countries.csv files are not used.

#########Loading data#############
#train_users
df_train = pd.read_csv(train_users_path)
target = df_train['country_destination']
df_train = df_train.drop(['country_destination'], axis=1)

#test_users
df_test = pd.read_csv(test_users_path)    
id_test = df_test['id']

#sessions
df_sessions = pd.read_csv(sessions_path)
df_sessions['id'] = df_sessions['user_id']
df_sessions = df_sessions.drop(['user_id'],axis=1)

Loading raw data...


In [33]:
df_train.iloc[0]

id                             gxn3p5htnn
date_account_created           2010-06-28
timestamp_first_active     20090319043255
date_first_booking                    NaN
gender                          -unknown-
age                                   NaN
signup_method                    facebook
signup_flow                             0
language                               en
affiliate_channel                  direct
affiliate_provider                 direct
first_affiliate_tracked         untracked
signup_app                            Web
first_device_type             Mac Desktop
first_browser                      Chrome
Name: 0, dtype: object

In [34]:
df_sessions.iloc[0]

action                    lookup
action_type                  NaN
action_detail                NaN
device_type      Windows Desktop
secs_elapsed                 319
id                    d1mm9tcy42
Name: 0, dtype: object

In [35]:
#########Preparing Session data########
print('Working on Session data...')
#Filling nan with specific value ('NAN')
df_sessions.action = df_sessions.action.fillna('NAN')
df_sessions.action_type = df_sessions.action_type.fillna('NAN')
df_sessions.action_detail = df_sessions.action_detail.fillna('NAN')
df_sessions.device_type = df_sessions.device_type.fillna('NAN')

Working on Session data...


In [55]:
df_sessions.head()

Unnamed: 0,action,action_type,action_detail,device_type,secs_elapsed,id
0,lookup,NAN,NAN,Windows Desktop,319.0,d1mm9tcy42
1,search_results,click,view_search_results,Windows Desktop,67753.0,d1mm9tcy42
2,lookup,NAN,NAN,Windows Desktop,301.0,d1mm9tcy42
3,search_results,click,view_search_results,Windows Desktop,22141.0,d1mm9tcy42
4,lookup,NAN,NAN,Windows Desktop,435.0,d1mm9tcy42


In [37]:
#np.unique(df_sessions.action, return_counts=True)

In [38]:
act = dict(zip(*np.unique(df_sessions.action, return_counts=True)))
act_freq = 100
df_sessions.action = df_sessions.action.apply(lambda x: 'OTHER' if act[x] < act_freq else x)

In [44]:
act

{'10': 3215,
 '11': 716,
 '12': 2209,
 '15': 1053,
 'NAN': 79626,
 'about_us': 416,
 'accept_decline': 2,
 'account': 9040,
 'acculynk_bin_check_failed': 1,
 'acculynk_bin_check_success': 51,
 'acculynk_load_pin_pad': 50,
 'acculynk_pin_pad_error': 4,
 'acculynk_pin_pad_inactive': 30,
 'acculynk_pin_pad_success': 5,
 'acculynk_session_obtained': 52,
 'active': 188036,
 'add_business_address_colorbox': 9,
 'add_guest_colorbox': 7,
 'add_guests': 60,
 'add_note': 961,
 'agree_terms_check': 10938,
 'agree_terms_uncheck': 598,
 'airbnb_picks': 278,
 'airbrb': 3,
 'ajax_check_dates': 52517,
 'ajax_get_referrals_amt': 11306,
 'ajax_get_results': 369,
 'ajax_google_translate': 290,
 'ajax_google_translate_description': 933,
 'ajax_google_translate_reviews': 951,
 'ajax_image_upload': 13570,
 'ajax_ldp': 19,
 'ajax_lwlb_contact': 33413,
 'ajax_payout_edit': 714,
 'ajax_payout_options_by_country': 685,
 'ajax_payout_split_edit': 7,
 'ajax_photo_widget': 8,
 'ajax_photo_widget_form_iframe': 2444

In [40]:
#Computing value_counts. These are going to be used in the one-hot encoding
#based feature generation (following loop).
f_act = df_sessions.action.value_counts().argsort()
f_act_detail = df_sessions.action_detail.value_counts().argsort()
f_act_type = df_sessions.action_type.value_counts().argsort()
f_dev_type = df_sessions.device_type.value_counts().argsort()
#print (df_sessions.action.value_counts())

In [54]:
f_act

show                                       230
index                                      229
search_results                             227
personalize                                228
search                                     226
ajax_refresh_subtotal                      225
update                                     224
similar_listings                           223
social_connections                         222
reviews                                    221
active                                     220
similar_listings_v2                        219
lookup                                     218
create                                     217
dashboard                                  216
header_userpic                             215
collections                                214
edit                                       213
campaigns                                  212
track_page_view                            211
NAN                                        210
unavailabilit

In [41]:
#grouping session by id. We will compute features from all rows with the same id.
dgr_sess = df_sessions.groupby(['id'])

In [57]:
#for key, item in dgr_sess:
#    print (dgr_sess.get_group(key), "\n\n")
dgr_sess.head()

Unnamed: 0,action,action_type,action_detail,device_type,secs_elapsed,id
0,lookup,NAN,NAN,Windows Desktop,319.0,d1mm9tcy42
1,search_results,click,view_search_results,Windows Desktop,67753.0,d1mm9tcy42
2,lookup,NAN,NAN,Windows Desktop,301.0,d1mm9tcy42
3,search_results,click,view_search_results,Windows Desktop,22141.0,d1mm9tcy42
4,lookup,NAN,NAN,Windows Desktop,435.0,d1mm9tcy42
127,dashboard,view,dashboard,Mac Desktop,2739.0,yo8nz8bqcq
128,create,submit,create_user,Mac Desktop,,yo8nz8bqcq
129,confirm_email,click,confirm_email_link,Mac Desktop,115983.0,yo8nz8bqcq
130,show,view,p3,Mac Desktop,20285.0,yo8nz8bqcq
131,show_personalize,data,user_profile_content_update,Mac Desktop,3255.0,yo8nz8bqcq


In [71]:
#dgr_sess
#sample = dgr_sess.head()
#sample
#for g in sample:
#    print (g[1])
#    print (g[0])
    #print(g.secs_elapsed)

In [88]:
#Loop on dgr_sess to create all the features.
samples = []
cont = 0
ln = len(dgr_sess)
for g in dgr_sess:
    if cont%10000 == 0:
        print("%s from %s" %(cont, ln))
    #gr = g[1]
    #print (gr)
    l = []
    
    #the id
    l.append(g[0])
    #print (g[0])
    #The actual first feature is the number of values.
    l.append(len(gr))
    
    sev = gr.secs_elapsed.fillna(0).values   #These values are used later.
    
    #action features
    #(how many times each value occurs, numb of unique values, mean and std)
    c_act = [0] * len(f_act)
    #print (gr.action.values)
    for i,v in enumerate(gr.action.values):
        #print (i)
        #print (v)
        #print(f_act[v])
        c_act[f_act[v]] += 1
        
    #print (c_act)
    _, c_act_uqc = np.unique(gr.action.values, return_counts=True)
    #print (c_act_uqc)
    c_act += [len(c_act_uqc), np.mean(c_act_uqc), np.std(c_act_uqc)]
    #print (c_act)
    l = l + c_act
    
    #action_detail features
    #(how many times each value occurs, numb of unique values, mean and std)
    c_act_detail = [0] * len(f_act_detail)
    for i,v in enumerate(gr.action_detail.values):
        c_act_detail[f_act_detail[v]] += 1 
    _, c_act_det_uqc = np.unique(gr.action_detail.values, return_counts=True)
    c_act_detail += [len(c_act_det_uqc), np.mean(c_act_det_uqc), np.std(c_act_det_uqc)]
    l = l + c_act_detail
    
    #action_type features
    #(how many times each value occurs, numb of unique values, mean and std
    #+ log of the sum of secs_elapsed for each value)
    l_act_type = [0] * len(f_act_type)
    c_act_type = [0] * len(f_act_type)
    for i,v in enumerate(gr.action_type.values):
        l_act_type[f_act_type[v]] += sev[i]   
        c_act_type[f_act_type[v]] += 1  
    l_act_type = np.log(1 + np.array(l_act_type)).tolist()
    #print(1 + np.array(l_act_type))
    #print(l_act_type)
    _, c_act_type_uqc = np.unique(gr.action_type.values, return_counts=True)
    #print(_)
    #print(c_act_type_uqc)
    c_act_type += [len(c_act_type_uqc), np.mean(c_act_type_uqc), np.std(c_act_type_uqc)]
    l = l + c_act_type + l_act_type    
    
    #device_type features
    #(how many times each value occurs, numb of unique values, mean and std)
    c_dev_type  = [0] * len(f_dev_type)
    for i,v in enumerate(gr.device_type .values):
        c_dev_type[f_dev_type[v]] += 1 
    c_dev_type.append(len(np.unique(gr.device_type.values)))
    _, c_dev_type_uqc = np.unique(gr.device_type.values, return_counts=True)
    c_dev_type += [len(c_dev_type_uqc), np.mean(c_dev_type_uqc), np.std(c_dev_type_uqc)]        
    l = l + c_dev_type    
    #print(l)
    #secs_elapsed features        
    l_secs = [0] * 5 
    l_log = [0] * 15
    if len(sev) > 0:
        #Simple statistics about the secs_elapsed values.
        l_secs[0] = np.log(1 + np.sum(sev))
        l_secs[1] = np.log(1 + np.mean(sev)) 
        l_secs[2] = np.log(1 + np.std(sev))
        l_secs[3] = np.log(1 + np.median(sev))
        l_secs[4] = l_secs[0] / float(l[1])
        
        #Values are grouped in 15 intervals. Compute the number of values
        #in each interval.
        log_sev = np.log(1 + sev).astype(int)
        l_log = np.bincount(log_sev, minlength=15).tolist()                      
    l = l + l_secs + l_log
    #print(l_secs)
    #print(l_log)
    #break
    
    #The list l has the feature values of one sample.
    samples.append(l)
    cont += 1

0 from 135483
10000 from 135483
20000 from 135483
30000 from 135483
40000 from 135483
50000 from 135483
60000 from 135483
70000 from 135483
80000 from 135483
90000 from 135483
100000 from 135483
110000 from 135483
120000 from 135483
130000 from 135483


In [89]:
l

['zzzlylp57e',
 40,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 4,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 2,
 0,
 1,
 4,
 4,
 9,
 14,
 2.8571428571428572,
 2.1665358411575859,
 0,
 0,
 

In [95]:
#Creating a dataframe with the computed features    
col_names = []    #name of the columns
for i in range(len(samples[0])-1):
    col_names.append('c_' + str(i)) 
#preparing objects   
print(samples)
samples = np.array(samples)
samp_ar = samples[:, 1:].astype(np.float16)
samp_id = samples[:, 0]   #The first element in obs is the id of the sample.
print(samp_ar)
print(samp_id)
#creating the dataframe        
df_agg_sess = pd.DataFrame(samp_ar, columns=col_names)
df_agg_sess['id'] = samp_id
df_agg_sess.index = df_agg_sess.id

[['00023iyk9l' '40' '0' ..., '0' '1' '0']
 ['0010k6l0om' '40' '0' ..., '0' '1' '0']
 ['001wyh0pz8' '40' '0' ..., '0' '1' '0']
 ..., 
 ['zzysuoqg6x' '40' '0' ..., '0' '1' '0']
 ['zzywmcn0jv' '40' '0' ..., '0' '1' '0']
 ['zzzlylp57e' '40' '0' ..., '0' '1' '0']]
[[ 40.   0.   0. ...,   0.   1.   0.]
 [ 40.   0.   0. ...,   0.   1.   0.]
 [ 40.   0.   0. ...,   0.   1.   0.]
 ..., 
 [ 40.   0.   0. ...,   0.   1.   0.]
 [ 40.   0.   0. ...,   0.   1.   0.]
 [ 40.   0.   0. ...,   0.   1.   0.]]
['00023iyk9l' '0010k6l0om' '001wyh0pz8' ..., 'zzysuoqg6x' 'zzywmcn0jv'
 'zzzlylp57e']


In [94]:
#########Working on train and test data#####################
print('Working on users data...')
#Concatenating df_train and df_test
df_tt = pd.concat((df_train, df_test), axis=0, ignore_index=True)
df_tt.index = df_tt.id
df_tt = df_tt.fillna(-1)  #Inputing this kind of missing value with -1 (missing values in train and test)
df_tt = df_tt.replace('-unknown-', -1) #-unknown is another way of missing value, then = -1.

########Creating features for train+test
#Removing date_first_booking
df_tt = df_tt.drop(['date_first_booking'], axis=1)

#Number of nulls
df_tt['n_null'] = np.array([sum(r == -1) for r in df_tt.values])

#date_account_created
#(Computing year, month, day, week_number, weekday)
df_tt.timestamp_first_active.astype(str)
dac = np.vstack(df_tt.date_account_created.astype(str).apply(lambda x: list(map(int, x.split('-')))).values)
df_tt['dac_y'] = dac[:,0]
df_tt['dac_m'] = dac[:,1]
df_tt['dac_d'] = dac[:,2]
dac_dates = [datetime(x[0],x[1],x[2]) for x in dac]
df_tt['dac_wn'] = np.array([d.isocalendar()[1] for d in dac_dates])
df_tt['dac_w'] = np.array([d.weekday() for d in dac_dates])
df_tt_wd = pd.get_dummies(df_tt.dac_w, prefix='dac_w')
df_tt = df_tt.drop(['date_account_created', 'dac_w'], axis=1)
df_tt = pd.concat((df_tt, df_tt_wd), axis=1)

#timestamp_first_active
#(Computing year, month, day, hour, week_number, weekday)
tfa = np.vstack(df_tt.timestamp_first_active.astype(str).apply(lambda x: list(map(int, [x[:4],x[4:6],x[6:8],x[8:10],x[10:12],x[12:14]]))).values)
df_tt['tfa_y'] = tfa[:,0]
df_tt['tfa_m'] = tfa[:,1]
df_tt['tfa_d'] = tfa[:,2]
df_tt['tfa_h'] = tfa[:,3]
tfa_dates = [datetime(x[0],x[1],x[2],x[3],x[4],x[5]) for x in tfa]
df_tt['tfa_wn'] = np.array([d.isocalendar()[1] for d in tfa_dates])
df_tt['tfa_w'] = np.array([d.weekday() for d in tfa_dates])
df_tt_wd = pd.get_dummies(df_tt.tfa_w, prefix='tfa_w')
df_tt = df_tt.drop(['timestamp_first_active', 'tfa_w'], axis=1)
df_tt = pd.concat((df_tt, df_tt_wd), axis=1)

#timespans between dates
#(Computing absolute number of seconds of difference between dates, sign of the difference)
df_tt['dac_tfa_secs'] = np.array([np.log(1+abs((dac_dates[i]-tfa_dates[i]).total_seconds())) for i in range(len(dac_dates))])
df_tt['sig_dac_tfa'] = np.array([np.sign((dac_dates[i]-tfa_dates[i]).total_seconds()) for i in range(len(dac_dates))])
#    df_tt['dac_tfa_days'] = np.array([np.sign((dac_dates[i]-tfa_dates[i]).days) for i in range(len(dac_dates))])

#Comptute seasons from dates
#(Computing the season for the two dates)
Y = 2000 # dummy leap year to allow input X-02-29 (leap day)
seasons = [(0, (date(Y,  1,  1),  date(Y,  3, 20))),  #'winter'
           (1, (date(Y,  3, 21),  date(Y,  6, 20))),  #'spring'
           (2, (date(Y,  6, 21),  date(Y,  9, 22))),  #'summer'
           (3, (date(Y,  9, 23),  date(Y, 12, 20))),  #'autumn'
           (0, (date(Y, 12, 21),  date(Y, 12, 31)))]  #'winter'
def get_season(dt):
    dt = dt.date()
    dt = dt.replace(year=Y)
    return next(season for season, (start, end) in seasons
                if start <= dt <= end)
df_tt['season_dac'] = np.array([get_season(dt) for dt in dac_dates])
df_tt['season_tfa'] = np.array([get_season(dt) for dt in tfa_dates])
#df_all['season_dfb'] = np.array([get_season(dt) for dt in dfb_dates])

#Age
#(Keeping ages in 14 < age < 99 as OK and grouping others according different kinds of mistakes)
av = df_tt.age.values
av = np.where(np.logical_and(av<1998, av>1914), 2014-av, av) #This are birthdays instead of age (estimating age by doing 2014 - value)
av = np.where(np.logical_and(av<16, av>0), 4, av) #Using specific value=4 for age values below 14
av = np.where(np.logical_and(av<2017, av>2010), 9, av) #This is the current year insted of age (using specific value = 9)
av = np.where(av > 100, 110, av)  #Using specific value=110 for age values above 99
df_tt['age'] = av

#AgeRange
#(One-hot encoding of the edge according these intervals)
interv =  [0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 100]
def get_interv_value(age):
    iv = 20
    for i in range(len(interv)):
        if age < interv[i]:
            iv = i 
            break
    return iv
df_tt['age_interv'] = df_tt.age.apply(lambda x: get_interv_value(x))
df_tt_ai = pd.get_dummies(df_tt.age_interv, prefix='age_interv')
df_tt = df_tt.drop(['age_interv'], axis=1)
df_tt = pd.concat((df_tt, df_tt_ai), axis=1)

#One-hot-encoding features
ohe_feats = ['gender', 'signup_method', 'signup_flow', 'language', 'affiliate_channel', 'affiliate_provider', 'first_affiliate_tracked', 'signup_app', 'first_device_type', 'first_browser']
for f in ohe_feats:
    df_tt_dummy = pd.get_dummies(df_tt[f], prefix=f)
    df_tt = df_tt.drop([f], axis=1)
    df_tt = pd.concat((df_tt, df_tt_dummy), axis=1)    
   
######Merging train-test with session data#################
df_all = pd.merge(df_tt, df_agg_sess, how='left')
df_all = df_all.drop(['id'], axis=1)
df_all = df_all.fillna(-2)  #Missing features for samples without sesssion data.
#All types of null 
df_all['all_null'] = np.array([sum(r<0) for r in df_all.values])


######Computing X, y and X_test ################
piv_train = len(target) #Marker to split df_all into train + test
vals = df_all.values
le = LabelEncoder()

X = vals[:piv_train]
y = le.fit_transform(target.values)
X_test = vals[piv_train:]
print('Shape X = %s, Shape X_test = %s'%(X.shape, X_test.shape))


Working on users data...
Shape X = (213451, 661), Shape X_test = (62096, 661)


In [156]:
np.unique(target.values)

array(['AU', 'CA', 'DE', 'ES', 'FR', 'GB', 'IT', 'NDF', 'NL', 'PT', 'US',
       'other'], dtype=object)

In [157]:
target.values

array(['NDF', 'NDF', 'US', ..., 'NDF', 'NDF', 'NDF'], dtype=object)

In [160]:
len(np.unique(y))

12

In [104]:
print (len(y))
print (len(X))

213451
213451


In [97]:
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from xgboost.sklearn import XGBClassifier

In [98]:
clf = LogisticRegression()
clf.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [106]:
yv = clf.predict(X_test)
clf.score(X,y)

0.6302664311715569

In [100]:
yv

array([7, 7, 7, ..., 7, 7, 7])

In [101]:
X

array([[ -1.00000000e+00,   2.00000000e+00,   2.01000000e+03, ...,
         -2.00000000e+00,  -2.00000000e+00,   4.58000000e+02],
       [  3.80000000e+01,   0.00000000e+00,   2.01100000e+03, ...,
         -2.00000000e+00,  -2.00000000e+00,   4.57000000e+02],
       [  5.60000000e+01,   0.00000000e+00,   2.01000000e+03, ...,
         -2.00000000e+00,  -2.00000000e+00,   4.57000000e+02],
       ..., 
       [  3.20000000e+01,   1.00000000e+00,   2.01400000e+03, ...,
          1.00000000e+00,   0.00000000e+00,   1.00000000e+00],
       [ -1.00000000e+00,   2.00000000e+00,   2.01400000e+03, ...,
          1.00000000e+00,   0.00000000e+00,   2.00000000e+00],
       [ -1.00000000e+00,   3.00000000e+00,   2.01400000e+03, ...,
          1.00000000e+00,   0.00000000e+00,   2.00000000e+00]])

In [147]:
y

array([ 7,  7, 10, ...,  7,  7,  7])

In [107]:
from sklearn.model_selection import GridSearchCV

In [110]:
param = {
    "C":(1, 0.75, 0.5, 0.25, 0.1),
    "penalty":("l1", "l2")
}
gd = GridSearchCV(clf, param)

In [None]:
gd.fit(X, y)

In [116]:
print('Best score for training data:', gd.best_score_)
print('Best `C`:',gd.best_estimator_.C)
#print('Best kernel:',gd.best_estimator_.kernel)
#print('Best `gamma`:',gd.best_estimator_.gamma)

Best score for training data: 0.597345526608
Best `C`: 0.75


In [124]:
from xgboost.sklearn import XGBClassifier


In [144]:
import xgboost as xgb
def modelfit(alg, dtrain, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain, label=target)
        xgb_param['num_class'] = len(np.unique(target))
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(dtrain, target, eval_metric='auc')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain)
    dtrain_predprob = alg.predict_proba(dtrain)[:,1]

    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain.values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain, dtrain_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [145]:
clf_xgb = XGBClassifier(learning_rate =0.1,\
                        n_estimators=1000,\
                        max_depth=5,\
                        min_child_weight=1,\
                        gamma=0,\
                        subsample=0.8,\
                        colsample_bytree=0.8,\
                        objective="multi:softprob",\
                        nthread=4,scale_pos_weight=1,\
                        seed=27)
#modelfit(clf_xgb, train, predictors)

In [146]:
modelfit(clf_xgb, X, y)

TypeError: 'tuple' object is not callable