In [1]:
import pandas as pd
import csv

act_test = pd.read_csv('./data/act_test.csv', parse_dates=['date'])
act_train = pd.read_csv('./data/act_train.csv', parse_dates=['date'])
ppl = pd.read_csv('./data/people.csv', parse_dates=['date'])

df_train = pd.merge(act_train, ppl, on='people_id', suffixes=('_act', '_ppl'))
act_test['dummy']=0
df_test = pd.merge(act_test, ppl, on='people_id', suffixes=('_act', '_ppl'))
labels = df_train['outcome']

In [2]:
for table in [df_train]:
        
    table.insert(3, 'year_act', table['date_act'].dt.year)
    table.insert(4, 'month_act',table['date_act'].dt.month)
    table.insert(5, 'day_act', table['date_act'].dt.day)
    table.insert(6, 'weekday_act', table['date_act'].dt.weekday)
  
    table.insert(19, 'year_ppl', table['date_ppl'].dt.year)
    table.insert(20, 'month_ppl', table['date_ppl'].dt.month)
    table.insert(21, 'day_ppl', table['date_ppl'].dt.day)
    table.insert(22, 'weekday_ppl', table['date_ppl'].dt.weekday)
    
    table['from_join_to_act'] = (table['date_act'] - table['date_ppl']).astype('timedelta64[D]')
    
    del table['date_act']
    del table['date_ppl']

In [3]:
for table in [df_test]:
        
    table.insert(3, 'year_act', table['date_act'].dt.year)
    table.insert(4, 'month_act',table['date_act'].dt.month)
    table.insert(5, 'day_act', table['date_act'].dt.day)
    table.insert(6, 'weekday_act', table['date_act'].dt.weekday)
  
    table.insert(19, 'year_ppl', table['date_ppl'].dt.year)
    table.insert(20, 'month_ppl', table['date_ppl'].dt.month)
    table.insert(21, 'day_ppl', table['date_ppl'].dt.day)
    table.insert(22, 'weekday_ppl', table['date_ppl'].dt.weekday)
    
    table['from_join_to_act'] = (table['date_act'] - table['date_ppl']).astype('timedelta64[D]')
    
    del table['date_act']
    del table['date_ppl']

In [4]:
def preprocess_features(data, train_set=True, test_set=True):
    
    data = data.drop(['activity_id'], axis=1)
    data['people_id'] = data['people_id'].apply(lambda x: x.split('_')[1])
    data['people_id'] = pd.to_numeric(data['people_id']).astype(int)
    
    if(train_set):
        data = data.drop(['outcome'], axis=1)
        
    else:
        data = data.drop(['dummy'], axis=1)
    
    columns = list(data.columns)
    bools = columns[30:58]
    strings_act = columns[5:16]
    strings_ppl = columns[20:30]
    
    for col in bools:
        data[col] = pd.to_numeric(data[col]).astype(int)

    for col in strings_act:
        data[col] = data[col].fillna('type 0')
        data[col] = data[col].apply(lambda x: x.split(' ')[1])
        data[col] = pd.to_numeric(data[col].astype(int))
        
    for col in strings_ppl:
        data[col] = data[col].fillna('type 0')
        data[col] = data[col].apply(lambda x: x.split(' ')[1])
        data[col] = pd.to_numeric(data[col].astype(int))
        
    return data

In [5]:
feature_train = preprocess_features(df_train)
feature_test = preprocess_features(df_test, train_set=False)

In [6]:
# reference: https://www.kaggle.com/anokas/redhatxgb-bb505a62

from sklearn.cross_validation import LabelKFold
import numpy as np

train_mask, valid_mask = list(LabelKFold(feature_train['people_id'], n_folds=10))[0]

feature_test = feature_test.drop(['people_id'],axis=1)
feature_train = feature_train.drop(['people_id'], axis=1)

x_train = np.array(feature_train)[train_mask]
y_train = np.array(labels)[train_mask]

x_valid = np.array(feature_train)[valid_mask]
y_valid = np.array(labels)[valid_mask]



In [7]:
# parameter tuning on XGBoost
# reference : https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['eta'] = 0.1
params['max_depth'] = 20
params['subsample'] = 0.9
params['colsample_bytree'] = 0.9

In [8]:
import xgboost as xgb

d_train = xgb.DMatrix(x_train, label=y_train)
d_valid = xgb.DMatrix(x_valid, label=y_valid)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

In [9]:
%%time

# reference : http://xgboost.readthedocs.io/en/latest/python/python_api.html

clf = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=40)

[0]	train-auc:0.993351	valid-auc:0.964933
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 40 rounds.
[1]	train-auc:0.99672	valid-auc:0.967792
[2]	train-auc:0.997698	valid-auc:0.969695
[3]	train-auc:0.998331	valid-auc:0.971227
[4]	train-auc:0.998782	valid-auc:0.972516
[5]	train-auc:0.998955	valid-auc:0.973261
[6]	train-auc:0.999041	valid-auc:0.974122
[7]	train-auc:0.9991	valid-auc:0.974395
[8]	train-auc:0.999262	valid-auc:0.974711
[9]	train-auc:0.999354	valid-auc:0.975073
[10]	train-auc:0.999419	valid-auc:0.975291
[11]	train-auc:0.999502	valid-auc:0.975249
[12]	train-auc:0.999539	valid-auc:0.975784
[13]	train-auc:0.99958	valid-auc:0.976311
[14]	train-auc:0.999624	valid-auc:0.976443
[15]	train-auc:0.999667	valid-auc:0.977109
[16]	train-auc:0.999702	valid-auc:0.97755
[17]	train-auc:0.999726	valid-auc:0.977908
[18]	train-auc:0.999752	valid-auc:0.978182
[19]	train-auc:0.999773	valid-auc:0.97838
[20]	train-au

[198]	train-auc:1	valid-auc:0.982165
[199]	train-auc:1	valid-auc:0.982197
[200]	train-auc:1	valid-auc:0.982185
[201]	train-auc:1	valid-auc:0.982186
[202]	train-auc:1	valid-auc:0.982193
[203]	train-auc:1	valid-auc:0.982207
[204]	train-auc:1	valid-auc:0.982211
[205]	train-auc:1	valid-auc:0.98222
[206]	train-auc:1	valid-auc:0.982216
[207]	train-auc:1	valid-auc:0.982235
[208]	train-auc:1	valid-auc:0.982225
[209]	train-auc:1	valid-auc:0.982216
[210]	train-auc:1	valid-auc:0.982212
[211]	train-auc:1	valid-auc:0.982199
[212]	train-auc:1	valid-auc:0.982191
[213]	train-auc:1	valid-auc:0.982197
[214]	train-auc:1	valid-auc:0.982202
[215]	train-auc:1	valid-auc:0.982203
[216]	train-auc:1	valid-auc:0.982206
[217]	train-auc:1	valid-auc:0.982214
[218]	train-auc:1	valid-auc:0.982206
[219]	train-auc:1	valid-auc:0.98221
[220]	train-auc:1	valid-auc:0.982206
[221]	train-auc:1	valid-auc:0.982217
[222]	train-auc:1	valid-auc:0.982219
[223]	train-auc:1	valid-auc:0.98222
[224]	train-auc:1	valid-auc:0.982222
[225

In [10]:
# make an outcome to submit
p_test = clf.predict(xgb.DMatrix(np.array(feature_test)))

sub = pd.DataFrame()
sub['activity_id'] = df_test['activity_id']
sub['outcome'] = p_test
sub.to_csv('submit.csv', index=False)

In [14]:
importance = clf.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
df['fscore'] = df['fscore'] / df['fscore'].sum()

plt.figure()
df.plot()
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')

NameError: name 'operator' is not defined

In [12]:
fscore = clf.best_estimator_.booster().get_fscore()

AttributeError: 'Booster' object has no attribute 'best_estimator_'

In [11]:
print(clf.feature_importances_)

AttributeError: 'Booster' object has no attribute 'feature_importances_'

In [None]:
'''Feature importance analysis code'''
'''Reference: Python Machine Learning'''
feat_labels = np.array(dfX6.columns)
forest = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs = 3)
forest.fit(dfX_train, dfY_train)

importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
indices = indices[:30]

for f in range(len(indices)):
    print("%2d %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]] ))

plt.figure(figsize = (9, 7))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices])
plt.xticks(range(len(indices)), feat_labels[indices], rotation=90)
plt.tight_layout()
plt.show()