In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedKFold,KFold,train_test_split
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score

np.random.seed(22)
import datetime
import random
from operator import itemgetter
import time
import copy

In [4]:
# Load the stored training and validation matrices.
dtrain = xgb.DMatrix('to_r_n_back/dtrain.data')
dval = xgb.DMatrix('to_r_n_back/dtest.data')

# The validation targets live in a separate CSV; take its `outcome` column.
yval = pd.read_csv('to_r_n_back/val1_target.csv')['outcome'].values

# Label-encode the targets and attach them to the validation matrix.
lbl_enc = preprocessing.LabelEncoder()
labels = lbl_enc.fit_transform(yval)
dval.set_label(labels)

In [12]:
# Booster parameters handed to xgb.train for the binary classifier.
# The number of boosting rounds is NOT set here: xgb.train takes it as its own
# argument, so the scikit-learn style 'n_estimators' key has been dropped.
param = {
    # task and metric
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    # tree shape
    'max_depth': 11,
    'min_child_weight': 7,
    'gamma': 0.038587401190034704,
    # row / column sampling
    'subsample': 0.5447189256867526,
    'colsample_bytree': 0.26928766415604755,
    'colsample_bylevel': 0.40883831209377614,
    # regularisation
    'lambda': 3.480389590147552,
    'alpha': 0.7707414382224765,
    # learning rate
    'eta': 0.05,
    # runtime
    'seed': 5,
    'nthread': 4,
    'silent': 1,
}

In [13]:
# Report AUC on both sets while boosting; the log shows 'eval-auc' (the last
# entry) is the one used for early stopping.
evals = [(dtrain, 'train'), (dval, 'eval')]
num_round = 100000  # upper bound on rounds; early stopping may end the run sooner
bst = xgb.train(
    param,
    dtrain,
    num_boost_round=num_round,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=10,
)

[0]	train-auc:0.923093	eval-auc:0.522593
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 50 rounds.
[10]	train-auc:0.950549	eval-auc:0.628519
[20]	train-auc:0.955481	eval-auc:0.636656
[30]	train-auc:0.959203	eval-auc:0.648139
[40]	train-auc:0.96225	eval-auc:0.656799
[50]	train-auc:0.964075	eval-auc:0.661581
[60]	train-auc:0.965645	eval-auc:0.665197
[70]	train-auc:0.96736	eval-auc:0.67181
[80]	train-auc:0.968411	eval-auc:0.676265
[90]	train-auc:0.969806	eval-auc:0.677998
[100]	train-auc:0.970716	eval-auc:0.67919
[110]	train-auc:0.971355	eval-auc:0.680881
[120]	train-auc:0.972046	eval-auc:0.682143
[130]	train-auc:0.972231	eval-auc:0.686029
[140]	train-auc:0.972568	eval-auc:0.685422
[150]	train-auc:0.972823	eval-auc:0.686412
[160]	train-auc:0.973142	eval-auc:0.686651
[170]	train-auc:0.97342	eval-auc:0.68611
[180]	train-auc:0.973651	eval-auc:0.686501
[190]	train-auc:0.973832	eval-auc:0.687325
[200]	train-auc:

In [2]:
# Load the `dtrain` and `dtest` matrices from the svmlight_try2 directory.
# NOTE(review): this rebinds `dtrain`, which an earlier cell loaded from a different path — confirm the intended run order.
dtrain = xgb.DMatrix('svmlight_try2/dtrain.data')
dtest = xgb.DMatrix('svmlight_try2/dtest.data')

In [3]:
# Test-set activities: both id columns are read as strings and `date` is parsed.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
act_test_data = pd.read_csv(
    "redhat_data_new/act_test_new_try2.csv",
    dtype={'people_id': str, 'activity_id': str},
    parse_dates=['date'],
)
# Existing submission file; the cells below use its `activity_id` and `outcome` columns.
df1 = pd.read_csv('redhat_data_new/Submission_leak_happycube_python.csv')

In [4]:
# Activity ids that appear both in the test table and in the submission file.
c = list(
    set(act_test_data['activity_id'].unique())
    .intersection(set(df1['activity_id'].unique()))
)
len(c)

82155

In [5]:
# Split the submission rows by whether their activity_id is in `c`.
in_c = df1['activity_id'].isin(c)
ac = df1.loc[in_c]    # rows whose id is shared with the test table
ad = df1.loc[~in_c]   # all remaining rows
ac.shape, ad.shape

((82155, 2), (416532, 2))

In [6]:
# Keep only the rows whose outcome is exactly 0 or exactly 1.
is_zero_or_one = ac['outcome'].eq(1) | ac['outcome'].eq(0)
ae = ac[is_zero_or_one]

In [7]:
# Activity ids shared by the test table and the 0/1-outcome rows in `ae`.
d = list(
    set(act_test_data['activity_id'].unique())
    .intersection(set(ae['activity_id'].unique()))
)
len(d)

8730

In [8]:
# Test-table rows whose activity_id is one of the ids collected in `d`.
in_d = act_test_data['activity_id'].isin(d)
af = act_test_data.loc[in_d]
af.shape

(8730, 14)

In [9]:
# Save af's current index (inherited from act_test_data) before a later cell
# replaces it with activity ids; it is reused to re-index `ag` and to slice `dtest`.
indx = af.index

In [10]:
# Index `ae` by activity_id so its rows can be looked up by id.
ae.index = ae.activity_id.values

In [11]:
# Preview `ae` with its current index.
ae.head()

Unnamed: 0,activity_id,outcome
act2_1012472,act2_1012472,0.0
act2_1058853,act2_1058853,1.0
act2_1169467,act2_1169467,1.0
act2_1661559,act2_1661559,1.0
act2_1832319,act2_1832319,1.0


In [12]:
# Index `af` by activity_id too, so it shares a key with `ae`.
af.index = af.activity_id.values
af.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
act2_4091938,ppl_101994,act2_4091938,2023-01-13,type 5,,,,,,,,,,type 2290
act2_439818,ppl_101994,act2_439818,2023-01-03,type 5,,,,,,,,,,type 2290
act2_447050,ppl_101994,act2_447050,2023-01-01,type 5,,,,,,,,,,type 2290
act2_4474309,ppl_101994,act2_4474309,2023-01-03,type 3,,,,,,,,,,type 835
act2_4478764,ppl_101994,act2_4478764,2023-08-18,type 4,,,,,,,,,,type 633


In [13]:
# Reorder the 0/1-outcome rows of `ae` to follow the row order of `af`.
# Both frames are indexed by activity_id here, so this is a label lookup;
# `.loc` replaces `.ix`, which pandas deprecated in 0.20 and removed in 1.0.
ag = ae.loc[af.index]
ag.head()

Unnamed: 0,activity_id,outcome
act2_4091938,act2_4091938,0.0
act2_439818,act2_439818,1.0
act2_447050,act2_447050,1.0
act2_4474309,act2_4474309,1.0
act2_4478764,act2_4478764,1.0


In [14]:
# Replace the activity_id index on `ag` with the index saved in `indx`,
# i.e. the index `af` had when it was selected from act_test_data.
ag.index = indx
ag.head()

Unnamed: 0,activity_id,outcome
451,act2_4091938,0.0
456,act2_439818,1.0
457,act2_447050,1.0
458,act2_4474309,1.0
459,act2_4478764,1.0


In [15]:
# Discard the activity_id index on `ae` and go back to a default integer index (in place).
# `ae` is not read again in the cells visible after this one.
ae.reset_index(drop=True,inplace=True)

In [16]:
# Preview the test activity table.
act_test_data.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10
0,ppl_100179,act2_2277070,2022-09-02,type 3,,,,,,,,,,type 23
1,ppl_100179,act2_3152044,2022-09-02,type 2,,,,,,,,,,type 1
2,ppl_100322,act2_927116,2023-01-24,type 2,,,,,,,,,,type 1
3,ppl_100322,act2_534352,2023-01-24,type 2,,,,,,,,,,type 1
4,ppl_100322,act2_471550,2023-01-11,type 2,,,,,,,,,,type 1


In [17]:
# Check whether the sliced test matrix already carries labels; the recorded output is an empty array.
dtest.slice(indx).get_label()

array([], dtype=float32)

In [20]:
# Booster parameters for the second run. The round count is passed to
# xgb.train directly, so the scikit-learn style 'n_estimators' key (which
# xgb.train does not use) has been dropped.
param1 = {
    # task, booster and metric
    'objective': 'binary:logistic',
    'booster': 'gbtree',
    'eval_metric': 'auc',
    # tree shape
    'max_depth': 11,
    'min_child_weight': 7,
    'gamma': 0.038587401190034704,
    # row / column sampling
    'subsample': 0.5447189256867526,
    'colsample_bytree': 0.26928766415604755,
    'colsample_bylevel': 0.40883831209377614,
    # regularisation
    'lambda': 3.480389590147552,
    'alpha': 0.7707414382224765,
    # learning rate
    'eta': 0.3,
    # runtime
    'seed': 5,
    'nthread': 20,
    'silent': 1,
}

# Validation set: the test rows selected by `indx`, labelled with the outcomes
# that were aligned to those rows in `ag`.
dval = dtest.slice(indx)
dval.set_label(ag.outcome.values)

# 'eval' is listed last; the log shows 'eval-auc' drives early stopping.
evals = [(dtrain, 'train'), (dval, 'eval')]
num_round = 200000  # upper bound on rounds; early stopping may end the run sooner
bst = xgb.train(
    param1,
    dtrain,
    num_boost_round=num_round,
    evals=evals,
    early_stopping_rounds=200,
    verbose_eval=10,
)

[0]	train-auc:0.908158	eval-auc:0.505321
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[10]	train-auc:0.957492	eval-auc:0.571991
[20]	train-auc:0.965017	eval-auc:0.580533
[30]	train-auc:0.967387	eval-auc:0.592774
[40]	train-auc:0.968815	eval-auc:0.613871
[50]	train-auc:0.969848	eval-auc:0.613001
[60]	train-auc:0.970632	eval-auc:0.616749
[70]	train-auc:0.971198	eval-auc:0.61235
[80]	train-auc:0.971735	eval-auc:0.613916
[90]	train-auc:0.972309	eval-auc:0.619753
[100]	train-auc:0.972774	eval-auc:0.623228
[110]	train-auc:0.973174	eval-auc:0.628667
[120]	train-auc:0.973567	eval-auc:0.63815
[130]	train-auc:0.974325	eval-auc:0.641739
[140]	train-auc:0.97466	eval-auc:0.643106
[150]	train-auc:0.974987	eval-auc:0.641661
[160]	train-auc:0.975269	eval-auc:0.644565
[170]	train-auc:0.97589	eval-auc:0.649404
[180]	train-auc:0.97611	eval-auc:0.653021
[190]	train-auc:0.9764	eval-auc:0.652408
[200]	train-auc:

In [22]:
# Predict for every row of dtest, limiting the trees used to bst.best_ntree_limit.
# NOTE(review): `ntree_limit` is reportedly deprecated in newer XGBoost releases in favour of `iteration_range` — confirm against the installed version before changing it.
ypred = bst.predict(dtest, ntree_limit=bst.best_ntree_limit)

In [25]:
# Re-read the test activities to obtain the activity_id column for the submission.
# The builtin `str` is used because the `np.str` alias was deprecated in
# NumPy 1.20 and removed in 1.24.
act_test_data1 = pd.read_csv(
    "redhat_data_new/act_test_new_try2.csv",
    dtype={'people_id': str, 'activity_id': str},
    parse_dates=['date'],
)

In [31]:
# Pair each test activity_id with the model's prediction and write the submission file.
output = pd.DataFrame({'activity_id': act_test_data1['activity_id'], 'outcome': ypred})
output.to_csv('model_sub_81k_try2.csv', index=False)
# The preview goes last: a bare expression is only rendered when it ends the cell.
output.head()