In [1]:
import numpy as np 
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedKFold,KFold,train_test_split
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score

import datetime
import random
from operator import itemgetter
import time
import copy

In [2]:
def reduce_dimen(dataset,column,toreplace):
    for index,i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset
    
def act_data_treatment(dsname):
    dataset = dsname
    
    for col in list(dataset.columns):
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if dataset[col].dtype == 'object':
                dataset[col].fillna('type 0', inplace=True)
                dataset[col] = dataset[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif dataset[col].dtype == 'bool':
                dataset[col] = dataset[col].astype(np.int8)
    
    #dataset['year'] = dataset['date'].dt.year
    #dataset['month'] = dataset['date'].dt.month
    #dataset['day'] = dataset['date'].dt.day
    #dataset['isweekend'] = (dataset['date'].dt.weekday >= 5).astype(int)
    dataset = dataset.drop('date', axis = 1)
    
    return dataset

In [3]:
act_train_data = pd.read_csv("input/act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("input/act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("input/people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

act_train_data=act_train_data.drop('char_10',axis=1)
act_test_data=act_test_data.drop('char_10',axis=1)

print("Train data shape: " + format(act_train_data.shape))
print("Test data shape: " + format(act_test_data.shape))
print("People data shape: " + format(people_data.shape))

act_train_data  = act_data_treatment(act_train_data)
act_test_data   = act_data_treatment(act_test_data)
people_data = act_data_treatment(people_data)

train = act_train_data.merge(people_data, on='people_id', how='left', left_index=True)
test  = act_test_data.merge(people_data, on='people_id', how='left', left_index=True)

del act_train_data
del act_test_data
del people_data

train=train.sort_values(['people_id'], ascending=[1])
test=test.sort_values(['people_id'], ascending=[1])

train_columns = train.columns.values
test_columns = test.columns.values
features = list(set(train_columns) & set(test_columns))

train.fillna('NA', inplace=True)
test.fillna('NA', inplace=True)

y = train.outcome
train=train.drop('outcome',axis=1)

whole=pd.concat([train,test],ignore_index=True)
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
for category in categorical:
    whole=reduce_dimen(whole,category,9999999)
    
X=whole[:len(train)]
X_test=whole[len(train):]

del train
del whole
    
X=X.sort_values(['people_id'], ascending=[1])

X = X[features].drop(['people_id', 'activity_id', 'char_1_y'], axis = 1)
X_test = X_test[features].drop(['people_id', 'activity_id', 'char_1_y'], axis = 1)

#list categorical features and label them
from sklearn.preprocessing import LabelEncoder
categorical=['group_1','activity_category','char_1_x','char_2_x','char_3_x','char_4_x','char_5_x','char_6_x','char_7_x','char_8_x','char_9_x','char_2_y','char_3_y','char_4_y','char_5_y','char_6_y','char_7_y','char_8_y','char_9_y']
not_categorical=[]
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)
    else:
        temp = pd.concat([X[category],X_test[category]])
        le = LabelEncoder()
        le.fit(temp.values)
        X[category] = le.transform(X[category].values)
        X_test[category] = le.transform(X_test[category].values) 

enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([X[categorical],X_test[categorical]]))
X_cat_sparse=enc.transform(X[categorical])
X_test_cat_sparse=enc.transform(X_test[categorical])

Train data shape: (2197291, 14)
Test data shape: (498687, 13)
People data shape: (189118, 41)


In [4]:
X[not_categorical].shape, X[categorical].shape, X_test[not_categorical].shape, X_test[categorical].shape

((2197291, 29), (2197291, 19), (498687, 29), (498687, 19))

In [5]:
from scipy.sparse import hstack
X_sparse=hstack((X[not_categorical], X_cat_sparse))
X_test_sparse=hstack((X_test[not_categorical], X_test_cat_sparse))

print("Training data: " + format(X_sparse.shape))
print("Test data: " + format(X_test_sparse.shape))
print("###########")
print("One Hot enconded Test Dataset Script")

Training data: (2197291, 31262)
Test data: (498687, 31262)
###########
One Hot enconded Test Dataset Script


In [7]:
dtrain = xgb.DMatrix('input/svmlight_raddar/dtrain.data')
dtest = xgb.DMatrix('input/svmlight_raddar/dtest.data')

In [13]:
dtrain1 = xgb.DMatrix(X_sparse,label=y)
dtest1 = xgb.DMatrix(X_test_sparse)

In [24]:
dtest.num_col()

14140

In [20]:
param = {'max_depth':10, 'eta':0.02, 'silent':0, 'objective':'binary:logistic' }
#param['nthread'] = 2
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['booster'] = "gblinear"

In [21]:
del X_sparse
del X_test_sparse

In [23]:
np.random.seed(120)
evals  = [(dtrain,'train')]
num_round = 305
bst = xgb.train(param, dtrain, num_round, evals, verbose_eval=10)

[0]	train-auc:0.878421
[10]	train-auc:0.947333
[20]	train-auc:0.97316
[30]	train-auc:0.981495
[40]	train-auc:0.984999
[50]	train-auc:0.986683
[60]	train-auc:0.987606
[70]	train-auc:0.988173
[80]	train-auc:0.988551
[90]	train-auc:0.988826
[100]	train-auc:0.989029
[110]	train-auc:0.989185
[120]	train-auc:0.989307
[130]	train-auc:0.989399
[140]	train-auc:0.989472
[150]	train-auc:0.989533
[160]	train-auc:0.989588
[170]	train-auc:0.989633
[180]	train-auc:0.989672
[190]	train-auc:0.989707
[200]	train-auc:0.989737
[210]	train-auc:0.989764
[220]	train-auc:0.989787
[230]	train-auc:0.989808
[240]	train-auc:0.989827
[250]	train-auc:0.989845
[260]	train-auc:0.98986
[270]	train-auc:0.989874
[280]	train-auc:0.989887
[290]	train-auc:0.989897
[300]	train-auc:0.989904


In [25]:
ypred = bst.predict(dtest)
output = pd.DataFrame({ 'activity_id' : test['activity_id'], 'outcome': ypred })
output.head()
output.to_csv('without_leak.csv', index = False)
!zip subb1.zip without_leak.csv

  adding: without_leak.csv (deflated 77%)


In [26]:
dff = pd.read_csv('input/Submission_leak_python.csv')

In [56]:
x = dff[(dff.outcome==0)|(dff.outcome==1)]

In [33]:
dff.shape

(498687, 2)

In [34]:
498687-384061

114626

In [45]:
y.sum()

975497

In [52]:
y.sum()/y.shape[0]

0.44395439657287089

In [50]:
y.shape

(2197291,)

In [54]:
2197291/2

1098645.5

In [58]:
x.outcome.sum()

187773.0

In [61]:
x.outcome.sum()/x.shape[0]

0.48891452139113317

In [64]:
dff.head()

Unnamed: 0,activity_id,outcome
0,act1_249281,0.0
1,act2_230855,0.0
2,act1_240724,1.0
3,act1_83552,1.0
4,act2_1043301,1.0


In [67]:
dff[(dff.outcome==0)|(dff.outcome>0.85)].shape

(413196, 2)

In [103]:
dff.shape[0]-dff[(dff.outcome<0.1)|(dff.outcome>0.9)].shape[0]

71647

In [104]:
dff[(dff.outcome==0)|(dff.outcome==1)].shape[0]

384061

In [110]:
dff.shape[0]-x.shape[0]

114626

In [106]:
x.head()

Unnamed: 0,activity_id,outcome
0,act1_249281,0.0
1,act2_230855,0.0
2,act1_240724,1.0
3,act1_83552,1.0
4,act2_1043301,1.0


In [111]:
new_test = list(set(dff.activity_id.unique())-set(x.activity_id.unique()))

In [113]:
len(new_test)

114626

In [115]:
subb2 = pd.concat([pd.Series(new_test), pd.Series([0.5]*114626)], axis=1)

In [118]:
subb2.columns = ['activity_id', 'outcome']

In [119]:
subb2.head()

Unnamed: 0,activity_id,outcome
0,act2_2480443,0.5
1,act2_1981398,0.5
2,act2_1370557,0.5
3,act2_1950331,0.5
4,act2_587795,0.5


In [120]:
fin_sub_df = x.append(subb2, ignore_index=True)

In [128]:
dff.index = dff.activity_id.values

In [132]:
dff.shape

(498687, 2)

In [133]:
fin_sub_df.index = fin_sub_df.activity_id.values

In [135]:
fin_sub_df.tail()

Unnamed: 0,activity_id,outcome
act2_2703252,act2_2703252,0.5
act2_1150191,act2_1150191,0.5
act2_3564349,act2_3564349,0.5
act2_4793812,act2_4793812,0.5
act2_4040530,act2_4040530,0.5


In [136]:
dff.tail()

Unnamed: 0,activity_id,outcome
act2_4367092,act2_4367092,0.0
act2_4404220,act2_4404220,0.0
act2_448830,act2_448830,0.0
act2_450133,act2_450133,0.0
act2_847967,act2_847967,0.0


In [138]:
y = fin_sub_df.ix[dff.index]

In [139]:
y.tail()

Unnamed: 0,activity_id,outcome
act2_4367092,act2_4367092,0.0
act2_4404220,act2_4404220,0.0
act2_448830,act2_448830,0.0
act2_450133,act2_450133,0.0
act2_847967,act2_847967,0.0


In [140]:
y.shape

(498687, 2)

In [142]:
y.to_csv('sub_leak_test_0-1.csv', index=False)

In [143]:
xxx = pd.read_csv('sub_leak_test_0-1.csv')

In [146]:
xxx.head()

Unnamed: 0,activity_id,outcome
0,act1_249281,0.0
1,act2_230855,0.0
2,act1_240724,1.0
3,act1_83552,1.0
4,act2_1043301,1.0


In [150]:
xxx[xxx.outcome==0.5].shape

(114626, 2)

In [151]:
384061+114626

498687

In [152]:
!zip sub_leak_test.zip sub_leak_test_0-1.csv

  adding: sub_leak_test_0-1.csv (deflated 73%)


In [153]:
384061/498687

0.770144399192279

In [155]:
187767,33600,81026

(187767, 33600, 81026)

In [156]:
df1 = pd.read_csv('input/Submission_leak_loisso.csv')

In [158]:
df1.head()

Unnamed: 0,activity_id,outcome
0,act1_74825,1.0
1,act2_3763192,1.0
2,act2_1526630,1.0
3,act2_1814985,1.0
4,act2_1907844,1.0


In [214]:
df1[(df1.outcome==0)|(df1.outcome==1)].shape[0]

384061

In [213]:
dff[(dff.outcome>0.49)&(dff.outcome<0.51)].shape[0]

69129

In [184]:
xx = dff[(dff.outcome>0.49)&(dff.outcome<0.51)]

In [185]:
xy = df1[(df1.outcome>0.49)&(df1.outcome<0.51)]

In [186]:
xx.shape

(69129, 2)

In [187]:
xy.shape

(72331, 2)

In [188]:
dtrain.num_row()

2197291

In [193]:
2197291/384061

5.7212031422091805

In [206]:
xy.outcome.unique()

array([ 0.50566869,  0.5       ])

In [215]:
act_train_data = pd.read_csv("input/act_train.csv",dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("input/act_test.csv", dtype={'people_id': np.str, 'activity_id': np.str}, parse_dates=['date'])
people_data    = pd.read_csv("input/people.csv", dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32}, parse_dates=['date'])

In [217]:
act_train_data.head()

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
0,ppl_100,act2_1734928,2023-08-26,type 4,,,,,,,,,,type 76,0
1,ppl_100,act2_2434093,2022-09-27,type 2,,,,,,,,,,type 1,0
2,ppl_100,act2_3404049,2022-09-27,type 2,,,,,,,,,,type 1,0
3,ppl_100,act2_3651215,2023-08-04,type 2,,,,,,,,,,type 1,0
4,ppl_100,act2_4109017,2023-08-26,type 2,,,,,,,,,,type 1,0


In [245]:
act_test_data.tail(5)

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
498682,ppl_99997,act2_4367092,2023-04-22,type 2,,,,,,,,,,type 1,0.0
498683,ppl_99997,act2_4404220,2022-11-12,type 2,,,,,,,,,,type 1,0.0
498684,ppl_99997,act2_448830,2022-08-02,type 2,,,,,,,,,,type 1,0.0
498685,ppl_99997,act2_450133,2022-08-02,type 2,,,,,,,,,,type 1,0.0
498686,ppl_99997,act2_847967,2022-10-15,type 2,,,,,,,,,,type 1,0.0


In [223]:
xxx[(xxx.outcome==0)|(xxx.outcome==1)].shape

(384061, 2)

In [244]:
xxx.tail(5)

Unnamed: 0,activity_id,outcome
498682,act2_4367092,0.0
498683,act2_4404220,0.0
498684,act2_448830,0.0
498685,act2_450133,0.0
498686,act2_847967,0.0


In [237]:
act_test_data['outcome'] = xxx['outcome']

In [248]:
ab = act_test_data[(act_test_data.outcome==0)|(act_test_data.outcome==1)]
aa = act_test_data[(act_test_data.outcome>0)&(act_test_data.outcome<1)]

In [249]:
ab.shape

(384061, 15)

In [250]:
aa.shape

(114626, 15)

In [251]:
tr = act_train_data.append(ab, ignore_index=True)

In [264]:
tr.tail(6)

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9,char_10,outcome
2581346,ppl_99997,act2_4355610,2023-06-10,type 2,,,,,,,,,,type 1,0.0
2581347,ppl_99997,act2_4367092,2023-04-22,type 2,,,,,,,,,,type 1,0.0
2581348,ppl_99997,act2_4404220,2022-11-12,type 2,,,,,,,,,,type 1,0.0
2581349,ppl_99997,act2_448830,2022-08-02,type 2,,,,,,,,,,type 1,0.0
2581350,ppl_99997,act2_450133,2022-08-02,type 2,,,,,,,,,,type 1,0.0
2581351,ppl_99997,act2_847967,2022-10-15,type 2,,,,,,,,,,type 1,0.0


In [265]:
trainn = tr.sort_values(by='people_id')

In [267]:
trainn.outcome.value_counts()

0.0    1418082
1.0    1163270
Name: outcome, dtype: int64

In [269]:
trainn.to_csv('act_train_new.csv', index=False)

In [272]:
aa.drop(['outcome'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [274]:
aa.sort_values(by='people_id',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [277]:
aa.to_csv('act_test_new.csv', index=False)