In [60]:
import pickle


import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
%matplotlib inline

### Load data

In [25]:
people = pd.read_csv('./data_ori/people.csv', 
                     parse_dates=['date'], 
                     dtype={'people_id': np.str, 'activity_id': np.str, 'char_38': np.int32})
train = pd.read_csv('./data_ori/act_train.csv', 
                    parse_dates=['date'], 
                    dtype={'people_id': np.str, 'activity_id': np.str, 'outcome': np.int8})
test = pd.read_csv('./data_ori/act_test.csv', 
                   parse_dates=['date'], 
                   dtype={'people_id': np.str, 'activity_id': np.str})

### Preprocess data

In [26]:
def preprocess(df):
    for col in df.columns:
        if col not in ['people_id', 'activity_id', 'date', 'char_38', 'outcome']:
            if df[col].dtype == 'object':
                df[col].fillna('type 0', inplace=True)
                df[col] = df[col].apply(lambda x: x.split(' ')[1]).astype(np.int32)
            elif df[col].dtype == 'bool':
                df[col] = df[col].astype(np.int8)
                
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['isweekend'] = (df['date'].dt.weekday >= 5).astype(int)
    df.drop('date', axis=1, inplace=True)
    
    return df

In [27]:
for df in [people, train, test]:
    df = preprocess(df)

In [28]:
people.head()

Unnamed: 0,people_id,char_1,group_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,...,char_33,char_34,char_35,char_36,char_37,char_38,year,month,day,isweekend
0,ppl_100,2,17304,2,5,5,5,3,11,2,...,0,1,1,1,0,36,2021,6,29,0
1,ppl_100002,2,8688,3,28,9,5,3,11,2,...,1,1,1,1,0,76,2021,1,6,0
2,ppl_100003,2,33592,3,4,8,5,2,5,2,...,1,1,0,1,1,99,2022,6,10,0
3,ppl_100004,2,22593,3,40,25,9,4,16,2,...,1,1,1,1,1,76,2022,7,20,0
4,ppl_100006,2,6534,3,40,25,9,3,8,2,...,0,0,1,1,0,84,2022,7,27,0


In [29]:
people.columns = [people.columns[0]] + ['p_' + c for c in people.columns[1:]]

In [30]:
train = pd.merge(train, people, how='left', on='people_id')
test = pd.merge(test, people, how='left', on='people_id')

In [31]:
train.shape[0]

2197291

In [32]:
train.activity_id.nunique()


2197291

In [33]:
train.set_index(train.activity_id, inplace=True)
test.set_index(test.activity_id, inplace=True)
train.head()

Unnamed: 0_level_0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,...,p_char_33,p_char_34,p_char_35,p_char_36,p_char_37,p_char_38,p_year,p_month,p_day,p_isweekend
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
act2_1734928,ppl_100,act2_1734928,4,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
act2_2434093,ppl_100,act2_2434093,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
act2_3404049,ppl_100,act2_3404049,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
act2_3651215,ppl_100,act2_3651215,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0
act2_4109017,ppl_100,act2_4109017,2,0,0,0,0,0,0,0,...,0,1,1,1,0,36,2021,6,29,0


In [34]:
test.head()

Unnamed: 0_level_0,people_id,activity_id,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,...,p_char_33,p_char_34,p_char_35,p_char_36,p_char_37,p_char_38,p_year,p_month,p_day,p_isweekend
activity_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
act1_249281,ppl_100004,act1_249281,1,5,10,5,1,6,1,1,...,1,1,1,1,1,76,2022,7,20,0
act2_230855,ppl_100004,act2_230855,5,0,0,0,0,0,0,0,...,1,1,1,1,1,76,2022,7,20,0
act1_240724,ppl_10001,act1_240724,1,12,1,5,4,6,1,1,...,1,1,1,1,1,90,2022,10,14,0
act1_83552,ppl_10001,act1_83552,1,20,10,5,4,6,1,1,...,1,1,1,1,1,90,2022,10,14,0
act2_1043301,ppl_10001,act2_1043301,5,0,0,0,0,0,0,0,...,1,1,1,1,1,90,2022,10,14,0


In [35]:
train.columns

Index([u'people_id', u'activity_id', u'activity_category', u'char_1',
       u'char_2', u'char_3', u'char_4', u'char_5', u'char_6', u'char_7',
       u'char_8', u'char_9', u'char_10', u'outcome', u'year', u'month', u'day',
       u'isweekend', u'p_char_1', u'p_group_1', u'p_char_2', u'p_char_3',
       u'p_char_4', u'p_char_5', u'p_char_6', u'p_char_7', u'p_char_8',
       u'p_char_9', u'p_char_10', u'p_char_11', u'p_char_12', u'p_char_13',
       u'p_char_14', u'p_char_15', u'p_char_16', u'p_char_17', u'p_char_18',
       u'p_char_19', u'p_char_20', u'p_char_21', u'p_char_22', u'p_char_23',
       u'p_char_24', u'p_char_25', u'p_char_26', u'p_char_27', u'p_char_28',
       u'p_char_29', u'p_char_30', u'p_char_31', u'p_char_32', u'p_char_33',
       u'p_char_34', u'p_char_35', u'p_char_36', u'p_char_37', u'p_char_38',
       u'p_year', u'p_month', u'p_day', u'p_isweekend'],
      dtype='object')

In [49]:
y = train['outcome']
train.drop('outcome', axis=1, inplace=True)

In [50]:
col_drop = ['people_id','activity_id']
train.drop(col_drop, axis=1, inplace=True)
test.drop(col_drop, axis=1, inplace=True)

In [51]:
def reduce_dimen(dataset,column,toreplace):
    for index,i in dataset[column].duplicated(keep=False).iteritems():
        if i==False:
            dataset.set_value(index,column,toreplace)
    return dataset

In [54]:
categorical=['p_group_1','activity_category',
             'p_char_1','p_char_2','p_char_3','p_char_4','p_char_5',
             'p_char_6','p_char_7','p_char_8','p_char_9',
             'char_2','char_3','char_4','char_5',
             'char_6','char_7','char_8','char_9']

In [55]:
for category in categorical:
    train=reduce_dimen(train,category,9999999)
    test=reduce_dimen(test,category,9999999)

In [56]:
not_categorical=[]
for category in train.columns:
    if category not in categorical:
        not_categorical.append(category)

In [57]:
enc = OneHotEncoder(handle_unknown='ignore')
enc=enc.fit(pd.concat([train[categorical],test[categorical]]))
X_cat_sparse=enc.transform(train[categorical])
X_test_cat_sparse=enc.transform(test[categorical])

In [58]:
from scipy.sparse import hstack
X_sparse=hstack((train[not_categorical], X_cat_sparse))
X_test_sparse=hstack((test[not_categorical], X_test_cat_sparse))

In [59]:
print("Training data: " + format(X_sparse.shape))
print("Test data: " + format(X_test_sparse.shape))

Training data: (2197291, 31143)
Test data: (498687, 31143)


In [63]:
with open('./data/train_features_V1.pickle', 'wb') as f:
    pickle.dump(X_sparse, f, pickle.HIGHEST_PROTOCOL)

In [64]:
with open('./data/test_features_V1.pickle', 'wb') as f:
    pickle.dump(X_test_sparse, f, pickle.HIGHEST_PROTOCOL)

In [76]:
import xgboost as xgb
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [77]:
print("###########")
print("One Hot enconded Test Dataset Script")

dtrain = xgb.DMatrix(X_sparse,label=y)
dtest = xgb.DMatrix(X_test_sparse)

param = {'max_depth':10, 'eta':0.02, 'silent':1, 'objective':'binary:logistic' }
param['nthread'] = 4
param['eval_metric'] = 'auc'
param['subsample'] = 0.7
param['colsample_bytree']= 0.7
param['min_child_weight'] = 0
param['booster'] = "gblinear"

watchlist  = [(dtrain,'train')]
num_round = 300
early_stopping_rounds=10
bst = xgb.train(param, dtrain, num_round, watchlist,early_stopping_rounds=early_stopping_rounds)

ypred = bst.predict(dtest)

###########
One Hot enconded Test Dataset Script


Will train until train error hasn't decreased in 10 rounds.
[0]	train-auc:0.888529
[1]	train-auc:0.896294
[2]	train-auc:0.904255
[3]	train-auc:0.912477
[4]	train-auc:0.920575
[5]	train-auc:0.928336
[6]	train-auc:0.935543
[7]	train-auc:0.942203
[8]	train-auc:0.948381
[9]	train-auc:0.954105
[10]	train-auc:0.959363
[11]	train-auc:0.964128
[12]	train-auc:0.968363
[13]	train-auc:0.972072
[14]	train-auc:0.975275
[15]	train-auc:0.978011
[16]	train-auc:0.980335
[17]	train-auc:0.982297
[18]	train-auc:0.983947
[19]	train-auc:0.985332
[20]	train-auc:0.986496
[21]	train-auc:0.987481
[22]	train-auc:0.988317
[23]	train-auc:0.989035
[24]	train-auc:0.989658
[25]	train-auc:0.990206
[26]	train-auc:0.990692
[27]	train-auc:0.991127
[28]	train-auc:0.991519
[29]	train-auc:0.991874
[30]	train-auc:0.992198
[31]	train-auc:0.992495
[32]	train-auc:0.992767
[33]	train-auc:0.993017
[34]	train-auc:0.993249
[35]	train-auc:0.993462
[36]	train-auc:0.993660
[37]	train-auc:0.993844
[38]	train-auc:0.994016
[39]	train-auc

In [79]:
ypred[0:10]

array([ 0.00134509,  0.00156169,  0.99946839,  0.99960226,  0.99945778,
        0.99945563,  0.99945778,  0.99945778,  0.99945778,  0.99946219], dtype=float32)

In [66]:
from ml_toolbox.kaggle import KaggleResult

In [67]:
# Load predictions based on data leak
not_leak_value = 0.123456
y_pred_leak = pd.read_csv('leak_abuse_V1.csv')

In [68]:
y_pred_leak.shape

(498687, 2)

In [69]:
y_pred_leak[y_pred_leak.outcome==not_leak_value].outcome.shape

(69073,)

In [70]:
ypred[y_pred_leak[y_pred_leak.outcome==not_leak_value].outcome.index].shape

NameError: name 'ypred' is not defined

In [34]:
y_pred_leak[y_pred_leak.outcome==not_leak_value].outcome = ypred[y_pred_leak[y_pred_leak.outcome==not_leak_value].outcome.index]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [65]:
kag = KaggleResult(y_pred_leak, test.index.values, 0.997242, 'test2', 'test2')

NameError: name 'KaggleResult' is not defined

In [63]:
print kag.validate()

(True, 'all_ok')


In [64]:
kag.upload()

0.978699