In [266]:
import numpy as np
import pandas as pd
import xgboost as xgb

from xgboost.sklearn import XGBClassifier

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn.model_selection  import StratifiedKFold, KFold, train_test_split
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score

import datetime
import random
from operator import itemgetter
import time
import copy

In [267]:
# Load the three raw CSV files. Identifier columns are read as strings and the
# 'date' column of each file is parsed into datetimes.
# The builtin `str` is used instead of `np.str`: `np.str` was only an alias of
# the builtin and no longer exists in current NumPy releases.
act_train_data = pd.read_csv("datasets/act_train.csv", dtype={'people_id': str, 'activity_id': str, 'outcome': np.int8}, parse_dates=['date'])
act_test_data  = pd.read_csv("datasets/act_test.csv", dtype={'people_id': str, 'activity_id': str}, parse_dates=['date'])
people_data = pd.read_csv("datasets/people.csv", dtype={'people_id': str, 'activity_id': str, 'char_38': np.int32}, parse_dates=['date'])


In [268]:
# NOTE(review): DataFrame.drop returns a new frame and neither result is
# assigned, so 'char_10' stays in both frames; the cell only displays the
# test frame without that column. Confirm whether the column was meant to be
# removed for good.
act_train_data.drop(labels=['char_10'], axis='columns')
act_test_data.drop(labels=['char_10'], axis='columns')

Unnamed: 0,people_id,activity_id,date,activity_category,char_1,char_2,char_3,char_4,char_5,char_6,char_7,char_8,char_9
0,ppl_100004,act1_249281,2022-07-20,type 1,type 5,type 10,type 5,type 1,type 6,type 1,type 1,type 7,type 4
1,ppl_100004,act2_230855,2022-07-20,type 5,,,,,,,,,
2,ppl_10001,act1_240724,2022-10-14,type 1,type 12,type 1,type 5,type 4,type 6,type 1,type 1,type 13,type 10
3,ppl_10001,act1_83552,2022-11-27,type 1,type 20,type 10,type 5,type 4,type 6,type 1,type 1,type 5,type 5
4,ppl_10001,act2_1043301,2022-10-15,type 5,,,,,,,,,
5,ppl_10001,act2_112890,2022-11-27,type 5,,,,,,,,,
6,ppl_10001,act2_1169930,2022-10-15,type 5,,,,,,,,,
7,ppl_10001,act2_1924448,2022-10-15,type 5,,,,,,,,,
8,ppl_10001,act2_1953554,2022-10-15,type 5,,,,,,,,,
9,ppl_10001,act2_1971739,2022-11-28,type 5,,,,,,,,,


In [269]:
# Report the (rows, columns) shape of each raw frame.
print("Train data shape: " + str(act_train_data.shape))
print("Test data shape: " + str(act_test_data.shape))
print("People data shape: " + str(people_data.shape))

Train data shape: (2197291, 15)
Test data shape: (498687, 14)
People data shape: (189118, 41)


In [270]:
def act_data_treatment(dsname):
    """Return a numeric-encoded copy of ``dsname`` without its 'date' column.

    Every column except 'people_id', 'activity_id', 'char_38' and 'outcome'
    is converted as follows:

    * object columns holding '<word> <number>' strings (e.g. 'type 5') are
      reduced to the number as int32; missing values are treated as 'type 0'
      and therefore become 0;
    * bool columns become int8 (True -> 1, False -> 0).

    The input frame is left untouched, so the function can be called again on
    the same frame.

    Parameters
    ----------
    dsname : pandas.DataFrame
        Frame to convert. Must contain a 'date' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the converted columns and 'date' removed.

    Raises
    ------
    KeyError
        If ``dsname`` has no 'date' column.
    """
    untouched = ('people_id', 'activity_id', 'char_38', 'outcome')

    # drop() hands back a new frame, so the column assignments below never
    # write into the caller's data.
    dataset = dsname.drop('date', axis=1)

    for col in list(dataset.columns):
        if col in untouched:
            continue

        column = dataset[col]
        if column.dtype == 'object':
            # Fill first so the split never sees a NaN, then keep the token
            # after the first space ('type 5' -> '5').
            tokens = column.fillna('type 0').str.split(' ').str[1]
            dataset[col] = tokens.astype(np.int32)
        elif column.dtype == 'bool':
            dataset[col] = column.astype(np.int8)

    return dataset


In [271]:
# Run the same numeric conversion over the three raw frames, in the order
# train activities, test activities, people, and rebind each name to its result.
act_train_data, act_test_data, people_data = (
    act_data_treatment(raw_frame)
    for raw_frame in (act_train_data, act_test_data, people_data)
)

In [272]:
# Attach each person's attributes to their activity rows with a left join on
# 'people_id'. Column names present in both frames receive pandas' default
# '_x' (activity) / '_y' (people) suffixes.
# `left_index=True` is not passed: the join key is fully specified by `on`,
# and current pandas raises MergeError when `on` is combined with
# `left_index`/`right_index`.
train = act_train_data.merge(people_data, on='people_id', how='left')
test = act_test_data.merge(people_data, on='people_id', how='left')

In [273]:
# Display the merged column names; the '_x' / '_y' suffixes mark names that
# existed in both the activity frame and the people frame.
train.columns

Index(['people_id', 'activity_id', 'activity_category', 'char_1_x', 'char_2_x',
       'char_3_x', 'char_4_x', 'char_5_x', 'char_6_x', 'char_7_x', 'char_8_x',
       'char_9_x', 'char_10_x', 'outcome', 'char_1_y', 'group_1', 'char_2_y',
       'char_3_y', 'char_4_y', 'char_5_y', 'char_6_y', 'char_7_y', 'char_8_y',
       'char_9_y', 'char_10_y', 'char_11', 'char_12', 'char_13', 'char_14',
       'char_15', 'char_16', 'char_17', 'char_18', 'char_19', 'char_20',
       'char_21', 'char_22', 'char_23', 'char_24', 'char_25', 'char_26',
       'char_27', 'char_28', 'char_29', 'char_30', 'char_31', 'char_32',
       'char_33', 'char_34', 'char_35', 'char_36', 'char_37', 'char_38'],
      dtype='object')

In [274]:
# The raw frames are no longer referenced once merged; unbind all three names.
del act_train_data, act_test_data, people_data

In [275]:
# Order both frames by person in ascending order (row labels are kept as-is).
train = train.sort_values(by='people_id')
test = test.sort_values(by='people_id')

In [276]:
train_columns = train.columns.values
test_columns = test.columns.values
# Columns shared by train and test, listed in train's column order.
# A plain set intersection yields an order that can differ from one
# interpreter run to the next (string hashing is randomised), which would
# make the downstream feature-matrix layout non-reproducible.
test_column_names = set(test_columns)
features = [name for name in train_columns if name in test_column_names]

In [277]:
# Number of columns shared by train and test. The list is named `features`;
# `feature` is never defined in this notebook and fails on a fresh kernel.
len(features)

52

In [278]:
# Shape of the merged training frame (still includes the 'outcome' column here).
train.shape

(2197291, 53)

In [279]:
# Shape of the merged test frame.
test.shape

(498687, 52)

In [280]:
# Replace any missing value with the string 'NA', then separate the label
# column from the training features.
# NOTE(review): this cell is not re-runnable as written, because 'outcome'
# no longer exists in `train` after the first execution.
train = train.fillna('NA')
test = test.fillna('NA')
y = train['outcome']
train = train.drop(['outcome'], axis='columns')

In [281]:
# Stack train on top of test with a fresh 0..n-1 row index so that the
# rare-value replacement below sees the values of both sets at once.
whole = pd.concat([train,test], ignore_index=True)


In [282]:
def reduce_dimen(dataset, column, toreplace):
    """Replace every value that occurs exactly once in ``column``.

    Values seen only a single time in ``dataset[column]`` are overwritten with
    ``toreplace``, so all one-off levels collapse into one shared level.

    The replacement is a single vectorised assignment. It does not rely on
    ``Series.iteritems`` or ``DataFrame.set_value``, which are no longer
    available in current pandas releases.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to update. It is modified in place.
    column : str
        Name of the column to process.
    toreplace : scalar
        Value written over each one-off entry.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, for call chaining.
    """
    # duplicated(keep=False) flags every member of a repeated group, so its
    # negation selects the values that appear only once. A plain boolean array
    # makes the selection positional, independent of the row labels.
    occurs_once = (~dataset[column].duplicated(keep=False)).values
    dataset.loc[occurs_once, column] = toreplace
    return dataset

In [283]:
# Columns treated as categorical. In each of them, values that occur only once
# across train+test are collapsed into the single level 9999999.
categorical = [
    'group_1', 'activity_category',
    'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x', 'char_5_x',
    'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x',
    'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
    'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y',
]
for category in categorical:
    whole = reduce_dimen(whole, category, 9999999)

  after removing the cwd from sys.path.


In [284]:
# Split the stacked frame back into its train and test parts by position.
# Explicit copies are taken because columns of these frames are overwritten
# later; assigning into a bare slice of `whole` triggers pandas'
# SettingWithCopy warning and leaves it unclear which object is modified.
n_train_rows = len(train)
X = whole.iloc[:n_train_rows].copy()
X_test = whole.iloc[n_train_rows:].copy()

In [285]:
# `train` and `whole` are not used again after the split; unbind both names.
del train, whole

In [286]:
# Shape of the training part after the split.
X.shape

(2197291, 52)

In [287]:
# Shape of the test part after the split.
X_test.shape

(498687, 52)

In [288]:
# X must stay row-aligned with `y`. `y` was taken from `train` after it had
# been sorted by people_id, and X still has exactly that row order, so it is
# NOT sorted again here. The default sort algorithm is not stable, so a second
# sort may permute rows that share a people_id and silently pair activities
# with the wrong labels.
# Keep only the columns shared with the test set, minus the two identifiers
# and 'char_1_y'.
X = X[features].drop(['people_id', 'activity_id', 'char_1_y'], axis=1)

In [289]:
#list categorical features and label them
from sklearn.preprocessing import LabelEncoder

In [293]:
# Columns to label-encode and then one-hot encode; every other column of X is
# passed through unchanged.
categorical = [
    'group_1', 'activity_category',
    'char_1_x', 'char_2_x', 'char_3_x', 'char_4_x', 'char_5_x',
    'char_6_x', 'char_7_x', 'char_8_x', 'char_9_x',
    'char_2_y', 'char_3_y', 'char_4_y', 'char_5_y',
    'char_6_y', 'char_7_y', 'char_8_y', 'char_9_y',
]
not_categorical = []
for category in X.columns:
    if category not in categorical:
        not_categorical.append(category)
        continue
    # Fit the encoder on train and test values together so that both sets
    # share one integer code per level.
    temp = pd.concat([X[category], X_test[category]])
    le = LabelEncoder().fit(temp.values)
    X[category] = le.transform(X[category].values)
    X_test[category] = le.transform(X_test[category].values)

# One-hot encode the integer codes; the encoder is likewise fitted on the
# union of train and test rows.
enc = OneHotEncoder(handle_unknown='ignore').fit(
    pd.concat([X[categorical], X_test[categorical]])
)
X_cat_sparse = enc.transform(X[categorical])
X_test_cat_sparse = enc.transform(X_test[categorical])

In [294]:
# Shapes of the pass-through and categorical column groups, for train and then test.
X[not_categorical].shape, X[categorical].shape, X_test[not_categorical].shape, X_test[categorical].shape



((2197291, 30), (2197291, 19), (498687, 30), (498687, 19))

In [295]:
from scipy.sparse import hstack

# Final design matrices: the pass-through columns side by side with the
# one-hot encoded categorical columns.
X_sparse = hstack((X[not_categorical], X_cat_sparse))
X_test_sparse = hstack((X_test[not_categorical], X_test_cat_sparse))

print("Training data: " + str(X_sparse.shape))
print("Test data: " + str(X_test_sparse.shape))
print("###########")
print("One Hot enconded Test Dataset Script")

Training data: (2197291, 31263)
Test data: (498687, 31263)
###########
One Hot enconded Test Dataset Script


In [296]:
# Wrap the sparse matrices for XGBoost; only the training matrix carries labels.
dtrain = xgb.DMatrix(X_sparse, label=y)
dtest = xgb.DMatrix(X_test_sparse)

# Booster configuration.
# NOTE(review): 'max_depth', 'subsample', 'colsample_bytree' and
# 'min_child_weight' are documented by XGBoost as tree-booster parameters,
# while 'booster' selects the linear booster. Confirm which one is intended.
param = {
    'max_depth': 10,
    'eta': 0.02,
    'silent': 0,
    'objective': 'binary:logistic',
    # 'nthread': 2,
    'eval_metric': 'auc',
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 0,
    'booster': "gblinear",
}

  if getattr(data, 'base', None) is not None and \
  data.base is not None and isinstance(data, np.ndarray) \


In [None]:
np.random.seed(120)

# The only entry on the watch list is the training matrix, so the metric
# printed every 10 rounds (and the one early stopping looks at) is computed
# on the training data itself.
evals = [(dtrain, 'train')]
num_round = 305
bst = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=10,
)

In [None]:
ypred=bst.predict(dtest)
output=pd.DataFrame({'activity_id': test['activity_id'],'outcome':ypred})
output.head()
output.to_csv('without_leak.csv', index = False)
!zip subb.zip without_leak.csv