In [1]:
import os, subprocess
import pandas as pd
import numpy as np
from IPython.display import display

# Combine features

In [2]:
# Check which feature files are available in the features folder.
# The standard library is used instead of shelling out to `ls`, so the cell
# also runs where no `ls` binary exists and yields str names on Python 3
# (subprocess.check_output returns bytes there). Sorting makes the listing
# deterministic. Note: unlike plain `ls`, os.listdir also reports dotfiles.
sorted(os.listdir("features"))

['baseline_extend.pkl.bz2',
 'baseline.pkl.bz2',
 'bureau_balance.pkl.bz2',
 'bureau.pkl.bz2',
 'pdf_features_label.pkl.bz2']

In [3]:
# Feature files to join, in merge order; entry 0 is loaded first and the
# remaining entries are joined onto it.
ls_feat_file = ["baseline.pkl.bz2", "baseline_extend.pkl.bz2"]
ls_feat_file += ["bureau.pkl.bz2", "bureau_balance.pkl.bz2"]

In [4]:
%%time
# Build one wide feature table: the first file in ls_feat_file is the base,
# and every remaining file is left-joined onto it by SK_ID_CURR.
feat_path = os.path.join("features", ls_feat_file[0])
pdf_combined = pd.read_pickle(feat_path, compression="bz2")
n_base_rows = len(pdf_combined)

# join next features set
for fname in ls_feat_file[1:]:
    feat_path = os.path.join("features", fname)
    # pass the compression explicitly, consistent with the base load above
    pdf_feat = pd.read_pickle(feat_path, compression="bz2")
    pdf_combined = pdf_combined.merge(pdf_feat, on="SK_ID_CURR", how="left")
    # A left join keeps every base row, so the row count can only change when
    # SK_ID_CURR is duplicated in the joined feature set; fail loudly then
    # instead of silently multiplying rows.
    assert len(pdf_combined) == n_base_rows, (
        "%s changed the row count from %d to %d; is SK_ID_CURR duplicated?"
        % (fname, n_base_rows, len(pdf_combined))
    )

print("rows, columns", pdf_combined.shape)
# A bare `.head()` that is not the last expression of the cell is discarded,
# so render the preview explicitly.
display(pdf_combined.head())
# every column except the join key
ls_features = [feat for feat in pdf_combined.columns if feat not in ["SK_ID_CURR"]]

('rows, columns', (356255, 354))
CPU times: user 16.1 s, sys: 1.82 s, total: 17.9 s
Wall time: 11.7 s


# Join with labels

In [5]:
# Load the table holding SK_ID_CURR, TARGET and the tvt_code split marker,
# then show its shape and first rows.
tvt_path = "pdf_tvt_extend.pkl"
pdf_tvt = pd.read_pickle(tvt_path, compression="bz2")
print(pdf_tvt.shape)
display(pdf_tvt.head())

(356255, 3)


Unnamed: 0,SK_ID_CURR,TARGET,tvt_code
0,100002,1,train
1,100003,0,train
2,100004,0,train
3,100006,0,train
4,100007,0,train


In [6]:
# number of rows per tvt_code value
pdf_tvt.loc[:, "tvt_code"].value_counts()

train          216948
kaggle_test     48744
test            46127
val             44436
Name: tvt_code, dtype: int64

In [7]:
# Attach the combined features to the label table. pdf_tvt is the left side
# of the left join, so every row of pdf_tvt is kept even when no features
# match its SK_ID_CURR.
pdf_features_label = pdf_tvt.merge(pdf_combined, on="SK_ID_CURR", how="left")
# Guard against duplicated SK_ID_CURR in pdf_combined, which would silently
# repeat label rows in the merged output.
assert len(pdf_features_label) == len(pdf_tvt), (
    "merge changed the row count from %d to %d; is SK_ID_CURR duplicated in pdf_combined?"
    % (len(pdf_tvt), len(pdf_features_label))
)
print(pdf_features_label.shape)
# transpose the preview so the many feature columns read as rows
display(pdf_features_label.head().T)

(356255, 356)


Unnamed: 0,0,1,2,3,4
SK_ID_CURR,100002,100003,100004,100006,100007
TARGET,1,0,0,0,0
tvt_code,train,train,train,train,train
is_FLAG_EMP_PHONE,1,1,1,1,1
is_FLAG_WORK_PHONE,0,0,1,0,0
is_FLAG_PHONE,1,1,1,0,0
is_FLAG_EMAIL,0,0,0,0,0
is_REG_REGION_NOT_LIVE_REGION,0,0,0,0,0
is_REG_REGION_NOT_WORK_REGION,0,0,0,0,0
is_LIVE_REGION_NOT_WORK_REGION,0,0,0,0,0


In [8]:
%%time
# Write the merged label + feature table to the features folder as a
# bz2-compressed pickle.
out_path = os.path.join("features", "pdf_features_label.pkl.bz2")
pdf_features_label.to_pickle(out_path, compression="bz2")

CPU times: user 46.9 s, sys: 845 ms, total: 47.7 s
Wall time: 47.8 s
