In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from joblib import load, dump

# Load files

In [2]:
# Dataset split to process. The value is substituted into every input and
# output file name below, and labels are only attached when it is "train".
mode = "train"

In [3]:
# Precomputed `<mode>_cat_agg_features` table.
cat_agg_features = pd.read_parquet(
    path="../input/{}_cat_agg_features.parquet".format(mode)
)

In [4]:
# Precomputed `<mode>_basic_agg_features` table.
basic_agg_features = pd.read_parquet(
    path="../input/{}_basic_agg_features.parquet".format(mode)
)

In [5]:
# Precomputed `<mode>_count_agg_features` table.
count_agg_features = pd.read_parquet(
    path="../input/{}_count_agg_features.parquet".format(mode)
)

In [6]:
# Precomputed `<mode>_sma_agg_features` table.
sma_agg_features = pd.read_parquet(
    path="../input/{}_sma_agg_features.parquet".format(mode)
)

In [7]:
# Precomputed `<mode>_mean_diff_agg_features` table.
mean_diff_agg_features = pd.read_parquet(
    path="../input/{}_mean_diff_agg_features.parquet".format(mode)
)

In [8]:
# Precomputed `<mode>_std_diff_agg_features` table.
std_diff_agg_features = pd.read_parquet(
    path="../input/{}_std_diff_agg_features.parquet".format(mode)
)

In [9]:
# Precomputed `<mode>_max_drawdown_agg_features` table.
max_drawdown_agg_features = pd.read_parquet(
    path="../input/{}_max_drawdown_agg_features.parquet".format(mode)
)

In [10]:
# Precomputed `<mode>_max_drawup_agg_features` table.
max_drawup_agg_features = pd.read_parquet(
    path="../input/{}_max_drawup_agg_features.parquet".format(mode)
)

In [11]:
# Precomputed `<mode>_drawdown_duration_agg_features` table.
drawdown_duration_agg_features = pd.read_parquet(
    path="../input/{}_drawdown_duration_agg_features.parquet".format(mode)
)

In [12]:
# Precomputed `<mode>_drawup_duration_agg_features` table.
drawup_duration_agg_features = pd.read_parquet(
    path="../input/{}_drawup_duration_agg_features.parquet".format(mode)
)

In [13]:
# Place every feature table side by side; pandas aligns the rows on the index.
# The order of this list determines the column order of the combined frame.
full_agg_features = pd.concat(
    objs=[
        cat_agg_features,
        basic_agg_features,
        count_agg_features,
        sma_agg_features,
        mean_diff_agg_features,
        std_diff_agg_features,
        max_drawup_agg_features,
        max_drawdown_agg_features,
        drawdown_duration_agg_features,
        drawup_duration_agg_features,
    ],
    axis="columns",
)

In [14]:
# Sanity check: (rows, columns) of the combined feature table.
full_agg_features.shape

(458913, 3396)

In [15]:
# Preview the first rows of the combined table.
full_agg_features.head()

Unnamed: 0_level_0,B_30_count,B_30_last,B_30_nunique,B_38_count,B_38_last,B_38_nunique,D_114_count,D_114_last,D_114_nunique,D_116_count,...,D_136_drawup_duration,D_137_drawup_duration,D_138_drawup_duration,D_139_drawup_duration,D_140_drawup_duration,D_141_drawup_duration,D_142_drawup_duration,D_143_drawup_duration,D_144_drawup_duration,D_145_drawup_duration
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223358381327749917,13,0.0,2,13,7.0,3,13,1.0,1,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
-9223193039457028513,13,0.0,1,13,1.0,1,13,1.0,1,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-9223189665817919541,13,0.0,1,13,1.0,1,13,0.0,1,13,...,0.0,0.0,0.0,0.0,0.0,7.0,12.0,0.0,9.0,0.0
-9223188534444851899,13,0.0,1,13,1.0,1,13,0.0,1,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
-9223173911659837606,13,1.0,1,13,7.0,2,13,1.0,2,13,...,4.0,4.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


# Add target

In [16]:
def add_target(df, labels_path="../input/train_labels.csv"):
    """Attach the ``target`` label column to an index-keyed feature frame.

    The labels CSV must provide a ``customer_ID`` column of hexadecimal
    strings and a ``target`` column. Each ID is reduced to its last 16 hex
    digits, parsed as an unsigned 64-bit integer and reinterpreted as a
    signed ``int64`` (IDs whose top bit is set therefore become negative).
    The labels are then left-joined onto ``df`` by index.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose index holds the customer key. It is assumed to
        have been built with the same hex-to-int64 conversion -- TODO confirm
        against the notebooks that produce the feature files.
    labels_path : str, optional
        Location of the labels CSV. Defaults to the path this notebook has
        always used.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the columns of ``df`` plus ``target`` cast to
        ``int8``. Row order follows ``df``.

    Raises
    ------
    ValueError
        If a row of ``df`` has no matching label: the resulting NaN cannot be
        cast to ``int8``.
    """
    targets = pd.read_csv(labels_path)

    # Shrink the customer key: keep only the last 16 hex digits (64 bits).
    hex_suffix = targets["customer_ID"].str[-16:]

    # Parse as unsigned 64-bit first, then reinterpret the same bits as int64.
    # Doing the wrap-around explicitly avoids relying on pandas dtype
    # inference for Python ints that exceed the signed 64-bit range.
    unsigned_ids = np.array([int(h, 16) for h in hex_suffix], dtype=np.uint64)
    targets["customer_ID"] = unsigned_ids.view(np.int64)

    targets = targets.set_index("customer_ID")

    # Left join keeps every feature row, in its original order.
    df = df.merge(targets, left_index=True, right_index=True, how="left")
    df["target"] = df["target"].astype("int8")

    return df

In [17]:
# Labels are only attached when processing the training split.
if mode == "train":
    full_agg_features = add_target(full_agg_features)

# Label encoding

In [None]:
def label_encoding(df):
    """Replace each categorical ``<name>_last`` column with integer codes.

    A fresh ``LabelEncoder`` is fitted per column on the values present in
    ``df``; the fitted encoders are not kept. The columns are overwritten on
    the frame that was passed in, and that same frame is returned.
    """
    base_names = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    for base_name in base_names:
        # Only the "<name>_last" column of each categorical field is encoded.
        column = "{}_last".format(base_name)
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [None]:
# Encode the categorical `*_last` columns; label_encoding overwrites them on
# the frame it receives and returns that same frame.
full_agg_features = label_encoding(full_agg_features)

In [None]:
# Turn the index into a regular column so it is written out with the data.
full_agg_features = full_agg_features.reset_index()

In [None]:
# Preview the table after encoding and the index reset.
full_agg_features.head()

In [None]:
# Column names as a plain list, used by the duplicate-name check below.
columns = full_agg_features.columns.tolist()

In [None]:
# Total number of columns.
len(columns)

In [None]:
# Number of distinct column names; a value smaller than len(columns) means
# the concatenation produced duplicate names.
len(set(columns))

In [None]:
# Persist the combined feature table for the selected mode.
full_agg_features.to_parquet("../input/{}_full_features.parquet".format(mode))