In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from joblib import load, dump

# Load files

In [2]:
# Selects which split's files are read and written: every path below is
# "../input/{mode}_...". The target column is attached only when mode == "train".
mode = "test"

In [3]:
# Read the "{mode}_cat_agg_features" table for the selected split.
# Presumably aggregates of the categorical columns (judging by the name) — confirm against the producer.
cat_agg_features = pd.read_parquet("../input/{}_cat_agg_features.parquet".format(mode))

In [4]:
# Read the "{mode}_base_stat_agg_features" table for the selected split.
base_stat_agg_features = pd.read_parquet("../input/{}_base_stat_agg_features.parquet".format(mode))

In [5]:
# Read the "{mode}_quantile_agg_features" table for the selected split.
quantile_agg_features = pd.read_parquet("../input/{}_quantile_agg_features.parquet".format(mode))

In [6]:
# Read the "{mode}_skew_agg_features" table for the selected split.
skew_agg_features = pd.read_parquet("../input/{}_skew_agg_features.parquet".format(mode))

In [7]:
# Read the "{mode}_kurtosis_agg_features" table for the selected split.
kurtosis_agg_features = pd.read_parquet("../input/{}_kurtosis_agg_features.parquet".format(mode))

In [8]:
# Read the "{mode}_sma_agg_features" table for the selected split.
sma_agg_features = pd.read_parquet("../input/{}_sma_agg_features.parquet".format(mode))

In [9]:
# Preview the first rows of sma_agg_features (rendered as the cell output).
sma_agg_features.head()

Unnamed: 0_level_0,P_2_nanmean_4,D_39_nanmean_4,B_1_nanmean_4,B_2_nanmean_4,R_1_nanmean_4,S_3_nanmean_4,D_41_nanmean_4,B_3_nanmean_4,D_42_nanmean_4,D_43_nanmean_4,...,D_136_nanmean_10,D_137_nanmean_10,D_138_nanmean_10,D_139_nanmean_10,D_140_nanmean_10,D_141_nanmean_10,D_142_nanmean_10,D_143_nanmean_10,D_144_nanmean_10,D_145_nanmean_10
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223277493928322471,0.745556,0.0,0.004444,0.837778,0.0,0.196667,0.0,0.006667,,0.117778,...,,,,0.0,0.0,0.0,,0.0,0.006667,0.0
-9223220269070810982,0.841111,0.0,0.017778,0.836667,0.0,0.01,0.0,0.024444,,0.081667,...,,,,0.0,0.0,0.0,,0.0,0.006667,0.0
-9223219380479694318,0.781111,0.0,0.007778,0.875556,0.0,0.263333,0.0,0.011111,,0.11125,...,,,,0.0,0.0,0.0,,0.0,0.0,0.0
-9223202973368451495,0.73,0.0,0.005556,0.815556,0.0,0.163333,0.0,0.006667,,0.051111,...,,,,1.0,0.0,0.86,0.0,1.0,0.01,2.0
-9223190037945288673,0.953333,4.333333,0.037778,1.004444,0.0,0.156667,0.0,0.008889,,0.01,...,,,,0.0,0.0,0.0,,0.0,0.006667,0.0


In [10]:
# Join all feature tables side by side (axis=1). pandas aligns the rows on the
# index and, with the default outer join, fills NaN for any index value that is
# missing from one of the tables.
# An earlier version of this cell left sma_agg_features out of the list.
full_agg_features = pd.concat([
    cat_agg_features, 
    base_stat_agg_features,
    sma_agg_features,
    quantile_agg_features,
    skew_agg_features,
    kurtosis_agg_features
], axis=1)

In [11]:
# (rows, columns) of the concatenated table.
full_agg_features.shape

(924621, 3042)

# Add target

In [12]:
def add_target(df, labels_path="../input/train_labels.csv"):
    """Attach the ``target`` label to a feature table indexed by an integer customer key.

    The labels file identifies customers by hex strings. Each ID is reduced to
    its last 16 hex digits, parsed as an unsigned 64-bit integer and then
    reinterpreted as a signed 64-bit integer (values >= 2**63 wrap to negative
    numbers). The labels are then left-joined onto ``df`` by index.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose index holds the int64 customer key.
        NOTE(review): assumes the index was built with the same
        "last 16 hex digits -> int64" encoding — confirm against the code
        that produced the feature files.
    labels_path : str, optional
        CSV file with at least the columns ``customer_ID`` and ``target``.
        Defaults to the path that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the columns of ``df`` plus the columns of the
        labels file, with ``target`` stored as int8. ``df`` itself is not
        modified.

    Raises
    ------
    ValueError
        If at least one row of ``df`` has no matching label. Such rows could
        not be cast to int8 and previously failed with an opaque NaN-cast error.
    """
    # Force string parsing so that an ID consisting only of digits is never
    # read as a number (the .str accessor below requires strings).
    targets = pd.read_csv(labels_path, dtype={"customer_ID": str})

    # REDUCE DTYPE FOR CUSTOMER: keep the last 16 hex digits and store them as
    # int64. Going through uint64 makes the two's-complement wrap explicit
    # instead of relying on pandas' dtype inference.
    targets["customer_ID"] = (
        targets["customer_ID"]
        .str[-16:]
        .map(lambda hex_id: int(hex_id, 16))
        .astype("uint64")
        .astype("int64")
    )
    targets = targets.set_index("customer_ID")

    df = df.merge(targets, left_index=True, right_index=True, how="left")

    # A left join leaves NaN where no label exists; report that clearly
    # rather than letting the integer cast fail.
    unlabeled = df["target"].isna()
    if unlabeled.any():
        raise ValueError(
            "{} row(s) have no matching label in {}".format(int(unlabeled.sum()), labels_path)
        )
    df["target"] = df["target"].astype("int8")

    return df

In [13]:
# add_target joins labels read from train_labels.csv, so it is applied only
# when the train split is being processed.
if mode == "train":
    full_agg_features = add_target(full_agg_features)

# Label encoding

In [14]:
def label_encoding(df):
    """Replace the ``<name>_last`` categorical columns with integer codes.

    For each base feature listed below, the column ``"<name>_last"`` is
    overwritten with the output of a freshly fitted ``LabelEncoder``. The
    encoder is fit on the values present in ``df`` itself, so the codes
    depend on the data passed in.

    The frame is modified in place and also returned.
    """
    base_names = (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )

    for base_name in base_names:
        column = "{}_last".format(base_name)
        # One independent encoder per column; nothing is kept after the call.
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [15]:
# NOTE(review): the encoders are fit on the current split only; confirm that
# the train and test runs produce the same category -> code mapping.
full_agg_features = label_encoding(full_agg_features)

In [16]:
# Turn the index into a regular column (it appears as customer_ID in the preview below).
full_agg_features = full_agg_features.reset_index()

In [17]:
# Preview the assembled table after encoding and the index reset.
full_agg_features.head()

Unnamed: 0,customer_ID,B_30_count,B_30_last,B_30_nunique,B_38_count,B_38_last,B_38_nunique,D_114_count,D_114_last,D_114_nunique,...,D_136_kurtosis,D_137_kurtosis,D_138_kurtosis,D_139_kurtosis,D_140_kurtosis,D_141_kurtosis,D_142_kurtosis,D_143_kurtosis,D_144_kurtosis,D_145_kurtosis
0,-9223277493928322471,13,0,1,13,0,3,13,1,1,...,,,,0.0,0.0,0.0,,0.0,-2.363636,0.0
1,-9223220269070810982,13,0,1,13,1,1,13,0,1,...,,,,0.0,0.0,0.0,,0.0,-2.056364,0.0
2,-9223219380479694318,13,0,1,13,0,1,13,1,1,...,,,,0.0,0.0,0.0,,0.0,0.094545,0.0
3,-9223202973368451495,13,0,1,13,0,3,13,0,1,...,,,,-2.056364,0.0,-2.056364,-0.988693,-2.056364,0.094546,-2.056364
4,-9223190037945288673,13,0,1,13,1,1,13,1,1,...,,,,0.0,0.0,0.0,,0.0,0.094545,0.0


In [18]:
# Collect the column names so the next cells can check them for duplicates.
columns = full_agg_features.columns.tolist()

In [19]:
# Total number of columns, duplicates included.
len(columns)

3043

In [20]:
# Number of distinct column names; equal to len(columns) only when no name is duplicated.
len(set(columns))

3043

In [21]:
# Write the assembled table for the current split to "../input/{mode}_full_features.parquet".
full_agg_features.to_parquet("../input/{}_full_features.parquet".format(mode))