In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import load, dump

# load files

In [2]:
# Which split to process; this value is substituted into the input and output file names.
mode = "train"

In [3]:
# Read the parquet file for the selected split (`mode` is "train" or "test").
df = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/{}_shifted_rounded.parquet".format(mode))

In [4]:
# Candidate feature columns: everything in `df` except the columns named in the set below.
all_cols = [
    c
    for c in df.columns
    if c not in {"customer_ID", "S_2", "first_occurance", "time_id", "end_year_month"}
]

# Columns kept out of the numeric aggregations (see `num_features`).
nan_related_features = ["number_of_observations", "type_0_nan", "type_1_nan", "type_2_nan"]

# Columns listed as categorical; also kept out of the numeric aggregations.
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Whatever remains is aggregated numerically; order follows `all_cols`.
num_features = [
    col
    for col in all_cols
    if col not in cat_features and col not in nan_related_features
]

# base stat

In [5]:
# Per-customer summary statistics for every numeric feature. The aggregation yields
# two-level (feature, statistic) columns, flattened below to "<feature>_<statistic>".
base_stat_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(["mean", "std", "min", "max", "count", "last"])
)

base_stat_agg_features.columns = list(map("_".join, base_stat_agg_features.columns))

In [6]:
def count_unique(series):
    """Return how many distinct values ``series`` contains, as found by ``np.unique``."""
    distinct_values = np.unique(series)
    return distinct_values.size

def count_duplicate_max(x):
    """Return the number of entries in ``x`` that are equal to the maximum of ``x``."""
    peak = np.max(x)
    is_peak = x == peak
    return np.sum(is_peak)

def max_over_min(series):
    """Return the ratio of the largest to the smallest value in ``series``.

    Returns 0 when ``series`` has fewer than two entries, and NaN when the
    smallest value is exactly zero (the ratio would be undefined).
    """
    if len(series) < 2:
        return 0

    smallest = np.min(series)
    if smallest == 0:
        return np.nan

    return np.max(series) / smallest

In [7]:
# For each customer, apply `count_unique` to every numeric feature column.
count_unique_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(count_unique)
)
# Tag every column with the name of the aggregation that produced it.
count_unique_agg_features.columns = count_unique_agg_features.columns + "_count_unique"

In [None]:
# For each customer, apply `count_duplicate_max` to every numeric feature column.
count_duplicate_max_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(count_duplicate_max)
)
# Tag every column with the name of the aggregation that produced it.
count_duplicate_max_agg_features.columns = (
    count_duplicate_max_agg_features.columns + "_count_duplicate_max"
)

In [None]:
# For each customer, apply `max_over_min` to every numeric feature column.
max_over_min_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(max_over_min)
)
# Tag every column with the name of the aggregation that produced it.
max_over_min_agg_features.columns = max_over_min_agg_features.columns + "_max_over_min"

In [None]:
# Place all per-customer aggregate tables side by side (column-wise).
# NOTE: this rebinds `base_stat_agg_features` to a frame that includes its previous
# value, so re-running this cell on its own appends the three extra tables again.
base_stat_agg_features = pd.concat(
    [
        base_stat_agg_features,
        count_unique_agg_features,
        count_duplicate_max_agg_features,
        max_over_min_agg_features,
    ],
    axis="columns",
)

In [None]:
# Preview the first rows of the combined feature table.
base_stat_agg_features.head()

In [None]:
# Write the combined features for the current `mode` to a parquet file.
# NOTE(review): the target is under ../input — confirm that directory is writable where this runs.
base_stat_agg_features.to_parquet("../input/{}_base_stat_agg_features.parquet".format(mode))

In [None]:
# Echo the split that was just processed.
mode