In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import load, dump

# load files

In [2]:
# Dataset split to process; it is substituted into the input and output file names below.
mode = "test"

In [3]:
# Load the split selected by `mode` (train or test).
df = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/{}_shifted_rounded.parquet".format(mode))

In [4]:
# Every column except identifiers and date/time bookkeeping columns.
all_cols = [
    c
    for c in df.columns
    if c not in ("customer_ID", "S_2", "first_occurance", "time_id", "end_year_month")
]

# Columns that are excluded from the numeric feature list together with the
# categorical ones.
nan_related_features = ["number_of_observations", "type_0_nan", "type_1_nan", "type_2_nan"]

# Columns handled as categorical features.
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Whatever remains is treated as a numeric feature.
num_features = [
    col
    for col in all_cols
    if col not in cat_features and col not in nan_related_features
]

# max drawdown

In [5]:
def max_drawdown(series):
    """Return the largest peak-to-trough drop found in ``series``.

    The trough is the position where the gap between the running maximum and
    the current value is largest; the peak is the highest value located
    strictly before that position.

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric values, scanned in the given order.
        NaN values are not treated specially.

    Returns
    -------
    number
        ``peak - trough``. ``0`` when fewer than two values are given, and
        ``np.nan`` when the largest gap is located at the first position
        (for example, a series that never falls below its running maximum).
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0
    # Narrow integer dtypes (int8, int16, uint8, ...) silently wrap around on
    # subtraction and booleans do not support it at all, so widen them first.
    if values.dtype.kind in "biu":
        values = values.astype(np.int64)

    trough_idx = np.argmax(np.maximum.accumulate(values) - values)
    if trough_idx == 0:
        # Nothing precedes the trough, so there is no peak to measure from.
        # (np.nan rather than np.NaN: the alias was removed in NumPy 2.0.)
        return np.nan
    return np.max(values[:trough_idx]) - values[trough_idx]

def max_drawup(series):
    """Return the largest trough-to-peak rise found in ``series``.

    Computed as the maximum drawdown of the negated series: the position with
    the largest gap above the running minimum is located, and the rise is
    measured from the lowest value strictly before that position.

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric values, scanned in the given order.
        NaN values are not treated specially.

    Returns
    -------
    number
        The size of the rise. ``0`` when fewer than two values are given, and
        ``np.nan`` when the largest gap is located at the first position
        (for example, a series that never rises above its running minimum).
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0
    # Negating an unsigned array wraps around (and narrow signed integers can
    # overflow on subtraction), so widen integer/bool input before negating.
    if values.dtype.kind in "biu":
        values = values.astype(np.int64)

    negated = -values
    peak_idx = np.argmax(np.maximum.accumulate(negated) - negated)
    if peak_idx == 0:
        # Nothing precedes the peak, so there is no low point to measure from.
        # (np.nan rather than np.NaN: the alias was removed in NumPy 2.0.)
        return np.nan
    return np.max(negated[:peak_idx]) - negated[peak_idx]

def drawdown_duration(series):
    """Return the number of steps between the peak and the trough of the largest drawdown.

    The trough is the position where the gap between the running maximum and
    the current value is largest; the peak is the first occurrence of the
    highest value strictly before that position.

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric values, scanned in the given order.
        NaN values are not treated specially.

    Returns
    -------
    int
        ``trough_index - peak_index``. ``0`` when fewer than two values are
        given or when the largest gap is located at the first position.
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0
    # Narrow integer dtypes wrap around on subtraction, which would make
    # argmax pick the wrong trough; widen integer/bool input first.
    if values.dtype.kind in "biu":
        values = values.astype(np.int64)

    trough_idx = np.argmax(np.maximum.accumulate(values) - values)
    if trough_idx == 0:
        # No earlier value exists, so peak and trough coincide.
        return 0
    peak_idx = np.argmax(values[:trough_idx])
    return trough_idx - peak_idx

def drawup_duration(series):
    """Return the number of steps between the low point and the peak of the largest drawup.

    Computed on the negated series: the position with the largest gap above
    the running minimum is located, and the duration is counted from the first
    occurrence of the lowest value strictly before that position.

    Parameters
    ----------
    series : array-like
        One-dimensional sequence of numeric values, scanned in the given order.
        NaN values are not treated specially.

    Returns
    -------
    int
        ``peak_index - low_index``. ``0`` when fewer than two values are given
        or when the largest gap is located at the first position.
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0
    # Negating an unsigned array wraps around (and narrow signed integers can
    # overflow on subtraction), so widen integer/bool input before negating.
    if values.dtype.kind in "biu":
        values = values.astype(np.int64)

    negated = -values
    peak_idx = np.argmax(np.maximum.accumulate(negated) - negated)
    if peak_idx == 0:
        # No earlier value exists, so low point and peak coincide.
        return 0
    low_idx = np.argmax(negated[:peak_idx])
    return peak_idx - low_idx

In [None]:
# Apply max_drawdown to every numeric feature per customer and tag each
# resulting column with the "_max_drawdown" suffix.
max_drawdown_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(max_drawdown)
    .add_suffix("_max_drawdown")
)

In [None]:
# Preview the first rows of the max-drawdown features.
max_drawdown_agg_features.head()

In [None]:
# Write the max-drawdown features to parquet; the file name is prefixed with `mode`.
max_drawdown_agg_features.to_parquet("../input/{}_max_drawdown_agg_features.parquet".format(mode))

In [None]:
# Apply max_drawup to every numeric feature per customer and tag each
# resulting column with the "_max_drawup" suffix.
max_drawup_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(max_drawup)
    .add_suffix("_max_drawup")
)

In [None]:
# Preview the first rows of the max-drawup features.
max_drawup_agg_features.head()

In [None]:
# Write the max-drawup features to parquet; the file name is prefixed with `mode`.
max_drawup_agg_features.to_parquet("../input/{}_max_drawup_agg_features.parquet".format(mode))

In [6]:
# Apply drawdown_duration to every numeric feature per customer and tag each
# resulting column with the "_drawdown_duration" suffix.
drawdown_duration_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(drawdown_duration)
    .add_suffix("_drawdown_duration")
)

In [7]:
# Preview the first rows of the drawdown-duration features.
drawdown_duration_agg_features.head()

Unnamed: 0_level_0,P_2_drawdown_duration,D_39_drawdown_duration,B_1_drawdown_duration,B_2_drawdown_duration,R_1_drawdown_duration,S_3_drawdown_duration,D_41_drawdown_duration,B_3_drawdown_duration,D_42_drawdown_duration,D_43_drawdown_duration,...,D_136_drawdown_duration,D_137_drawdown_duration,D_138_drawdown_duration,D_139_drawdown_duration,D_140_drawdown_duration,D_141_drawdown_duration,D_142_drawdown_duration,D_143_drawdown_duration,D_144_drawdown_duration,D_145_drawdown_duration
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223277493928322471,1.0,1,4.0,3.0,0.0,0.0,0.0,3.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-9223220269070810982,8.0,0,1.0,1.0,0.0,6.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
-9223219380479694318,1.0,1,2.0,1.0,0.0,7.0,0.0,7.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-9223202973368451495,2.0,2,2.0,6.0,0.0,2.0,0.0,4.0,0.0,12.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
-9223190037945288673,2.0,1,1.0,1.0,0.0,5.0,0.0,2.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0


In [8]:
# Write the drawdown-duration features to parquet; the file name is prefixed with `mode`.
drawdown_duration_agg_features.to_parquet("../input/{}_drawdown_duration_agg_features.parquet".format(mode))

In [9]:
# Apply drawup_duration to every numeric feature per customer and tag each
# resulting column with the "_drawup_duration" suffix.
drawup_duration_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(drawup_duration)
    .add_suffix("_drawup_duration")
)

In [10]:
# Preview the first rows of the drawup-duration features.
drawup_duration_agg_features.head()

Unnamed: 0_level_0,P_2_drawup_duration,D_39_drawup_duration,B_1_drawup_duration,B_2_drawup_duration,R_1_drawup_duration,S_3_drawup_duration,D_41_drawup_duration,B_3_drawup_duration,D_42_drawup_duration,D_43_drawup_duration,...,D_136_drawup_duration,D_137_drawup_duration,D_138_drawup_duration,D_139_drawup_duration,D_140_drawup_duration,D_141_drawup_duration,D_142_drawup_duration,D_143_drawup_duration,D_144_drawup_duration,D_145_drawup_duration
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223277493928322471,5.0,3,2.0,1.0,0.0,0.0,0.0,4.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-9223220269070810982,1.0,0,7.0,8.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0
-9223219380479694318,10.0,3,4.0,2.0,0.0,5.0,0.0,4.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
-9223202973368451495,10.0,0,1.0,1.0,0.0,5.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,5.0,0.0,5.0,0.0,5.0,1.0,5.0
-9223190037945288673,3.0,12,4.0,5.0,0.0,4.0,0.0,10.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0


In [11]:
# Write the drawup-duration features to parquet; the file name is prefixed with `mode`.
drawup_duration_agg_features.to_parquet("../input/{}_drawup_duration_agg_features.parquet".format(mode))