In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import load, dump

# load files

In [2]:
# Which split to process; the value is substituted into the input and output parquet file names.
mode = "test"

In [3]:
# Read the parquet file for the split selected by `mode` (train or test).
df = pd.read_parquet(
    path="../input/amex-data-integer-dtypes-parquet-format/{}_shifted_rounded.parquet".format(mode)
)

In [4]:
# Candidate columns: everything in df except the five names excluded below.
all_cols = [
    c
    for c in df.columns
    if c not in ("customer_ID", "S_2", "first_occurance", "time_id", "end_year_month")
]

# Names set aside from the numeric list built at the end of this cell.
nan_related_features = ["number_of_observations", "type_0_nan", "type_1_nan", "type_2_nan"]
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Whatever remains after removing both lists above, in the original column order.
num_features = [
    col
    for col in all_cols
    if col not in cat_features and col not in nan_related_features
]

# max drawdown

In [5]:
def max_drawdown(series):
    """Return the largest decline from an earlier high observed along ``series``.

    The trough is the first position where the gap between the running
    maximum and the current value is largest. The result is the highest value
    seen before that position minus the value at that position.

    Parameters
    ----------
    series : array-like
        One-dimensional ordered sequence of numbers.

    Returns
    -------
    number
        ``0`` when ``series`` has fewer than two elements; ``nan`` when the
        largest gap is found at the first position (e.g. a series that never
        drops below an earlier value); otherwise the size of the decline.
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0

    # Narrow integer dtypes wrap around on subtraction (int8: 100 - (-100)
    # gives -56), which corrupts both the trough search and the result, so
    # widen them before doing any arithmetic.
    if values.dtype.kind in "biu" and values.dtype.itemsize < 8:
        values = values.astype(np.int64)

    trough = np.argmax(np.maximum.accumulate(values) - values)
    if trough == 0:
        # Nothing precedes the trough, so there is no high to measure from.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return np.max(values[:trough]) - values[trough]

def max_drawup(series):
    """Return the largest rise above an earlier low observed along ``series``.

    The peak is the first position where the gap between the current value
    and the running minimum is largest. The result is the value at that
    position minus the lowest value seen before it.

    Parameters
    ----------
    series : array-like
        One-dimensional ordered sequence of numbers.

    Returns
    -------
    number
        ``0`` when ``series`` has fewer than two elements; ``nan`` when the
        largest gap is found at the first position (e.g. a series that never
        rises above an earlier value); otherwise the size of the rise.
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0

    # Narrow integer dtypes wrap around on subtraction (int8: 100 - (-100)
    # gives -56), so widen them before doing any arithmetic.
    if values.dtype.kind in "biu" and values.dtype.itemsize < 8:
        values = values.astype(np.int64)

    # Measure the rise over the running minimum directly rather than negating
    # the data and reusing the drawdown formula: negation wraps around for
    # unsigned integers and for the most negative signed value.
    peak = np.argmax(values - np.minimum.accumulate(values))
    if peak == 0:
        # Nothing precedes the peak, so there is no low to measure from.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        return np.nan
    return values[peak] - np.min(values[:peak])

def drawdown_duration(series):
    """Return how many steps separate the pre-trough high from the trough.

    The trough is the first position where the gap between the running
    maximum and the current value is largest. The high is the first position
    of the largest value before the trough.

    Parameters
    ----------
    series : array-like
        One-dimensional ordered sequence of numbers.

    Returns
    -------
    int
        ``0`` when ``series`` has fewer than two elements or when the largest
        gap is found at the first position; otherwise the index of the trough
        minus the index of the preceding high.
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0

    # Narrow integer dtypes wrap around on subtraction (int8: 127 - (-128)
    # gives -1), which would make argmax pick the wrong trough, so widen them
    # before doing any arithmetic.
    if values.dtype.kind in "biu" and values.dtype.itemsize < 8:
        values = values.astype(np.int64)

    trough = np.argmax(np.maximum.accumulate(values) - values)
    if trough == 0:
        # Nothing precedes the trough, so the duration is zero.
        return 0
    return trough - np.argmax(values[:trough])

def drawup_duration(series):
    """Return how many steps separate the pre-peak low from the peak.

    The peak is the first position where the gap between the current value
    and the running minimum is largest. The low is the first position of the
    smallest value before the peak.

    Parameters
    ----------
    series : array-like
        One-dimensional ordered sequence of numbers.

    Returns
    -------
    int
        ``0`` when ``series`` has fewer than two elements or when the largest
        gap is found at the first position; otherwise the index of the peak
        minus the index of the preceding low.
    """
    values = np.asarray(series)
    if len(values) < 2:
        return 0

    # Narrow integer dtypes wrap around on subtraction (int8: 100 - (-100)
    # gives -56), so widen them before doing any arithmetic.
    if values.dtype.kind in "biu" and values.dtype.itemsize < 8:
        values = values.astype(np.int64)

    # Work with the running minimum directly rather than negating the data:
    # negation wraps around for unsigned integers (uint8: -1 becomes 255) and
    # would make argmax pick the wrong positions.
    peak = np.argmax(values - np.minimum.accumulate(values))
    if peak == 0:
        # Nothing precedes the peak, so the duration is zero.
        return 0
    return peak - np.argmin(values[:peak])

In [None]:
# Evaluate max_drawdown on the numeric columns within each customer_ID group.
max_drawdown_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(max_drawdown)
)
# Tag every output column with the statistic it holds.
max_drawdown_agg_features.columns = max_drawdown_agg_features.columns.map(
    lambda name: name + "_max_drawdown"
)

In [None]:
# Preview the first five rows of the max-drawdown feature table.
max_drawdown_agg_features.head(n=5)

In [None]:
# Write the max-drawdown features to parquet; the file name is prefixed with `mode`.
max_drawdown_agg_features.to_parquet(
    "../input/{}_max_drawdown_agg_features.parquet".format(mode)
)

In [None]:
# Evaluate max_drawup on the numeric columns within each customer_ID group.
max_drawup_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(max_drawup)
)
# Tag every output column with the statistic it holds.
max_drawup_agg_features.columns = max_drawup_agg_features.columns.map(
    lambda name: name + "_max_drawup"
)

In [None]:
# Preview the first five rows of the max-drawup feature table.
max_drawup_agg_features.head(n=5)

In [None]:
# Write the max-drawup features to parquet; the file name is prefixed with `mode`.
max_drawup_agg_features.to_parquet(
    "../input/{}_max_drawup_agg_features.parquet".format(mode)
)

In [None]:
# Evaluate drawdown_duration on the numeric columns within each customer_ID group.
drawdown_duration_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(drawdown_duration)
)
# Tag every output column with the statistic it holds.
drawdown_duration_agg_features.columns = drawdown_duration_agg_features.columns.map(
    lambda name: name + "_drawdown_duration"
)

In [None]:
# Preview the first five rows of the drawdown-duration feature table.
drawdown_duration_agg_features.head(n=5)

In [None]:
# Write the drawdown-duration features to parquet; the file name is prefixed with `mode`.
drawdown_duration_agg_features.to_parquet(
    "../input/{}_drawdown_duration_agg_features.parquet".format(mode)
)

In [None]:
# Evaluate drawup_duration on the numeric columns within each customer_ID group.
drawup_duration_agg_features = (
    df.groupby("customer_ID")[num_features]
    .agg(drawup_duration)
)
# Tag every output column with the statistic it holds.
drawup_duration_agg_features.columns = drawup_duration_agg_features.columns.map(
    lambda name: name + "_drawup_duration"
)

In [None]:
# Preview the first five rows of the drawup-duration feature table.
drawup_duration_agg_features.head(n=5)

In [None]:
# Write the drawup-duration features to parquet; the file name is prefixed with `mode`.
drawup_duration_agg_features.to_parquet(
    "../input/{}_drawup_duration_agg_features.parquet".format(mode)
)