In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from joblib import load, dump

# load files

In [None]:
def load_file(
    path="", 
    usecols=None
):
    """Read a parquet file and normalise its key columns.

    Parameters
    ----------
    path : str
        Location of the parquet file, forwarded to ``pd.read_parquet``.
    usecols : list of str, optional
        Columns to read. ``None`` (the default) reads every column. The
        selection must contain ``customer_ID`` and ``S_2``.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``customer_ID`` and then ``S_2`` with a fresh
        0..n-1 index. ``customer_ID`` is the integer value of the last 16
        hexadecimal characters of the original id, cast to ``int64``;
        ``S_2`` is converted with ``pd.to_datetime``.
    """
    # LOAD DATAFRAME (columns=None is read_parquet's default: read everything)
    df = pd.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Keep the last 16 hex characters of the id and parse them as base-16.
    df["customer_ID"] = df["customer_ID"].str[-16:].map(lambda x: int(x, 16))
    df["customer_ID"] = df["customer_ID"].astype("int64")
    df["S_2"] = pd.to_datetime(df["S_2"])

    # SORT BY CUSTOMER AND DATE (so agg("last") works correctly)
    df = df.sort_values(["customer_ID", "S_2"])
    df = df.reset_index(drop=True)

    print("shape of data:", df.shape)

    return df

In [None]:
# Load the train and test parquet files through load_file; both frames come
# back sorted by customer_ID and S_2.
print("Reading data...")
TRAIN_PATH = "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
train = load_file(path = TRAIN_PATH)

TEST_PATH = "../input/amex-data-integer-dtypes-parquet-format/test.parquet"
test = load_file(path = TEST_PATH)

In [None]:
# Names of the integer-typed columns of train (customer_ID excluded), persisted
# with joblib so a later session can reload them.
integer_dtypes = [np.int8, np.int16, np.int32, np.int64]
int_cols = [
    col
    for col in train.select_dtypes(include=integer_dtypes).columns.tolist()
    if col != "customer_ID"
]
dump(int_cols, "int_cols.pkl")

In [None]:
# Inspect how often each value of D_39 occurs in train.
train["D_39"].value_counts()

In [None]:
# revert to nan
# Every cell equal to -1, in any column of the frame, is replaced with NaN.
# NOTE(review): presumably -1 is the missing-value marker of the integer-typed
# parquet export — confirm that no column holds a legitimate -1.
train[train==-1] = np.nan
test[test==-1] = np.nan

In [None]:
# Preview the first rows of train after the -1 -> NaN replacement.
train.head()

In [None]:
# (rows, columns) of train at this point.
train.shape

# add number of observations

In [None]:
def add_observation(df):
    """Attach each customer's row count as ``number_of_observations``.

    The column is written onto ``df`` itself and that same frame is returned.
    """
    customer_ids = df.groupby("customer_ID")["customer_ID"]
    df["number_of_observations"] = customer_ids.transform("count")
    return df

In [None]:
# Add the per-customer row count to both frames.
train = add_observation(train)
test = add_observation(test)

# add first occurrence flag

In [None]:
def add_first_occurance(df):
    """Flag the first row of every customer.

    Adds an int64 column ``first_occurance`` that is 1 on the first row (in
    the frame's current row order) of each ``customer_ID`` and 0 on all later
    rows of that customer. The frame is expected to be sorted so that the
    first row is the earliest statement.

    The flag is computed positionally, so it does not depend on the index
    labels being unique, and no temporary ``index`` column is created (an
    existing column of that name is left alone).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``customer_ID`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame, with ``first_occurance`` added.
    """
    # duplicated() is False exactly on the first appearance of each id.
    # to_numpy() makes the assignment positional instead of label-aligned.
    is_first = ~df["customer_ID"].duplicated(keep="first")
    df["first_occurance"] = is_first.to_numpy().astype("int64")

    return df

In [None]:
# Flag the first row of every customer in both frames.
train = add_first_occurance(train)
test = add_first_occurance(test)

# process nan

In [None]:
# get nan clusters first
# Count missing values per column (the first two columns of train are skipped).
cols = sorted(train.columns[2:].tolist())
nas = train[cols].isna().sum(axis=0).reset_index(name="NA_count")

# For every column with at least one NaN, record how many columns share its exact NaN count.
columns_with_missing = nas.loc[nas["NA_count"] > 0]
nas["group_count"] = columns_with_missing.groupby("NA_count").transform("count")

# Keep the NaN counts shared by more than 10 columns; each one yields a list of column names,
# ordered by ascending NaN count.
large_groups = nas.loc[nas["group_count"] > 10].sort_values(["NA_count", "index"])
clusters = large_groups.groupby("NA_count")["index"].apply(list).values

In [None]:
# Column names in the cluster with the smallest shared NaN count.
clusters[0]

In [None]:
# List the integer-typed columns that belong to the third NaN cluster.
for col in clusters[2]:
    if col not in int_cols:
        continue
    print(col)

In [None]:
def process_type_0_nan(df, cluster):
    """Zero-fill a NaN cluster for every customer affected by it.

    A customer is affected when any of its rows has a NaN in the first column
    of ``cluster``. For all rows of those customers, NaNs in the ``cluster``
    columns become 0. Other customers are left untouched. ``df`` is modified
    in place and returned.
    """
    lead_column = cluster[0]
    affected_customers = df.loc[df[lead_column].isnull(), "customer_ID"].unique().tolist()

    affected_rows = df["customer_ID"].isin(affected_customers)
    df.loc[affected_rows, cluster] = df.loc[affected_rows, cluster].fillna(0)

    return df

In [None]:
# Zero-fill the first NaN cluster. The cluster's column list comes from train
# and is reused for test.
train = process_type_0_nan(train, clusters[0])
test = process_type_0_nan(test, clusters[0])

In [None]:
def process_type_1_nan(df, cluster):
    """Impute a NaN cluster whose gaps can sit on a customer's first or later rows.

    Customers are picked by NaNs in the first column of ``cluster``:

    * group 1 - the NaN is on the customer's first row
      (``first_occurance == 1``): every NaN of that customer in the
      ``cluster`` columns is replaced with 0;
    * group 0 - the NaN is on a later row: the ``cluster`` columns of that
      customer are replaced with the mean of the previous (t - 1) and next
      (t + 1) known values.

    The neighbour values are looked up per customer, so a gap at the start or
    end of one customer's history is never filled from an adjacent customer's
    rows. A gap with no known value on one side within the customer stays NaN.
    The frame is expected to be in per-customer chronological order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``customer_ID``, ``first_occurance`` and the ``cluster``
        columns; modified in place.
    cluster : list of str
        Column names forming the NaN cluster.

    Returns
    -------
    pandas.DataFrame
        The same frame.
    """
    columns = list(cluster)
    lead_is_nan = df[columns[0]].isnull()

    group_0_customers = df.loc[lead_is_nan & (df["first_occurance"] == 0), "customer_ID"].unique()
    group_1_customers = df.loc[lead_is_nan & (df["first_occurance"] == 1), "customer_ID"].unique()

    # fill group 1 by 0
    group_1_rows = df["customer_ID"].isin(group_1_customers)
    df.loc[group_1_rows, columns] = df.loc[group_1_rows, columns].fillna(0)

    # fill group 0 by mean of t - 1 and t + 1
    # The fills are taken after the zero fill above and inside each customer only.
    per_customer = df.groupby("customer_ID")[columns]
    forward = per_customer.ffill()
    backward = per_customer.bfill()

    group_0_rows = df["customer_ID"].isin(group_0_customers)
    # Known cells have forward == backward, so the mean leaves them unchanged.
    df.loc[group_0_rows, columns] = (forward.loc[group_0_rows] + backward.loc[group_0_rows]) / 2

    return df

In [None]:
# Impute the second NaN cluster. The cluster's column list comes from train
# and is reused for test.
train = process_type_1_nan(train, clusters[1])
test = process_type_1_nan(test, clusters[1])

In [None]:
def process_type_2_nan(df, cluster):
    """Impute a NaN cluster using neighbour values taken before any zero fill.

    Customers are picked by NaNs in the first column of ``cluster``:

    * group 1 - the NaN is on the customer's first row
      (``first_occurance == 1``): every NaN of that customer in the
      ``cluster`` columns is replaced with 0;
    * group 0 - the NaN is on a later row: the ``cluster`` columns of that
      customer are replaced with the mean of the forward-filled and
      back-filled values.

    The forward/backward fills are computed per customer, from the values as
    they were *before* the group 1 zero fill. A gap is therefore never filled
    from an adjacent customer's rows, and a gap with no known value on one
    side within the customer stays NaN. The frame is expected to be in
    per-customer chronological order.

    NOTE(review): for a customer that is in both groups, the group 0 step
    overwrites the zeros written by the group 1 step (this ordering is
    unchanged from the previous implementation) — confirm that is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``customer_ID``, ``first_occurance`` and the ``cluster``
        columns; modified in place.
    cluster : list of str
        Column names forming the NaN cluster.

    Returns
    -------
    pandas.DataFrame
        The same frame.
    """
    columns = list(cluster)
    lead_is_nan = df[columns[0]].isnull()

    group_0_customers = df.loc[lead_is_nan & (df["first_occurance"] == 0), "customer_ID"].unique()
    group_1_customers = df.loc[lead_is_nan & (df["first_occurance"] == 1), "customer_ID"].unique()

    # Neighbour values inside each customer, captured before df is modified.
    per_customer = df.groupby("customer_ID")[columns]
    forward = per_customer.ffill()
    backward = per_customer.bfill()

    # fill group 1 by 0 (an earlier variant back-filled this group instead)
    group_1_rows = df["customer_ID"].isin(group_1_customers)
    df.loc[group_1_rows, columns] = df.loc[group_1_rows, columns].fillna(0)

    # fill group 0 by mean of ffill and bfill
    group_0_rows = df["customer_ID"].isin(group_0_customers)
    # Known cells have forward == backward, so the mean leaves them unchanged.
    df.loc[group_0_rows, columns] = (forward.loc[group_0_rows] + backward.loc[group_0_rows]) / 2

    return df

In [None]:
# Impute the third NaN cluster. The cluster's column list comes from train
# and is reused for test.
train = process_type_2_nan(train, clusters[2])
test = process_type_2_nan(test, clusters[2])

# add time id

In [None]:
def add_time_id(df):
    """Number each customer's rows 0, 1, 2, ... in their current order.

    The counter is stored on ``df`` as ``time_id`` and the same frame is
    returned.
    """
    rows_by_customer = df.groupby(["customer_ID"])
    df["time_id"] = rows_by_customer.cumcount()
    return df

In [None]:
# Add the per-customer running row number to both frames.
train = add_time_id(train)
test = add_time_id(test)

# add end_year_month

In [None]:
def add_end_year_month(df):
    """Store the month of each customer's last row as ``end_year_month``.

    ``S_2`` is converted to a monthly period, then every row receives the
    value found on the last row of its customer. "Last" is positional, so it
    is the latest month only when the frame is sorted by date within each
    customer. ``df`` is modified in place and returned.
    """
    df["end_year_month"] = df["S_2"].dt.to_period("M")

    months_by_customer = df.groupby("customer_ID")["end_year_month"]
    df["end_year_month"] = months_by_customer.transform("last")

    return df

In [None]:
# Attach each customer's final statement month to both frames.
train = add_end_year_month(train)
test = add_end_year_month(test)

In [None]:
# Row counts per final statement month in test.
test["end_year_month"] .value_counts()

In [None]:
# Checkpoint: persist the NaN-processed frames so the later sections can start from disk.
train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_fillna.parquet")
test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_fillna.parquet")

# feature adjustment

In [2]:
# Reload the integer column names written earlier with dump(int_cols, "int_cols.pkl").
# joblib.load unpickles the file, so it should only be used on files this notebook produced.
int_cols = load("int_cols.pkl")

In [3]:
# Resume from the NaN-processed checkpoint files.
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train_fillna.parquet")
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test_fillna.parquet")

In [4]:
# Features whose test values are re-centred onto the train mean.
shift_features = [
    "D_42", "D_52", "D_59", "D_79", "D_93", "D_105", "D_116",
    "D_122", "D_130", "D_133", "D_142", "S_11", "B_36",
]

# Features to clip. Each list names the data its clipping statistics come from;
# integer-typed columns are dropped from every list.
outlier_features = [
    feature for feature in ["D_106", "S_23", "B_10"]
    if feature not in int_cols
]

test_base_outlier_features = [
    feature for feature in ["D_102", "D_109", "D_144", "B_6", "B_40"]
    if feature not in int_cols
]

test_public_base_outlier_features = [
    feature for feature in ["D_69"]
    if feature not in int_cols
]

test_private_base_outlier_features = [
    feature for feature in ["S_18"]
    if feature not in int_cols
]

In [5]:
# Take the end month of the first and of the last row of test as the "public" and
# "private" periods.
# NOTE(review): assumes test holds exactly two end months and that its first/last
# rows fall in different ones — verify with test["end_year_month"].value_counts().
test_public_end_year_month = test["end_year_month"].iloc[0]
test_private_end_year_month = test["end_year_month"].iloc[-1]

In [6]:
# List the shift features that are integer-typed.
for col in shift_features:
    if col not in int_cols:
        continue
    print(col)

D_59
D_79
D_93
D_116
D_122
S_11


In [7]:
# shift features
# For each test period, move every shift feature so that its period mean equals the
# train mean: value - mean(test period) + mean(train). NaNs are ignored in the means.
train_shift_means = np.nanmean(train[shift_features], axis=0)

for end_month in (test_public_end_year_month, test_private_end_year_month):
    in_period = test["end_year_month"] == end_month
    period_values = test.loc[in_period, shift_features]
    test.loc[in_period, shift_features] = \
        period_values - np.nanmean(period_values, axis=0) + train_shift_means

In [8]:
# # int shift_features, floor
# int_shift_features = [feature for feature in shift_features if feature in int_cols]

# test[int_shift_features] = test[int_shift_features].fillna(-100)
# test[int_shift_features] = np.floor(test[int_shift_features]).astype(int)

# test[test==-100] = np.nan

In [None]:
# # float shift_features
# float_shift_features = [feature for feature in shift_features if feature not in int_cols]

# test.loc[test["end_year_month"] == test_public_end_year_month, float_shift_features] = \
#     test.loc[test["end_year_month"] == test_public_end_year_month, float_shift_features] - \
#     np.nanmean(test.loc[test["end_year_month"] == test_public_end_year_month, float_shift_features], axis=0) + \
#     np.nanmean(train[float_shift_features], axis=0)

# test.loc[test["end_year_month"] == test_private_end_year_month, float_shift_features] = \
#     test.loc[test["end_year_month"] == test_private_end_year_month, float_shift_features] - \
#     np.nanmean(test.loc[test["end_year_month"] == test_private_end_year_month, float_shift_features], axis=0) + \
#     np.nanmean(train[float_shift_features], axis=0)

# # int shift_features
# int_shift_features = [feature for feature in shift_features if feature in int_cols]

# test.loc[test["end_year_month"] == test_public_end_year_month, int_shift_features] = \
#     test.loc[test["end_year_month"] == test_public_end_year_month, int_shift_features] - \
#     np.floor(np.nanmean(test.loc[test["end_year_month"] == test_public_end_year_month, int_shift_features], axis=0)).astype(int) + \
#     np.floor(np.nanmean(train[int_shift_features], axis=0)).astype(int)

# test.loc[test["end_year_month"] == test_private_end_year_month, int_shift_features] = \
#     test.loc[test["end_year_month"] == test_private_end_year_month, int_shift_features] - \
#     np.floor(np.nanmean(test.loc[test["end_year_month"] == test_private_end_year_month, int_shift_features], axis=0)).astype(int) + \
#     np.floor(np.nanmean(train[int_shift_features], axis=0)).astype(int)

In [None]:
# Checkpoint: persist the frames after the mean shift of the test features.
train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_shifted.parquet")
test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_shifted.parquet")

In [None]:
# Resume from the shifted checkpoint files.
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train_shifted.parquet")
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test_shifted.parquet")

In [None]:
# Split test by end month. Both names are selections taken now, so later edits to
# test are not reflected in them.
test_public = test.loc[test["end_year_month"] == test_public_end_year_month]
test_private = test.loc[test["end_year_month"] == test_private_end_year_month]

In [None]:
# outlier_features
# Clip train and test to mean +/- 3 std, with both statistics taken over train and
# test together (NaNs ignored).
all_data = pd.concat([train, test], axis=0)

outlier_features_mean = np.nanmean(all_data[outlier_features], axis=0)
outlier_features_std = np.nanstd(all_data[outlier_features], axis=0)
outlier_lower = outlier_features_mean - 3 * outlier_features_std
outlier_upper = outlier_features_mean + 3 * outlier_features_std

train[outlier_features] = np.clip(train[outlier_features], outlier_lower, outlier_upper)
test[outlier_features] = np.clip(test[outlier_features], outlier_lower, outlier_upper)

In [None]:
# Value distribution of D_109 in test, before the test-based clipping below.
test["D_109"].value_counts()

In [None]:
# Value distribution of D_109 in train, for comparison with test.
train["D_109"].value_counts()

In [None]:
# test_base_outlier_features
# Clip train and test to mean +/- 3 std, with both statistics taken from test only
# (NaNs ignored).
test_base_outlier_features_mean = np.nanmean(test[test_base_outlier_features], axis=0)
test_base_outlier_features_std = np.nanstd(test[test_base_outlier_features], axis=0)
test_base_lower = test_base_outlier_features_mean - 3 * test_base_outlier_features_std
test_base_upper = test_base_outlier_features_mean + 3 * test_base_outlier_features_std

train[test_base_outlier_features] = np.clip(train[test_base_outlier_features], test_base_lower, test_base_upper)
test[test_base_outlier_features] = np.clip(test[test_base_outlier_features], test_base_lower, test_base_upper)

In [None]:
# test_public_base_outlier_features
# Clip train and test to mean +/- 3 std, with both statistics taken from the
# test_public rows only (NaNs ignored).
test_public_base_outlier_features_mean = np.nanmean(test_public[test_public_base_outlier_features], axis=0)
test_public_base_outlier_features_std = np.nanstd(test_public[test_public_base_outlier_features], axis=0)
test_public_lower = test_public_base_outlier_features_mean - 3 * test_public_base_outlier_features_std
test_public_upper = test_public_base_outlier_features_mean + 3 * test_public_base_outlier_features_std

train[test_public_base_outlier_features] = np.clip(
    train[test_public_base_outlier_features], test_public_lower, test_public_upper
)
test[test_public_base_outlier_features] = np.clip(
    test[test_public_base_outlier_features], test_public_lower, test_public_upper
)

In [None]:
# test_private_base_outlier_features
# Clip train and test to mean +/- 3 std, with both statistics taken from the
# test_private rows only (NaNs ignored).
test_private_base_outlier_features_mean = np.nanmean(test_private[test_private_base_outlier_features], axis=0)
test_private_base_outlier_features_std = np.nanstd(test_private[test_private_base_outlier_features], axis=0)
test_private_lower = test_private_base_outlier_features_mean - 3 * test_private_base_outlier_features_std
test_private_upper = test_private_base_outlier_features_mean + 3 * test_private_base_outlier_features_std

train[test_private_base_outlier_features] = np.clip(
    train[test_private_base_outlier_features], test_private_lower, test_private_upper
)
test[test_private_base_outlier_features] = np.clip(
    test[test_private_base_outlier_features], test_private_lower, test_private_upper
)

In [None]:
# Mean and std used to clip the second entry of test_base_outlier_features
# (that is D_109 unless the int_cols filter removed an earlier name).
test_base_outlier_features_mean[1], test_base_outlier_features_std[1]

In [None]:
# Checkpoint: persist the frames after outlier clipping.
train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_clipped.parquet")
test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_clipped.parquet")

# feature engineering

In [None]:
def process_and_feature_engineer(df):
    """Collapse the per-statement rows into one feature row per customer.

    Three groups of aggregates are built and joined on ``customer_ID``:

    * numeric columns: the ``np.nanstd`` / ``np.nanmin`` / ``np.nanmax`` /
      ``"last"`` aggregations, named ``<col>_nanstd`` ... ``<col>_last``;
    * numeric columns again: ``np.nanmean`` over the rows with ``time_id``
      at least 0, 4, 7 and 10, named ``<col>_nanmean_<start>``;
    * categorical columns: ``count``, ``last`` and ``nunique``.

    Key/bookkeeping columns and the names in ``nan_related_features`` take
    no part in any aggregation. A customer with no row in a ``time_id``
    window gets NaN for that window's columns.

    Returns a new frame indexed by ``customer_ID``.
    """
    bookkeeping = {"customer_ID", "S_2", "first_occurance", "time_id", "end_year_month"}
    all_cols = [c for c in df.columns if c not in bookkeeping]

    nan_related_features = [
        "number_of_observations",
        "type_0_nan",
        "type_1_nan",
        "type_2_nan"
    ]
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    non_numeric = set(cat_features) | set(nan_related_features)
    num_features = [col for col in all_cols if col not in non_numeric]

    print("process num features")
    num_agg = df.groupby("customer_ID")[num_features].agg([
        np.nanstd,
        np.nanmin,
        np.nanmax,
        "last"
    ])
    num_agg.columns = ["_".join(pair) for pair in num_agg.columns]
    print("num features shape:", num_agg.shape)

    print("process sma num features")
    # One mean per window; a window keeps the rows whose time_id is >= its start.
    windows = ((0, "_nanmean_0"), (4, "_nanmean_4"), (7, "_nanmean_7"), (10, "_nanmean_10"))
    window_means = []
    for start, suffix in windows:
        in_window = df.loc[df["time_id"] >= start]
        window_mean = in_window.groupby("customer_ID")[num_features].agg(np.nanmean)
        window_mean.columns = [(name + suffix) for name in window_mean.columns]
        window_means.append(window_mean)

    sma_num_agg = pd.concat(window_means, axis=1)
    print("sma num features shape:", sma_num_agg.shape)

    print("process cat features")
    cat_agg = df.groupby("customer_ID")[cat_features].agg(["count", "last", "nunique"])
    cat_agg.columns = ["_".join(pair) for pair in cat_agg.columns]
    print("cat features shape:", cat_agg.shape)

    features = pd.concat([num_agg, sma_num_agg, cat_agg], axis=1)
    print("shape after engineering", features.shape)

    return features

In [None]:
# Replace the per-statement frames with one aggregated row per customer.
train = process_and_feature_engineer(train)
test = process_and_feature_engineer(test)

In [None]:
# Remaining NaN count per engineered feature.
train.isnull().sum()

In [None]:
# Preview the aggregated train features.
train.head()

# Add target

In [None]:
def add_target(df, labels_path="../input/train_labels.csv"):
    """Join the training labels onto a customer-indexed feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Features indexed by the integer ``customer_ID`` (the last 16 hex
        characters of the original id, parsed as base-16).
    labels_path : str, optional
        CSV file with ``customer_ID`` and ``target`` columns. Defaults to the
        location the notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        ``df`` sorted by customer id, with the id moved from the index into a
        column and an ``int8`` ``target`` column appended.

    Raises
    ------
    ValueError
        If at least one customer in ``df`` has no label.
    """
    targets = pd.read_csv(labels_path)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Same id encoding as the feature frames: last 16 hex characters -> int64.
    targets["customer_ID"] = (
        targets["customer_ID"].str[-16:].map(lambda x: int(x, 16)).astype("int64")
    )
    targets = targets.set_index("customer_ID")

    df = df.merge(targets, left_index=True, right_index=True, how="left")

    # A left join leaves NaN for unmatched customers and NaN cannot be cast to
    # int8, so fail with a message that names the actual problem.
    unlabeled = int(df["target"].isna().sum())
    if unlabeled:
        raise ValueError(
            "{} customer(s) have no entry in {}".format(unlabeled, labels_path)
        )
    df["target"] = df["target"].astype("int8")

    # Fixed row order keeps cross-validation splits deterministic.
    df = df.sort_index().reset_index()

    return df

In [None]:
# Attach the labels; afterwards customer_ID is a regular column of train.
train = add_target(train)

In [None]:
# (rows, columns) of train after the label join.
train.shape

In [None]:
# Preview train with the target column attached.
train.head()

# label encoding

In [None]:
def label_encoding(df):
    """Integer-encode the ``<feature>_last`` column of every categorical feature.

    For each base categorical feature a fresh ``LabelEncoder`` is fitted on
    the ``<feature>_last`` column of ``df`` and the column is overwritten with
    the resulting codes. ``df`` is modified in place and returned.

    NOTE(review): the encoders are fitted on the frame passed in, so two
    frames encoded by separate calls only share a code mapping if they contain
    the same set of values — confirm this holds for train and test.
    """
    cat_features_base = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]

    for base_name in cat_features_base:
        column = "{}_last".format(base_name)
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [None]:
# Encode the categorical "_last" columns of both frames.
# NOTE(review): each call fits its own encoders on the frame it receives, so the
# train and test codes agree only if both frames contain the same category values.
train = label_encoding(train)
test = label_encoding(test)

# save files

In [None]:
# Write the final per-customer feature tables.
train.to_parquet("../input/train_base_clipped.parquet")
test.to_parquet("../input/test_base_clipped.parquet")