In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# load files

In [None]:
def load_file(
    path="",
    usecols=None
):
    """Read a parquet file and prepare it for per-customer processing.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to read. ``None`` (the default) reads every column.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with ``customer_ID`` converted to int64, ``S_2``
        parsed as datetime, sorted by ``customer_ID`` then ``S_2`` and
        carrying a fresh 0..n-1 index.
    """
    # LOAD DATAFRAME (columns=None means "all columns" for read_parquet)
    df = pd.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER: keep the last 16 hex digits (64 bits) of the ID.
    # The digits are parsed as an unsigned 64-bit value and then reinterpreted
    # as signed, so IDs with the top bit set become negative. Doing this
    # explicitly avoids relying on pandas dtype inference and on the
    # deprecated DataFrame.applymap.
    hex_suffix = df["customer_ID"].str[-16:]
    as_uint64 = np.array([int(h, 16) for h in hex_suffix], dtype=np.uint64)
    df["customer_ID"] = as_uint64.view(np.int64)

    # PARSE DATE
    df["S_2"] = pd.to_datetime(df["S_2"])

    # SORT BY CUSTOMER AND DATE (so agg("last") works correctly)
    df = df.sort_values(["customer_ID", "S_2"])
    df = df.reset_index(drop=True)

    # REPORT SIZE (no NaN filling happens here)
    print("shape of data:", df.shape)

    return df

In [None]:
# Load both splits through load_file, which converts customer_ID to int64,
# parses S_2 and sorts each frame by customer and date.
print("Reading data...")
TRAIN_PATH = "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
train = load_file(path = TRAIN_PATH)

# Same preparation for the test split.
TEST_PATH = "../input/amex-data-integer-dtypes-parquet-format/test.parquet"
test = load_file(path = TEST_PATH)

In [None]:
# revert to nan: -1 is turned back into NaN in the feature columns only.
# customer_ID and S_2 are skipped on purpose: a customer_ID that happened to
# equal -1 would otherwise become NaN and force the whole key column to float.
for split_frame in (train, test):
    feature_cols = [c for c in split_frame.columns if c not in ("customer_ID", "S_2")]
    split_frame[feature_cols] = split_frame[feature_cols].mask(split_frame[feature_cols] == -1)

In [None]:
train.head()

In [None]:
train.shape

# add number of observations

In [None]:
def add_observation(df):
    """Add ``number_of_observations``: how many rows each customer has.

    The column is written onto ``df`` itself, and the same object is
    returned. Every row of a customer receives that customer's row count.
    """
    ids_by_customer = df.groupby("customer_ID")["customer_ID"]
    rows_per_customer = ids_by_customer.transform("count")
    df["number_of_observations"] = rows_per_customer
    return df

In [None]:
train = add_observation(train)
test = add_observation(test)

# add first occurrence flag

In [None]:
def add_first_occurance(df):
    """Flag the first row of every customer.

    Adds an int64 column ``first_occurance`` that is 1 on the first row
    (in current row order) in which a ``customer_ID`` appears and 0 on all
    later rows of that customer. The column is written onto ``df`` and
    ``df`` is returned.

    Row position is used rather than index labels, so the result does not
    depend on the index being unique, and no temporary ``"index"`` column
    is created (an existing column with that name is left untouched).
    """
    # duplicated() is False exactly on the first appearance of each ID.
    is_first_row = ~df["customer_ID"].duplicated()
    df["first_occurance"] = is_first_row.astype("int64")

    return df

In [None]:
train = add_first_occurance(train)
test = add_first_occurance(test)

# process nan

In [None]:
# get nan clusters first
# A "cluster" is a set of columns that all have exactly the same number of NaNs.
# Every column except the first two (selected by position) is considered.
# NOTE(review): this assumes the first two columns are customer_ID and S_2 - confirm.
cols = sorted(train.columns[2:].tolist())
# One row per column: "index" holds the column name, "NA_count" its NaN count.
nas = train[cols].isna().sum(axis=0).reset_index(name="NA_count")
# For columns with at least one NaN: how many columns share that exact NaN count.
nas["group_count"] = nas.loc[nas.NA_count > 0].groupby("NA_count").transform("count")
# Keep NaN counts shared by more than 10 columns; the result is an array of
# column-name lists, one per NaN count, in ascending NaN-count order.
# NOTE(review): later cells index clusters[0..2], so at least three such groups
# are presumably expected in the training data - verify.
clusters = nas.loc[nas.group_count > 10].sort_values(["NA_count","index"]).groupby("NA_count")["index"].apply(list).values

In [None]:
def process_type_0_nan(df, cluster):
    """Flag and zero-fill rows where the cluster's first column is missing.

    ``type_0_nan`` is set to 1 on rows where ``cluster[0]`` is null and to 0
    elsewhere. On the flagged rows every NaN in the ``cluster`` columns is
    replaced by 0. ``df`` is modified in place and returned.
    """
    is_missing = df[cluster[0]].isnull()

    # Same 0/1 indicator the mask describes, stored as int64.
    df["type_0_nan"] = is_missing.astype("int64")

    zero_filled = df.loc[is_missing, cluster].fillna(0)
    df.loc[is_missing, cluster] = zero_filled

    return df

In [None]:
train = process_type_0_nan(train, clusters[0])
test = process_type_0_nan(test, clusters[0])

In [None]:
def process_type_1_nan(df, cluster):
    """Flag and impute rows where the cluster's first column is missing.

    Adds ``type_1_nan`` to ``df``:

    * 0 - ``cluster[0]`` is present;
    * 1 - ``cluster[0]`` is missing on the customer's first row;
    * 2 - ``cluster[0]`` is missing on a later row of the customer.

    Imputation of the ``cluster`` columns:

    * rows flagged 1 have their NaNs replaced by 0;
    * rows flagged 2 receive the mean of the previous and the next known
      value *of the same customer*. When only one of the two neighbours
      exists (for example the gap is on the customer's last row), that
      single neighbour is used.

    Fills are computed per ``customer_ID`` so that a value is never borrowed
    from a different customer's rows. ``df`` is expected to be ordered by
    customer and time, and to already carry ``first_occurance``. It is
    modified in place and returned.
    """
    cluster = list(cluster)
    missing = df[cluster[0]].isnull()
    first_gap = missing & (df["first_occurance"] == 1)
    later_gap = missing & (df["first_occurance"] == 0)

    df["type_1_nan"] = 0
    df.loc[first_gap, "type_1_nan"] = 1
    df.loc[later_gap, "type_1_nan"] = 2

    # fill type_1_nan == 1 by 0
    df.loc[first_gap, cluster] = df.loc[first_gap, cluster].fillna(0)

    # fill type_1_nan == 2 by mean of t - 1 and t + 1 within the same customer
    # (computed after the zero fill so a first-row 0 can act as t - 1)
    by_customer = df.groupby("customer_ID", sort=False)[cluster]
    previous_known = by_customer.ffill()
    next_known = by_customer.bfill()

    neighbour_mean = (previous_known + next_known) / 2
    # Fall back to the single available neighbour when the other is absent.
    neighbour_mean = neighbour_mean.fillna(previous_known).fillna(next_known)

    df.loc[later_gap, cluster] = neighbour_mean.loc[later_gap, cluster]

    return df

In [None]:
train = process_type_1_nan(train, clusters[1])
test = process_type_1_nan(test, clusters[1])

In [None]:
def process_type_2_nan(df, cluster):
    """Add the ``type_2_nan`` indicator; no values are imputed.

    The flag is 1 where ``cluster[0]`` is null and ``first_occurance`` is 1,
    2 where ``cluster[0]`` is null and ``first_occurance`` is 0, and 0 on
    every other row. ``df`` is modified in place and returned.
    """
    missing = df[cluster[0]].isnull().to_numpy()
    first_seen = df["first_occurance"].to_numpy()

    flags = np.select(
        [missing & (first_seen == 1), missing & (first_seen == 0)],
        [1, 2],
        default=0,
    )
    df["type_2_nan"] = flags.astype("int64")

    return df

In [None]:
train = process_type_2_nan(train, clusters[2])
test = process_type_2_nan(test, clusters[2])

# add time id

In [None]:
def add_time_id(df):
    """Number each customer's rows 0, 1, 2, ... in their current order.

    The running count is stored in ``time_id`` on ``df`` itself, which is
    then returned.
    """
    rows_by_customer = df.groupby(["customer_ID"])
    df["time_id"] = rows_by_customer.cumcount()
    return df

In [None]:
train = add_time_id(train)
test = add_time_id(test)

# add end_year_month

In [None]:
def add_end_year_month(df):
    """Add ``end_year_month``: the month of each customer's last statement.

    For every customer the last non-null ``S_2`` (in current row order) is
    taken, converted to a monthly period, and broadcast to all of that
    customer's rows. ``df`` is modified in place and returned.
    """
    last_statement = df.groupby("customer_ID")["S_2"].transform("last")
    df["end_year_month"] = last_statement.dt.to_period("M")
    return df

In [None]:
train = add_end_year_month(train)
test = add_end_year_month(test)

In [None]:
test["end_year_month"] .value_counts()

# normalize features per customer (train, and each test cohort)

In [2]:
# Reload the pre-processed frames from parquet.
# NOTE(review): no cell in this notebook writes train_fillna.parquet or
# test_fillna.parquet; presumably they are the result of the preprocessing
# cells above, saved in a separate run - confirm before relying on
# Restart & Run All.
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train_fillna.parquet")
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test_fillna.parquet")

In [3]:
# Columns that identify or order rows and are never used as model inputs.
non_feature_cols = {"customer_ID", "S_2", "first_occurance", "time_id", "end_year_month"}
all_cols = [name for name in train.columns if name not in non_feature_cols]

# Indicators created by the NaN-processing steps.
nan_related_features = ["number_of_observations", "type_0_nan", "type_1_nan", "type_2_nan"]

# Features treated as categorical.
cat_features = [
    "B_30", "B_38",
    "D_114", "D_116", "D_117", "D_120", "D_126",
    "D_63", "D_64", "D_66", "D_68",
]

# Everything left over is numeric; the order of all_cols is preserved.
not_numeric = set(cat_features) | set(nan_related_features)
num_features = [name for name in all_cols if name not in not_numeric]

In [4]:
len(num_features)

177

In [5]:
# The test customers fall into cohorts by the month of their last statement.
# Take the earliest and the latest month from the distinct values instead of
# the first/last row: the frame is ordered by customer_ID, so row position
# says nothing about which month a row belongs to, and both picks could land
# in the same cohort.
# NOTE(review): which month is the "public" and which the "private" split is
# not established in this notebook; the two names are used symmetrically below.
test_end_year_months = sorted(test["end_year_month"].dropna().unique())
test_public_end_year_month = test_end_year_months[0]
test_private_end_year_month = test_end_year_months[-1]

In [6]:
# Express every numeric feature relative to the customer's own first
# (non-null) value of that feature.
train[num_features] -= train.groupby("customer_ID")[num_features].transform("first")

# Same normalisation for test, applied one end-month cohort at a time.
for cohort_month in (test_public_end_year_month, test_private_end_year_month):
    in_cohort = test["end_year_month"] == cohort_month
    test.loc[in_cohort, num_features] -= (
        test.loc[in_cohort].groupby("customer_ID")[num_features].transform("first")
    )

# feature engineering

In [7]:
def process_and_feature_engineer(df, sma_start_ids=(0, 4, 7, 10)):
    """Collapse the per-statement rows into one feature row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-per-statement frame containing ``customer_ID``, ``time_id`` and
        all eleven categorical features listed below. Every other column
        that is not a bookkeeping or NaN-indicator column is treated as a
        numeric feature.
    sma_start_ids : sequence of int, optional
        For each value ``s`` a mean over the rows with ``time_id >= s`` is
        computed per customer and stored as ``<feature>_nanmean_<s>``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID`` with, in this order:

        * ``<num>_nanstd``, ``<num>_nanmin``, ``<num>_nanmax``, ``<num>_last``
        * ``<num>_nanmean_<s>`` for every ``s`` in ``sma_start_ids``
        * ``<cat>_count``, ``<cat>_last``, ``<cat>_nunique``

    Notes
    -----
    The aggregations are requested by name ("std", "min", "max", "mean").
    These are pandas' NaN-skipping group reductions; "std" uses ddof=1, so a
    customer with a single observation gets NaN. The NumPy callables
    (``np.nanstd`` etc.) are no longer passed to ``agg``, which pins the
    result regardless of how a given pandas version treats such callables.
    The ``nan*`` column suffixes are kept for downstream compatibility.
    """
    all_cols = [c for c in list(df.columns) if c not in ["customer_ID", "S_2", "first_occurance", "time_id", "end_year_month"]]
    nan_related_features = [
        "number_of_observations",
        "type_0_nan",
        "type_1_nan",
        "type_2_nan"
    ]
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    num_features = [col for col in all_cols if col not in (cat_features + nan_related_features)]

    print("process num features")
    # (pandas reduction name, suffix used in the output column name)
    num_stats = [("std", "nanstd"), ("min", "nanmin"), ("max", "nanmax"), ("last", "last")]
    suffix_of = dict(num_stats)
    num_agg = df.groupby("customer_ID")[num_features].agg([stat for stat, _ in num_stats])
    num_agg.columns = [feature + "_" + suffix_of[stat] for feature, stat in num_agg.columns]
    print("num features shape:", num_agg.shape)

    print("process sma num features")
    sma_parts = []
    for start_id in sma_start_ids:
        # Customers with no row at or after start_id are absent here and end
        # up as NaN once the parts are aligned on customer_ID below.
        recent_rows = df.loc[df["time_id"] >= start_id]
        part = recent_rows.groupby("customer_ID")[num_features].agg("mean")
        part.columns = [(x + "_nanmean_" + str(start_id)) for x in part.columns]
        sma_parts.append(part)
    sma_num_agg = pd.concat(sma_parts, axis=1)
    print("sma num features shape:", sma_num_agg.shape)

    print("process cat features")
    cat_agg = df.groupby("customer_ID")[cat_features].agg(["count", "last", "nunique"])
    cat_agg.columns = ["_".join(x) for x in cat_agg.columns]
    print("cat features shape:", cat_agg.shape)

    df = pd.concat([num_agg, sma_num_agg, cat_agg], axis=1)
    print("shape after engineering", df.shape)

    return df

In [8]:
train = process_and_feature_engineer(train)
test = process_and_feature_engineer(test)

process num features
num features shape: (458913, 708)
process sma num features
sma num features shape: (458913, 708)
process cat features
cat features shape: (458913, 33)
shape after engineering (458913, 1449)
process num features
num features shape: (924621, 708)
process sma num features
sma num features shape: (924621, 708)
process cat features
cat features shape: (924621, 33)
shape after engineering (924621, 1449)


In [9]:
train.isnull().sum()

P_2_nanstd        7829
P_2_nanmin        2434
P_2_nanmax        2434
P_2_last          2434
D_39_nanstd       5120
                 ...  
D_66_last       399137
D_66_nunique         0
D_68_count           0
D_68_last         5251
D_68_nunique         0
Length: 1449, dtype: int64

In [10]:
train.head()

Unnamed: 0_level_0,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,B_1_nanmin,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223358381327749917,0.057145,-0.001855,0.156693,0.045674,4.628507,-9,7,-9,0.048472,0.0,...,1,13,2.0,1,0,,0,13,3.0,2
-9223193039457028513,0.013094,-0.014827,0.023167,0.022062,0.0,0,0,0,0.001941,-0.004207,...,2,13,0.0,1,0,,0,13,6.0,1
-9223189665817919541,0.038025,-0.118679,0.01601,-0.118679,0.0,0,0,0,0.002724,-0.005812,...,1,13,0.0,1,0,,0,13,6.0,1
-9223188534444851899,0.002688,-0.007564,0.000614,-0.006266,0.0,0,0,0,0.00257,-0.003348,...,1,13,3.0,2,0,,0,13,5.0,1
-9223173911659837606,0.078554,-0.214214,0.0,-0.211935,6.144625,0,17,13,0.005226,-0.003232,...,1,13,0.0,2,0,,0,13,6.0,2


# Add target

In [11]:
def add_target(df, labels_path="../input/train_labels.csv"):
    """Attach the ``target`` label to a customer-indexed feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame whose index holds the int64 ``customer_ID``.
    labels_path : str, optional
        CSV file with the columns ``customer_ID`` (hex string) and ``target``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an int8 ``target`` column, sorted by customer ID, with
        the former index moved into a regular column.

    Raises
    ------
    ValueError
        If any row of ``df`` has no matching label.
    """
    targets = pd.read_csv(labels_path)

    # REDUCE DTYPE FOR CUSTOMER: last 16 hex digits, parsed as unsigned 64-bit
    # and reinterpreted as signed int64 so IDs with the top bit set become
    # negative instead of overflowing.
    hex_suffix = targets["customer_ID"].str[-16:]
    as_uint64 = np.array([int(h, 16) for h in hex_suffix], dtype=np.uint64)
    targets["customer_ID"] = as_uint64.view(np.int64)

    targets = targets.set_index("customer_ID")

    df = df.merge(targets, left_index=True, right_index=True, how="left")

    # A left join leaves NaN for customers without a label; report that
    # explicitly rather than failing inside the int8 cast.
    unmatched = int(df["target"].isna().sum())
    if unmatched:
        raise ValueError(
            "{} customer(s) have no target in {}".format(unmatched, labels_path)
        )
    df["target"] = df["target"].astype("int8")

    # Sort so the row order (and therefore any CV split) is deterministic,
    # then expose the customer ID as a regular column.
    df = df.sort_index().reset_index()

    return df

In [12]:
train = add_target(train)

In [13]:
train.shape

(458913, 1451)

In [14]:
train.head()

Unnamed: 0,customer_ID,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,B_1_nanstd,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
0,-9223358381327749917,0.057145,-0.001855,0.156693,0.045674,4.628507,-9,7,-9,0.048472,...,13,2.0,1,0,,0,13,3.0,2,1
1,-9223193039457028513,0.013094,-0.014827,0.023167,0.022062,0.0,0,0,0,0.001941,...,13,0.0,1,0,,0,13,6.0,1,0
2,-9223189665817919541,0.038025,-0.118679,0.01601,-0.118679,0.0,0,0,0,0.002724,...,13,0.0,1,0,,0,13,6.0,1,0
3,-9223188534444851899,0.002688,-0.007564,0.000614,-0.006266,0.0,0,0,0,0.00257,...,13,3.0,2,0,,0,13,5.0,1,0
4,-9223173911659837606,0.078554,-0.214214,0.0,-0.211935,6.144625,0,17,13,0.005226,...,13,0.0,2,0,,0,13,6.0,2,1


# label encoding

In [15]:
def label_encoding(df, encoders=None):
    """Integer-encode the ``<cat>_last`` columns of ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``<feature>_last`` column for each of the eleven
        categorical base features listed below.
    encoders : dict, optional
        Mapping ``column name -> fitted LabelEncoder``. A column found in
        the mapping is transformed with the stored encoder; a column not
        found is fitted on ``df`` and its encoder is stored in the mapping.
        Passing the same dict to successive calls (e.g. first train, then
        test) therefore gives both frames identical codes. When omitted,
        every call fits fresh encoders on its own frame.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the categorical columns replaced by
        their integer codes.
    """
    cat_features_base = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [
        "{}_last".format(feature) for feature in cat_features_base
    ]

    if encoders is None:
        encoders = {}

    for feature in cat_features:
        encoder = encoders.get(feature)
        if encoder is None:
            # First frame seen for this column: learn the value -> code map.
            encoder = LabelEncoder()
            df[feature] = encoder.fit_transform(df[feature])
            encoders[feature] = encoder
        else:
            # Reuse the existing map so codes agree with the earlier frame.
            df[feature] = encoder.transform(df[feature])

    return df

In [16]:
# Encode the "<cat>_last" columns of both frames.
# NOTE(review): each call creates and fits its own encoders, so a given code
# means the same category in train and test only if both frames contain the
# same set of values for that column - verify, or fit once and reuse.
train = label_encoding(train)
test = label_encoding(test)

# save files

In [17]:
# Persist the engineered frames.
# Note the layouts differ: train went through add_target, which moved
# customer_ID from the index into a column, while test still has customer_ID
# as its index (it comes straight from the groupby aggregation).
train.to_parquet("../input/train_base_normalized.parquet")
test.to_parquet("../input/test_base_normalized.parquet")