In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# load files

In [2]:
def load_file(
    path="",
    usecols=None
):
    """Read a parquet file of customer statements and normalise its key columns.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to read; ``None`` reads every column. ``customer_ID`` and
        ``S_2`` must be among them because both are converted below.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``customer_ID`` then ``S_2`` with a fresh 0..n-1
        index. ``customer_ID`` is an ``int64`` derived from the last 16 hex
        digits of the original string; ``S_2`` is a datetime column.
    """
    # columns=None is read_parquet's "all columns" default, so a single call
    # covers both the filtered and the unfiltered case.
    df = pd.read_parquet(path, columns=usecols)

    def hex_to_int64(hex_id):
        # 16 hex digits span the unsigned 64-bit range. Fold the upper half
        # onto negative numbers (two's complement) so every ID fits a signed
        # int64 instead of relying on an implicit uint64 -> int64 wrap.
        value = int(hex_id, 16)
        return value - (1 << 64) if value >= (1 << 63) else value

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df["customer_ID"] = (
        df["customer_ID"].str[-16:].map(hex_to_int64).astype("int64")
    )
    df["S_2"] = pd.to_datetime(df["S_2"])

    # SORT BY CUSTOMER AND DATE (so agg("last") works correctly)
    df = df.sort_values(["customer_ID", "S_2"]).reset_index(drop=True)

    print("shape of data:", df.shape)

    return df

In [3]:
# Load every column of the training parquet through load_file, which also
# converts customer_ID / S_2 and sorts the rows by customer and date.
print("Reading train data...")
TRAIN_PATH = "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
train = load_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [4]:
# Replace every cell equal to -1 with NaN.
# Presumably -1 is the missing-value sentinel of the integer-dtype parquet
# export -- TODO confirm against the dataset description.
# NOTE(review): the boolean mask spans the whole frame, so customer_ID and S_2
# are compared as well; consider restricting it to the feature columns.
train[train==-1] = np.nan

In [5]:
# Preview the first rows after the -1 -> NaN replacement.
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-9223358381327749917,2017-03-31,0.342033,9,0.298571,0.028331,0.506896,0.793958,0.0,0.823765,...,,,,0.0,0.0,0.0,,0.0,0.004787,0.0
1,-9223358381327749917,2017-04-07,0.340178,16,0.353684,0.026975,0.505335,0.795727,0.0,0.825231,...,,,,0.0,0.0,0.0,,0.0,0.003442,0.0
2,-9223358381327749917,2017-05-23,0.35601,1,0.448582,0.026601,0.50629,0.530133,0.0,0.923707,...,,,,0.0,0.0,0.0,,0.0,0.00334,0.0
3,-9223358381327749917,2017-06-22,0.378665,1,0.443752,0.024322,0.509069,0.539285,0.0,0.915724,...,,,,0.0,0.0,0.0,,0.0,0.007556,0.0
4,-9223358381327749917,2017-07-22,0.416543,1,0.463824,0.023064,0.505335,0.461935,0.0,0.919373,...,,,,0.0,0.0,0.0,,0.0,0.005299,0.0


In [6]:
# (rows, columns) of the loaded statement-level frame.
train.shape

(5531451, 190)

# add number of observations

In [7]:
def add_observation(df):
    """Attach each customer's row count to every one of their rows.

    Writes a ``number_of_observations`` column onto ``df`` (the frame is
    modified in place) holding, for each row, how many rows share that row's
    ``customer_ID``. The same frame is returned.
    """
    ids_per_customer = df.groupby("customer_ID")["customer_ID"]
    row_counts = ids_per_customer.transform("count")
    df["number_of_observations"] = row_counts
    return df

In [8]:
# Adds the number_of_observations column to train.
train = add_observation(train)

# add first occurrence flag

In [9]:
def add_first_occurance(df):
    """Flag the first row of every customer.

    Adds an ``int64`` column ``first_occurance`` to ``df`` in place and
    returns the frame. The value is 1 on the first row (in the frame's
    current row order) of each ``customer_ID`` and 0 on all other rows.

    The flag is derived from the ``customer_ID`` values themselves, so it
    does not depend on the index labels being unique and no scratch column
    is written onto the frame.
    """
    customer_ids = df["customer_ID"]

    # duplicated() is False the first time a value is seen and True on every
    # repeat. Rows with a missing customer_ID are never flagged, which matches
    # a groupby-based "first row per customer" that drops missing keys.
    is_first = (~customer_ids.duplicated()) & customer_ids.notna()

    df["first_occurance"] = is_first.astype("int64")

    return df

In [10]:
# Adds the first_occurance flag; the type_1 / type_2 NaN handling reads it.
train = add_first_occurance(train)

# process nan

In [11]:
# Build "NaN clusters": groups of columns that have exactly the same number of
# missing values.
# cols: every column from the third one on (customer_ID and S_2 are skipped),
# in alphabetical order.
cols = sorted(train.columns[2:].tolist())
# nas: one row per column -- "index" holds the column name, "NA_count" its
# number of missing values.
nas = train[cols].isna().sum(axis=0).reset_index(name="NA_count")
# group_count: how many columns share that NA_count. Columns without any
# missing value are left out of the grouping and therefore get NaN here.
nas["group_count"] = nas.loc[nas.NA_count > 0].groupby("NA_count").transform("count")
# Keep only NA counts shared by more than 10 columns. The result is an array
# with one list of column names per NA_count; groupby sorts its keys by
# default, so clusters[0] belongs to the smallest NA_count.
clusters = nas.loc[nas.group_count > 10].sort_values(["NA_count","index"]).groupby("NA_count")["index"].apply(list).values

In [12]:
def process_type_0_nan(df, cluster):
    """Flag and zero-fill the rows where a NaN cluster is missing.

    ``cluster`` is a list of column names. A row counts as missing when the
    first column of the cluster is null. Those rows get ``type_0_nan = 1``
    (all others 0) and every null inside the cluster columns of those rows is
    replaced by 0. ``df`` is modified in place and returned.
    """
    missing_rows = df[cluster[0]].isnull()

    df["type_0_nan"] = missing_rows.astype("int64")

    zero_filled = df.loc[missing_rows, cluster].fillna(0)
    df.loc[missing_rows, cluster] = zero_filled

    return df

In [13]:
# Flag and zero-fill the missing rows of the first NaN cluster.
train = process_type_0_nan(train, clusters[0])

In [14]:
def process_type_1_nan(df, cluster):
    """Flag and impute the rows where a NaN cluster is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Statement-level frame containing ``customer_ID``, ``first_occurance``
        and the cluster columns. Modified in place.
    cluster : sequence of str
        Column names of the cluster. A row counts as missing when the first
        column of the cluster is null.

    Returns
    -------
    pandas.DataFrame
        ``df`` with a new ``type_1_nan`` column:

        * 0 -- the row is not missing,
        * 1 -- missing on the customer's first row; nulls are filled with 0,
        * 2 -- missing on a later row; nulls are filled with the mean of the
          previous and the next known value *of the same customer*. When only
          one of the two neighbours exists (for example on the customer's
          last row) that neighbour's value is used as is.
    """
    cluster = list(cluster)
    missing = df[cluster[0]].isnull()

    df["type_1_nan"] = 0
    df.loc[missing & (df["first_occurance"] == 1), "type_1_nan"] = 1
    df.loc[missing & (df["first_occurance"] == 0), "type_1_nan"] = 2

    # fill type_1_nan == 1 by 0
    first_rows = df["type_1_nan"] == 1
    df.loc[first_rows, cluster] = df.loc[first_rows, cluster].fillna(0)

    # fill type_1_nan == 2 by mean of t - 1 and t + 1
    # Filling inside each customer keeps a customer's trailing gap from
    # borrowing the first value of whichever customer happens to come next.
    per_customer = df.groupby("customer_ID")[cluster]
    previous_known = per_customer.ffill()
    next_known = per_customer.bfill()

    # If one neighbour is unavailable, fall back to the other so the mean
    # below reduces to the single value that is known.
    previous_side = previous_known.fillna(next_known)
    next_side = next_known.fillna(previous_known)

    later_rows = df["type_1_nan"] == 2
    df.loc[later_rows, cluster] = (
        previous_side.loc[later_rows] + next_side.loc[later_rows]
    ) / 2

    return df

In [15]:
# Flag and impute the missing rows of the second NaN cluster.
train = process_type_1_nan(train, clusters[1])

In [16]:
def process_type_2_nan(df, cluster):
    """Flag (without filling) the rows where a NaN cluster is missing.

    A row counts as missing when the first column of ``cluster`` is null.
    The new ``type_2_nan`` column is 1 when that happens on a row whose
    ``first_occurance`` is 1, 2 when it happens on a row whose
    ``first_occurance`` is 0, and 0 otherwise. The cluster values themselves
    are left untouched. ``df`` is modified in place and returned.
    """
    missing = df[cluster[0]].isnull()

    df["type_2_nan"] = 0
    # (value of first_occurance, code written to type_2_nan)
    for first_flag, code in ((1, 1), (0, 2)):
        selected = missing & (df["first_occurance"] == first_flag)
        df.loc[selected, "type_2_nan"] = code

    return df

In [17]:
# Flag the missing rows of the third NaN cluster; its values are not filled.
train = process_type_2_nan(train, clusters[2])

# add time_id

In [26]:
def add_time_id(df):
    """Number each customer's rows 0, 1, 2, ... in their current order.

    The running position is stored in a new ``time_id`` column on ``df``
    (modified in place), and the same frame is returned.
    """
    rows_by_customer = df.groupby(["customer_ID"])
    position_in_customer = rows_by_customer.cumcount()
    df["time_id"] = position_in_customer
    return df

In [None]:
# Adds the per-customer running row number (time_id) to train.
train = add_time_id(train)

# feature engineering

In [None]:
def process_and_feature_engineer(df):
    """Collapse statement-level rows into one feature row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Statement-level frame. It must contain ``customer_ID``, the four
        NaN-related columns (``number_of_observations``, ``type_0_nan``,
        ``type_1_nan``, ``type_2_nan``) and the eleven categorical columns
        listed below. ``S_2`` and ``first_occurance`` are ignored; every other
        column is treated as numeric.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID`` with, in this order:

        * the per-customer maximum of each NaN-related column plus
          ``type_2_nan_count`` (number of rows with a non-zero ``type_2_nan``),
        * ``<col>_nanmean/_nanstd/_nanmin/_nanmax/_last`` for numeric columns,
        * ``<col>_diff_nanmean/_nanstd/_nanmin/_nanmax`` for the row-to-row
          differences of numeric columns within a customer,
        * ``<col>_count/_last/_nunique`` for categorical columns.
    """
    nan_related_features = [
        "number_of_observations",
        "type_0_nan",
        "type_1_nan",
        "type_2_nan"
    ]
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    not_numeric = {"customer_ID", "S_2", "first_occurance"}
    not_numeric.update(cat_features, nan_related_features)
    num_features = [c for c in df.columns if c not in not_numeric]

    # pandas' own groupby reductions skip NaN, and "std" uses ddof=1. They are
    # requested by name and the output keeps the NumPy-style suffixes
    # (nanmean, ...) so downstream column names are unchanged. Passing the
    # NumPy callables to agg() instead relies on pandas swapping in these same
    # reductions, which recent pandas releases warn about.
    suffix_for = {"mean": "nanmean", "std": "nanstd", "min": "nanmin", "max": "nanmax"}
    summary_stats = ["mean", "std", "min", "max"]

    def flatten(columns):
        # ("P_2", "mean") -> "P_2_nanmean"; other stats keep their own name.
        return ["{}_{}".format(col, suffix_for.get(stat, stat)) for col, stat in columns]

    by_customer = df.groupby("customer_ID")

    print("process nan related features")
    nan_related = by_customer[nan_related_features].max()
    # type_2_nan is non-zero exactly on the rows flagged as missing, so
    # counting those rows gives the number of type-2 gaps per customer.
    type_2_nan_count = (
        df["type_2_nan"].ne(0).groupby(df["customer_ID"]).sum().rename("type_2_nan_count")
    )
    nan_related = pd.concat([nan_related, type_2_nan_count], axis=1)
    print("nan related features shape:", nan_related.shape)

    print("process num features")
    num_agg = by_customer[num_features].agg(summary_stats + ["last"])
    num_agg.columns = flatten(num_agg.columns)
    print("num features shape:", num_agg.shape)

    print("process num diff features")
    # diff() is a per-row transform: the result keeps df's index, and the
    # first row of every customer is NaN.
    diff = by_customer[num_features].diff()
    diff.columns = ["{}_diff".format(x) for x in diff.columns]

    num_dff_agg = diff.groupby(df["customer_ID"]).agg(summary_stats)
    num_dff_agg.columns = flatten(num_dff_agg.columns)
    print("num diff features shape:", num_dff_agg.shape)

    print("process cat features")
    cat_agg = by_customer[cat_features].agg(["count", "last", "nunique"])
    cat_agg.columns = ["_".join(x) for x in cat_agg.columns]
    print("cat features shape:", cat_agg.shape)

    # Every part is indexed by customer_ID, so concat lines them up row-wise.
    features = pd.concat([nan_related, num_agg, num_dff_agg, cat_agg], axis=1)
    print("shape after engineering", features.shape)

    return features

In [None]:
# Rebinds train from the statement-level frame to the aggregated,
# customer_ID-indexed feature table.
train = process_and_feature_engineer(train)

In [None]:
# Number of missing values in each engineered feature column.
train.isnull().sum()

In [None]:
# Preview the first rows of the aggregated feature table.
train.head()

# Add target

In [None]:
def add_target(df, labels_path="../input/train_labels.csv"):
    """Join the training labels onto a customer-indexed feature table.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table whose index holds the ``int64`` customer IDs.
    labels_path : str, optional
        CSV file with a hex-string ``customer_ID`` column and a ``target``
        column. Defaults to the location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        ``df`` with an ``int8`` ``target`` column, sorted by customer ID, with
        the former index moved back into a regular column.

    Raises
    ------
    ValueError
        If at least one row of ``df`` has no matching label.
    """
    targets = pd.read_csv(labels_path)

    def hex_to_int64(hex_id):
        # 16 hex digits span the unsigned 64-bit range. Fold the upper half
        # onto negative numbers (two's complement) so the IDs fit a signed
        # int64 and line up with the IDs used as the feature-table index.
        value = int(hex_id, 16)
        return value - (1 << 64) if value >= (1 << 63) else value

    # REDUCE DTYPE FOR CUSTOMER
    targets["customer_ID"] = (
        targets["customer_ID"].str[-16:].map(hex_to_int64).astype("int64")
    )
    targets = targets.set_index("customer_ID")

    df = df.merge(targets, left_index=True, right_index=True, how="left")

    # A left join leaves NaN where no label exists, and NaN cannot be cast to
    # int8. Report that explicitly instead of failing inside astype().
    unlabeled = df["target"].isna()
    if unlabeled.any():
        raise ValueError(
            "{} row(s) have no label in {}".format(int(unlabeled.sum()), labels_path)
        )
    df["target"] = df["target"].astype("int8")

    # Sort by customer ID so the row order (and any CV split built on it)
    # is deterministic.
    df = df.sort_index().reset_index()

    return df

In [None]:
# Joins the labels onto the feature table and turns the customer_ID index
# back into a regular column.
train = add_target(train)

In [None]:
# (rows, columns) after the labels were joined.
train.shape

In [None]:
# Preview the labelled feature table.
train.head()

# label encoding

In [None]:
def label_encoding(df):
    """Label-encode the ``<name>_last`` categorical feature columns.

    For each of the eleven base categorical names a fresh ``LabelEncoder`` is
    fitted on the matching ``<name>_last`` column and the column is
    overwritten with the encoded values. The encoders are not kept. ``df`` is
    modified in place and returned.
    """
    base_names = (
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    )

    for base in base_names:
        column = "{}_last".format(base)
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [None]:
# Label-encodes the *_last categorical columns of the feature table.
train = label_encoding(train)

# save files

In [None]:
# Persist the final feature table as parquet.
# NOTE(review): this writes into ../input, next to the source data -- confirm
# that directory is writable in the environment the notebook runs in.
train.to_parquet("../input/train_sma.parquet")