In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from joblib import load, dump

# load files

In [2]:
def load_file(path="", usecols=None):
    """Read a parquet file and prepare it for per-customer time-series work.

    Parameters
    ----------
    path : str
        Location of the parquet file; handed straight to ``pd.read_parquet``.
    usecols : list of str, optional
        Columns to read. ``None`` (the default) reads every column. When given
        it must contain ``customer_ID`` and ``S_2``, both of which are
        processed below.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``customer_ID`` converted to int64, ``S_2``
        converted to datetime, rows sorted by ``(customer_ID, S_2)`` and a
        fresh ``0..n-1`` index.

    Notes
    -----
    Prints the shape of the resulting frame as a side effect.
    """
    # LOAD DATAFRAME
    # columns=None makes read_parquet load everything, so no branch is needed.
    df = pd.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Keep the last 16 hex digits (64 bits) of the id and parse them as an
    # unsigned integer.
    hex_suffix = df["customer_ID"].str[-16:]
    as_uint64 = np.array([int(digits, 16) for digits in hex_suffix], dtype="uint64")
    # Reinterpret the same 64 bits as signed int64: ids >= 2**63 deliberately
    # wrap to negative numbers. Doing this explicitly avoids depending on how
    # pandas infers a dtype for Python ints that do not fit in int64.
    df["customer_ID"] = as_uint64.view("int64")
    df["S_2"] = pd.to_datetime(df["S_2"])

    # SORT BY CUSTOMER AND DATE (so agg("last") works correctly)
    df = df.sort_values(["customer_ID", "S_2"]).reset_index(drop=True)

    # REPORT SIZE
    print("shape of data:", df.shape)

    return df

In [4]:
print("Reading data...")

# Locations of the two input parquet files.
TRAIN_PATH = "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
TEST_PATH = "../input/amex-data-integer-dtypes-parquet-format/test.parquet"

# Train is loaded before test; each call prints the shape of its frame.
train = load_file(TRAIN_PATH)
test = load_file(TEST_PATH)

Reading data...
shape of data: (5531451, 190)
shape of data: (11363762, 190)


In [5]:
# Names of the signed-integer columns of the training frame, leaving out the
# customer key, saved with joblib so later cells can reload the list.
int_cols = [
    col
    for col in train.select_dtypes(include=[np.int8, np.int16, np.int32, np.int64]).columns
    if col != "customer_ID"
]
dump(int_cols, "int_cols.pkl")

['int_cols.pkl']

In [6]:
# Frequency of each distinct value of D_39 in the training frame.
train["D_39"].value_counts()

0      3053463
1       476208
3       103975
2       102479
4       101466
        ...   
178          1
176          1
149          1
142          1
172          1
Name: D_39, Length: 180, dtype: int64

In [7]:
# revert to nan
# Every cell equal to -1, in any column of either frame, is overwritten with
# NaN in place.
# NOTE(review): the comparison covers all columns, not only the integer ones —
# confirm that -1 is never a legitimate value anywhere.
for frame in (train, test):
    frame[frame == -1] = np.nan

In [8]:
# Preview the first rows of the training frame after the -1 -> NaN replacement.
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-9223358381327749917,2017-03-31,0.342033,9,0.298571,0.028331,0.506896,0.793958,0.0,0.823765,...,,,,0.0,0.0,0.0,,0.0,0.004787,0.0
1,-9223358381327749917,2017-04-07,0.340178,16,0.353684,0.026975,0.505335,0.795727,0.0,0.825231,...,,,,0.0,0.0,0.0,,0.0,0.003442,0.0
2,-9223358381327749917,2017-05-23,0.35601,1,0.448582,0.026601,0.50629,0.530133,0.0,0.923707,...,,,,0.0,0.0,0.0,,0.0,0.00334,0.0
3,-9223358381327749917,2017-06-22,0.378665,1,0.443752,0.024322,0.509069,0.539285,0.0,0.915724,...,,,,0.0,0.0,0.0,,0.0,0.007556,0.0
4,-9223358381327749917,2017-07-22,0.416543,1,0.463824,0.023064,0.505335,0.461935,0.0,0.919373,...,,,,0.0,0.0,0.0,,0.0,0.005299,0.0


In [9]:
# (rows, columns) of the training frame.
train.shape

(5531451, 190)

# add number of observations

In [10]:
def add_observation(df):
    """Add ``number_of_observations``: how many rows each customer has.

    The column is written onto ``df`` itself, and the same object is returned
    so the call can be used in an assignment.
    """
    ids_by_customer = df.groupby("customer_ID")["customer_ID"]
    # transform broadcasts the per-customer count back onto every row.
    df["number_of_observations"] = ids_by_customer.transform("count")
    return df

In [11]:
# Attach the per-customer row count to both frames (the helper writes the
# column onto the frame it receives and returns that same frame).
train = add_observation(train)
test = add_observation(test)

# add first occurrence flag

In [12]:
def add_first_occurance(df):
    """Add ``first_occurance``: 1 on each customer's first row, 0 elsewhere.

    "First" means first in the current row order of ``df``, so the frame is
    expected to be sorted by date within each customer already.

    The flag is derived from row position, not from index labels, so it is
    correct even when index labels repeat. No scratch column is created, which
    means an existing column called ``"index"`` is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``customer_ID`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the int64 column ``first_occurance`` added.
    """
    customer = df["customer_ID"]
    # duplicated() is False exactly on the first appearance of each id.
    # Rows with a null id are never flagged, matching groupby's habit of
    # dropping null keys.
    is_first = ~customer.duplicated(keep="first") & customer.notna()
    # Assign the raw array so nothing is re-aligned on index labels.
    df["first_occurance"] = is_first.astype("int64").to_numpy()

    return df

In [13]:
# Flag each customer's first row in both frames.
train = add_first_occurance(train)
test = add_first_occurance(test)

# process nan

In [14]:
# get nan clusters first
# Every column from the third position onward, sorted by name.
# NOTE(review): presumably this skips customer_ID and S_2 — confirm the column order.
cols = sorted(train.columns[2:].tolist())
# One row per column: its name (in "index", produced by reset_index) and its NaN count.
nas = train[cols].isna().sum(axis=0).reset_index(name="NA_count")
# For columns with at least one NaN: how many columns share exactly the same
# NaN count. Columns without NaN are not in the groupby, so they get NaN here.
nas["group_count"] = nas.loc[nas.NA_count > 0].groupby("NA_count").transform("count")
# Keep NaN counts shared by more than 10 columns; the result is a NumPy array
# of column-name lists, one list per NaN count, ordered by increasing NaN count.
clusters = nas.loc[nas.group_count > 10].sort_values(["NA_count","index"]).groupby("NA_count")["index"].apply(list).values

In [15]:
# Column names in the cluster with the smallest shared NaN count.
clusters[0]

['B_16',
 'B_19',
 'B_2',
 'B_20',
 'B_22',
 'B_26',
 'B_27',
 'B_3',
 'B_30',
 'B_33',
 'B_38',
 'D_41',
 'D_54']

In [16]:
# List the columns of the third NaN cluster that are integer-typed.
for col in clusters[2]:
    if col not in int_cols:
        continue
    print(col)

D_113
D_114
D_116
D_117
D_120
D_122
D_123
D_124
D_125


In [17]:
def process_type_0_nan(df, cluster):
    """Zero-fill a NaN cluster for every customer that is missing its first column.

    A customer is selected when at least one of its rows has NaN in
    ``cluster[0]``. For those customers, NaN in any of the ``cluster`` columns
    (on any row) becomes 0. Other customers are left untouched. ``df`` is
    modified in place and returned.
    """
    anchor_is_null = df[cluster[0]].isnull()
    affected_ids = df.loc[anchor_is_null, "customer_ID"].unique().tolist()

    # All rows of the selected customers, not just the rows with NaN.
    affected_rows = df["customer_ID"].isin(affected_ids)
    zero_filled = df.loc[affected_rows, cluster].fillna(0)
    df.loc[affected_rows, cluster] = zero_filled

    return df

In [18]:
# train = process_type_0_nan(train, clusters[0])
# test = process_type_0_nan(test, clusters[0])

In [19]:
def process_type_1_nan(df, cluster):
    """Impute a NaN cluster: zeros for late starters, neighbour mean for gaps.

    Customers are classified by where ``cluster[0]`` is NaN:

    * group 1 — NaN on the customer's first row (``first_occurance == 1``):
      every NaN in the cluster columns of that customer becomes 0;
    * group 0 — NaN on a later row (``first_occurance == 0``): each cluster
      value becomes the mean of the previous and the next observed value *of
      the same customer*.

    Forward/backward fills are done per customer, so a value is never borrowed
    from the neighbouring customer in the frame. A gap with no later
    observation for that customer therefore stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``customer_ID``, ``first_occurance`` and the ``cluster``
        columns; rows are expected to be in time order within each customer.
    cluster : list of str
        Columns that share the missing-value pattern; ``cluster[0]`` is used
        to detect it.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, modified in place.
    """
    cluster = list(cluster)
    anchor_missing = df[cluster[0]].isnull()

    group_0_ids = df.loc[anchor_missing & (df["first_occurance"] == 0), "customer_ID"].unique()
    group_1_ids = df.loc[anchor_missing & (df["first_occurance"] == 1), "customer_ID"].unique()

    # fill group 1 by 0
    in_group_1 = df["customer_ID"].isin(group_1_ids)
    df.loc[in_group_1, cluster] = df.loc[in_group_1, cluster].fillna(0)

    # fill group 0 by mean of t - 1 and t + 1, looking only inside each
    # customer's own rows (computed after the zero fill above, as before).
    per_customer = df.groupby("customer_ID")[cluster]
    previous_value = per_customer.ffill()
    next_value = per_customer.bfill()

    in_group_0 = df["customer_ID"].isin(group_0_ids)
    # On observed rows both fills equal the value itself, so it is unchanged.
    df.loc[in_group_0, cluster] = (
        previous_value.loc[in_group_0, cluster] + next_value.loc[in_group_0, cluster]
    ) / 2

    return df

In [20]:
# train = process_type_1_nan(train, clusters[1])
# test = process_type_1_nan(test, clusters[1])

In [22]:
def process_type_2_nan(df, cluster):
    """Impute a NaN cluster: back-fill late starters, neighbour mean for gaps.

    Customers are classified by where ``cluster[0]`` is NaN:

    * group 1 — NaN on the customer's first row (``first_occurance == 1``):
      cluster values are back-filled from the customer's next observation,
      and anything still missing becomes 0;
    * group 0 — NaN on a later row (``first_occurance == 0``): a gap with an
      observation on both sides becomes the mean of the previous and next
      observed value of the same customer.

    All fills are done per customer, so nothing is borrowed from the
    neighbouring customer in the frame. For a customer in both groups, the
    group-0 step only writes cells where a two-sided mean exists, so it no
    longer overwrites the leading values produced by the group-1 step.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``customer_ID``, ``first_occurance`` and the ``cluster``
        columns; rows are expected to be in time order within each customer.
    cluster : list of str
        Columns that share the missing-value pattern; ``cluster[0]`` is used
        to detect it.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, modified in place.
    """
    cluster = list(cluster)
    anchor_missing = df[cluster[0]].isnull()

    group_0_ids = df.loc[anchor_missing & (df["first_occurance"] == 0), "customer_ID"].unique()
    group_1_ids = df.loc[anchor_missing & (df["first_occurance"] == 1), "customer_ID"].unique()

    # Per-customer fills, taken from the data as it is before any imputation.
    per_customer = df.groupby("customer_ID")[cluster]
    previous_value = per_customer.ffill()
    next_value = per_customer.bfill()

    in_group_1 = df["customer_ID"].isin(group_1_ids)
    in_group_0 = df["customer_ID"].isin(group_0_ids)

    # fill group 1 by bfill
    df.loc[in_group_1, cluster] = next_value.loc[in_group_1, cluster]

    # fill what is still missing in group 1 by 0
    df.loc[in_group_1, cluster] = df.loc[in_group_1, cluster].fillna(0)

    # fill group 0 by mean of ffill and bfill; where that mean is undefined
    # (no observation on one side) keep whatever the cell holds now.
    neighbour_mean = (previous_value + next_value) / 2
    df.loc[in_group_0, cluster] = neighbour_mean.loc[in_group_0, cluster].fillna(
        df.loc[in_group_0, cluster]
    )

    return df

In [23]:
# train = process_type_2_nan(train, clusters[2])
# test = process_type_2_nan(test, clusters[2])

# add time id

In [24]:
def add_time_id(df):
    """Add ``time_id``: the 0-based position of each row within its customer.

    Numbering follows the current row order of ``df``. The column is written
    onto ``df`` itself, which is also returned.
    """
    row_position = df.groupby("customer_ID").cumcount()
    df["time_id"] = row_position
    return df

In [25]:
# Number each customer's rows 0, 1, 2, ... in both frames.
train = add_time_id(train)
test = add_time_id(test)

# add end_year_month

In [26]:
def add_end_year_month(df):
    """Add ``end_year_month``: the monthly period of each customer's last row.

    ``S_2`` is converted to a monthly period and, per customer, the last value
    in the current row order is copied onto every row of that customer. The
    column is written onto ``df`` itself, which is also returned.
    """
    df["end_year_month"] = df["S_2"].dt.to_period("M")
    # Group positionally by the customer key and take the last period per customer.
    customer_keys = df["customer_ID"].to_numpy()
    last_month = df["end_year_month"].groupby(customer_keys).transform("last")
    df["end_year_month"] = last_month
    return df

In [27]:
# Tag every row with the month of its customer's last statement, in both frames.
train = add_end_year_month(train)
test = add_end_year_month(test)

In [28]:
# Number of test rows per final statement month.
test["end_year_month"].value_counts()

2019-04    5719469
2019-10    5644293
Freq: M, Name: end_year_month, dtype: int64

In [30]:
# train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_fillna.parquet")
# test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_fillna.parquet")

In [32]:
# Checkpoint: write both frames, with the columns added so far, as *_base.parquet.
train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_base.parquet")
test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_base.parquet")

# feature adjustment

In [33]:
# Reload the integer-column list that was saved earlier with joblib's dump.
# joblib.load unpickles the file, so only load files this notebook produced.
int_cols = load("int_cols.pkl")

In [None]:
# train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train_fillna.parquet")
# test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test_fillna.parquet")

In [58]:
# Start the feature-adjustment stage from the *_base.parquet checkpoint files.
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train_base.parquet")
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test_base.parquet")

In [59]:
def _drop_int_cols(features):
    """Return ``features`` without the names listed in the global ``int_cols``."""
    return [feature for feature in features if feature not in int_cols]


# Features whose test values are re-centred on the train mean (not filtered by dtype).
shift_features = [
    "D_42", "D_52", "D_59", "D_79", "D_93", "D_105", "D_116",
    "D_122", "D_130", "D_133", "D_142", "S_11", "B_36",
]

# The four outlier lists below keep only non-integer features.
outlier_features = _drop_int_cols(["D_106", "S_23", "B_10"])

test_base_outlier_features = _drop_int_cols(["D_102", "D_109", "D_144", "B_6", "B_40"])

test_public_base_outlier_features = _drop_int_cols(["D_69"])

test_private_base_outlier_features = _drop_int_cols(["S_18"])

In [60]:
# The test frame is ordered by customer id, so its first and last rows can
# fall in the same period; taking min/max guarantees two distinct periods.
# NOTE(review): the earlier period is labelled "public" and the later one
# "private" — confirm this matches the competition split.
test_public_end_year_month = test["end_year_month"].min()
test_private_end_year_month = test["end_year_month"].max()
assert test_public_end_year_month != test_private_end_year_month, \
    "expected two distinct end_year_month periods in the test set"

In [61]:
# List the shift features that are integer-typed.
for col in shift_features:
    if col not in int_cols:
        continue
    print(col)

D_59
D_79
D_93
D_116
D_122
S_11


In [62]:
# shift features
# For each test period separately: subtract that period's own column means
# (NaN ignored) and add the train column means.
train_shift_mean = np.nanmean(train[shift_features], axis=0)

for period in (test_public_end_year_month, test_private_end_year_month):
    period_rows = test["end_year_month"] == period
    test.loc[period_rows, shift_features] = (
        test.loc[period_rows, shift_features]
        - np.nanmean(test.loc[period_rows, shift_features], axis=0)
        + train_shift_mean
    )

In [None]:
# # int shift_features, floor
# int_shift_features = [feature for feature in shift_features if feature in int_cols]

# test[int_shift_features] = test[int_shift_features].fillna(-100)
# test[int_shift_features] = np.floor(test[int_shift_features]).astype(int)

# test[test==-100] = np.nan

In [None]:
# # float shift_features
# float_shift_features = [feature for feature in shift_features if feature not in int_cols]

# test.loc[test["end_year_month"] == test_public_end_year_month, float_shift_features] = \
#     test.loc[test["end_year_month"] == test_public_end_year_month, float_shift_features] - \
#     np.nanmean(test.loc[test["end_year_month"] == test_public_end_year_month, float_shift_features], axis=0) + \
#     np.nanmean(train[float_shift_features], axis=0)

# test.loc[test["end_year_month"] == test_private_end_year_month, float_shift_features] = \
#     test.loc[test["end_year_month"] == test_private_end_year_month, float_shift_features] - \
#     np.nanmean(test.loc[test["end_year_month"] == test_private_end_year_month, float_shift_features], axis=0) + \
#     np.nanmean(train[float_shift_features], axis=0)

# # int shift_features
# int_shift_features = [feature for feature in shift_features if feature in int_cols]

# test.loc[test["end_year_month"] == test_public_end_year_month, int_shift_features] = \
#     test.loc[test["end_year_month"] == test_public_end_year_month, int_shift_features] - \
#     np.floor(np.nanmean(test.loc[test["end_year_month"] == test_public_end_year_month, int_shift_features], axis=0)).astype(int) + \
#     np.floor(np.nanmean(train[int_shift_features], axis=0)).astype(int)

# test.loc[test["end_year_month"] == test_private_end_year_month, int_shift_features] = \
#     test.loc[test["end_year_month"] == test_private_end_year_month, int_shift_features] - \
#     np.floor(np.nanmean(test.loc[test["end_year_month"] == test_private_end_year_month, int_shift_features], axis=0)).astype(int) + \
#     np.floor(np.nanmean(train[int_shift_features], axis=0)).astype(int)

In [63]:
# Checkpoint: write both frames after the mean shift as *_shifted.parquet
# (train is written too, although only test was modified by the shift).
train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_shifted.parquet")
test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_shifted.parquet")

In [None]:
# train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train_shifted.parquet")
# test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test_shifted.parquet")

In [38]:
# test_public = test.loc[test["end_year_month"] == test_public_end_year_month]
# test_private = test.loc[test["end_year_month"] == test_private_end_year_month]

In [39]:
# # outlier_features
# all_data = pd.concat([train, test], axis=0)

# outlier_features_mean, outlier_features_std = np.nanmean(all_data[outlier_features], axis=0), np.nanstd(all_data[outlier_features], axis=0)

# train[outlier_features] = np.clip(train[outlier_features], 
#                                   outlier_features_mean - 3 * outlier_features_std, 
#                                   outlier_features_mean + 3 * outlier_features_std
#                                  )
# test[outlier_features] = np.clip(test[outlier_features], 
#                                  outlier_features_mean - 3 * outlier_features_std, 
#                                  outlier_features_mean + 3 * outlier_features_std
#                                 )

In [42]:
# # test_base_outlier_features
# test_base_outlier_features_mean, test_base_outlier_features_std = \
#     np.nanmean(test[test_base_outlier_features], axis=0), np.nanstd(test[test_base_outlier_features], axis=0)

# train[test_base_outlier_features] = np.clip(train[test_base_outlier_features], 
#                                   test_base_outlier_features_mean - 3 * test_base_outlier_features_std, 
#                                   test_base_outlier_features_mean + 3 * test_base_outlier_features_std
#                                  )
# test[test_base_outlier_features] = np.clip(test[test_base_outlier_features], 
#                                  test_base_outlier_features_mean - 3 * test_base_outlier_features_std, 
#                                  test_base_outlier_features_mean + 3 * test_base_outlier_features_std
#                                 )

In [43]:
# # test_public_base_outlier_features
# test_public_base_outlier_features_mean, test_public_base_outlier_features_std = \
#     np.nanmean(test_public[test_public_base_outlier_features], axis=0), np.nanstd(test_public[test_public_base_outlier_features], axis=0)

# train[test_public_base_outlier_features] = np.clip(train[test_public_base_outlier_features], 
#                                   test_public_base_outlier_features_mean - 3 * test_public_base_outlier_features_std, 
#                                   test_public_base_outlier_features_mean + 3 * test_public_base_outlier_features_std
#                                  )
# test[test_public_base_outlier_features] = np.clip(test[test_public_base_outlier_features], 
#                                  test_public_base_outlier_features_mean - 3 * test_public_base_outlier_features_std, 
#                                  test_public_base_outlier_features_mean + 3 * test_public_base_outlier_features_std
#                                 )

In [44]:
# # test_private_base_outlier_features
# test_private_base_outlier_features_mean, test_private_base_outlier_features_std = \
#     np.nanmean(test_private[test_private_base_outlier_features], axis=0), np.nanstd(test_private[test_private_base_outlier_features], axis=0)

# train[test_private_base_outlier_features] = np.clip(train[test_private_base_outlier_features], 
#                                   test_private_base_outlier_features_mean - 3 * test_private_base_outlier_features_std, 
#                                   test_private_base_outlier_features_mean + 3 * test_private_base_outlier_features_std
#                                  )
# test[test_private_base_outlier_features] = np.clip(test[test_private_base_outlier_features], 
#                                  test_private_base_outlier_features_mean - 3 * test_private_base_outlier_features_std, 
#                                  test_private_base_outlier_features_mean + 3 * test_private_base_outlier_features_std
#                                 )

In [45]:
# test_base_outlier_features_mean[1], test_base_outlier_features_std[1]

(0.051987506, 0.18215398)

In [46]:
# train.to_parquet("../input/amex-data-integer-dtypes-parquet-format/train_clipped.parquet")
# test.to_parquet("../input/amex-data-integer-dtypes-parquet-format/test_clipped.parquet")