In [1]:
import os
import gc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# load files

In [2]:
def load_file(
    path="", 
    usecols=None
):
    """Load a statement-level parquet file and normalise it for aggregation.

    Parameters
    ----------
    path : str
        Location of the parquet file, passed straight to ``pd.read_parquet``.
    usecols : list of str, optional
        Columns to read. ``None`` (the default) reads every column. The
        selection must include ``customer_ID`` and ``S_2``, which are used below.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``customer_ID`` then ``S_2`` with a fresh 0..n-1 index,
        ``customer_ID`` as int64, ``S_2`` as datetime, and every remaining
        missing value replaced by -127.
    """
    # LOAD DATAFRAME
    # columns=None is pandas' default, so one call covers both cases.
    df = pd.read_parquet(path, columns=usecols)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Keep only the last 16 hex digits (64 bits) of the ID string.
    hex_suffix = df["customer_ID"].str[-16:]
    # Parse as unsigned 64-bit first: values >= 2**63 do not fit in int64.
    unsigned_ids = np.array([int(h, 16) for h in hex_suffix], dtype=np.uint64)
    # Reinterpret the same bits as signed int64 (two's complement), so large
    # IDs become negative explicitly instead of relying on overflow behaviour
    # inside ``astype``.
    df["customer_ID"] = unsigned_ids.view(np.int64)
    df["S_2"] = pd.to_datetime(df["S_2"])

    # SORT BY CUSTOMER AND DATE (so agg("last") works correctly)
    df = df.sort_values(["customer_ID", "S_2"]).reset_index(drop=True)

    # FILL NAN
    # -127 is the sentinel used for missing values from here on.
    df = df.fillna(-127)
    print("shape of data:", df.shape)

    return df

In [3]:
# Load the full training set (all columns, since usecols is not passed).
print("Reading train data...")
# Parquet version of the training data; path is relative to the notebook's working directory.
TRAIN_PATH = "../input/amex-data-integer-dtypes-parquet-format/train.parquet"
train = load_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [4]:
# Preview the first rows of the loaded, sorted statement-level frame.
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-9223358381327749917,2017-03-31,0.342033,9,0.298571,0.028331,0.506896,0.793958,0.0,0.823765,...,-1,-1,-1,0,0,0.0,-127.0,0,0.004787,0
1,-9223358381327749917,2017-04-07,0.340178,16,0.353684,0.026975,0.505335,0.795727,0.0,0.825231,...,-1,-1,-1,0,0,0.0,-127.0,0,0.003442,0
2,-9223358381327749917,2017-05-23,0.35601,1,0.448582,0.026601,0.50629,0.530133,0.0,0.923707,...,-1,-1,-1,0,0,0.0,-127.0,0,0.00334,0
3,-9223358381327749917,2017-06-22,0.378665,1,0.443752,0.024322,0.509069,0.539285,0.0,0.915724,...,-1,-1,-1,0,0,0.0,-127.0,0,0.007556,0
4,-9223358381327749917,2017-07-22,0.416543,1,0.463824,0.023064,0.505335,0.461935,0.0,0.919373,...,-1,-1,-1,0,0,0.0,-127.0,0,0.005299,0


In [5]:
# (rows, columns) of the statement-level frame before aggregation.
train.shape

(5531451, 190)

# feature engineering

In [6]:
def process_and_feature_engineer(df):
    """Aggregate statement-level rows into one feature row per customer.

    Three feature groups are built and concatenated column-wise:

    * numeric columns: mean, std, min, max and last value per customer;
    * row-to-row differences of the numeric columns within each customer:
      mean, std, min, max and sum per customer;
    * categorical columns: count, last value and number of distinct values.

    The reductions are requested by pandas name ("mean", "std", ...) rather
    than by passing ``np.nanmean`` / ``np.nanstd`` / ... callables. pandas
    substitutes its own reductions for those callables and has deprecated
    that substitution; naming the reductions keeps the result stable (in
    particular ``std`` stays the sample standard deviation, which is NaN for
    a customer with a single row). Output column names keep the historical
    ``nan*`` suffixes (e.g. ``P_2_nanmean``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_ID``, ``S_2`` and every categorical column
        listed below. Every other column is treated as numeric. "last" is
        taken in row order, so rows should already be in the desired order
        within each customer.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_ID``, one row per customer.
    """
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    not_numeric = {"customer_ID", "S_2", *cat_features}
    num_features = [col for col in df.columns if col not in not_numeric]

    # pandas reduction name -> suffix used in the output column name.
    level_stats = {"mean": "nanmean", "std": "nanstd", "min": "nanmin", "max": "nanmax", "last": "last"}
    diff_stats = {"mean": "nanmean", "std": "nanstd", "min": "nanmin", "max": "nanmax", "sum": "nansum"}

    def aggregate(frame, columns, stats):
        # One output column per (input column, reduction), flattened to "<column>_<suffix>".
        agg = frame.groupby("customer_ID")[columns].agg(list(stats))
        agg.columns = ["{}_{}".format(col, stats[stat]) for col, stat in agg.columns]
        return agg

    print("process num features")
    num_agg = aggregate(df, num_features, level_stats)
    print("num features shape:", num_agg.shape)

    print("process num diff features")
    # Difference to the previous row of the same customer; the first row of
    # each customer has no predecessor and becomes NaN.
    diff = df.groupby("customer_ID")[num_features].diff()
    num_diff_features = ["{}_diff".format(x) for x in diff.columns]
    diff.columns = num_diff_features
    # diff() drops the grouping key, so put it back before grouping again.
    diff = pd.concat([df["customer_ID"], diff], axis=1)

    num_dff_agg = aggregate(diff, num_diff_features, diff_stats)
    print("num diff features shape:", num_dff_agg.shape)

    print("process cat features")
    cat_agg = df.groupby("customer_ID")[cat_features].agg(["count", "last", "nunique"])
    cat_agg.columns = ["_".join(x) for x in cat_agg.columns]
    print("cat features shape:", cat_agg.shape)

    df = pd.concat([num_agg, num_dff_agg, cat_agg], axis=1)
    print("shape after engineering", df.shape )

    return df

In [7]:
# Collapse the statement-level rows into one feature row per customer.
# NOTE: this rebinds `train`, so re-running this cell alone would feed the
# already-aggregated frame back in; re-run the load cell first.
train = process_and_feature_engineer(train)

process num features
num features shape: (458913, 885)
process num diff features
num diff features shape: (458913, 885)
process cat features
cat features shape: (458913, 33)
shape after engineering (458913, 1803)


In [8]:
# Number of missing values in each engineered feature column.
train.isnull().sum()

P_2_nanmean        0
P_2_nanstd      5120
P_2_nanmin         0
P_2_nanmax         0
P_2_last           0
                ... 
D_66_last          0
D_66_nunique       0
D_68_count         0
D_68_last          0
D_68_nunique       0
Length: 1803, dtype: int64

In [9]:
# Preview the engineered features; the frame is now indexed by customer_ID.
train.head()

Unnamed: 0_level_0,P_2_nanmean,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanmean,D_39_nanstd,D_39_nanmin,D_39_nanmax,D_39_last,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223358381327749917,0.415868,0.057145,0.340178,0.498727,0.387708,2.615385,4.628507,0,16,0,...,1,13,2,1,13,-1,1,13,3,2
-9223193039457028513,0.974068,0.013094,0.964483,1.002478,1.001372,0.0,0.0,0,0,0,...,2,13,0,1,13,-1,1,13,6,1
-9223189665817919541,0.802447,0.038025,0.694073,0.828761,0.694073,0.0,0.0,0,0,0,...,1,13,0,1,13,-1,1,13,6,1
-9223188534444851899,0.791203,0.002688,0.786647,0.794826,0.787945,0.0,0.0,0,0,0,...,1,13,3,2,13,-1,1,13,5,1
-9223173911659837606,0.115666,0.078554,0.038207,0.252421,0.040486,4.384615,6.144625,0,17,13,...,1,13,0,2,13,-1,1,13,6,2


# Add target

In [10]:
def add_target(df, labels_path="../input/train_labels.csv"):
    """Attach the ``target`` label to a customer-indexed feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Features indexed by the int64 ``customer_ID`` (the index should be
        named ``customer_ID`` so it comes back as a column of that name).
    labels_path : str, optional
        CSV with a hex-string ``customer_ID`` column and a ``target`` column.
        Defaults to the location the notebook has always used.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus an int8 ``target`` column, sorted by customer ID, with the
        customer ID moved from the index into a regular column.

    Raises
    ------
    ValueError
        From the int8 cast, if some customer in ``df`` has no label (the left
        join leaves NaN there) -- presumably the desired loud failure.
    """
    # Read the ID as text so an ID made only of digits is not parsed as a number.
    targets = pd.read_csv(labels_path, dtype={"customer_ID": str})

    # REDUCE DTYPE FOR CUSTOMER
    # Same encoding as the feature frame's IDs: last 16 hex digits, parsed as
    # unsigned 64-bit, then the same bits reinterpreted as signed int64 so
    # values >= 2**63 wrap to negative numbers explicitly.
    hex_suffix = targets["customer_ID"].str[-16:]
    unsigned_ids = np.array([int(h, 16) for h in hex_suffix], dtype=np.uint64)
    targets["customer_ID"] = unsigned_ids.view(np.int64)

    targets = targets.set_index("customer_ID")

    # Left join keeps every feature row, labelled or not.
    df = df.merge(targets, left_index=True, right_index=True, how="left")
    df["target"] = df["target"].astype("int8")

    # Fix the row order so downstream CV splits are deterministic.
    df = df.sort_index().reset_index()

    return df

In [11]:
# Join the labels onto the features; `train` is rebound and customer_ID
# becomes a regular column again.
train = add_target(train)

In [12]:
# (rows, columns) after adding the customer_ID and target columns.
train.shape

(458913, 1805)

In [13]:
# Preview the labelled feature table.
train.head()

Unnamed: 0,customer_ID,P_2_nanmean,P_2_nanstd,P_2_nanmin,P_2_nanmax,P_2_last,D_39_nanmean,D_39_nanstd,D_39_nanmin,D_39_nanmax,...,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique,target
0,-9223358381327749917,0.415868,0.057145,0.340178,0.498727,0.387708,2.615385,4.628507,0,16,...,13,2,1,13,-1,1,13,3,2,1
1,-9223193039457028513,0.974068,0.013094,0.964483,1.002478,1.001372,0.0,0.0,0,0,...,13,0,1,13,-1,1,13,6,1,0
2,-9223189665817919541,0.802447,0.038025,0.694073,0.828761,0.694073,0.0,0.0,0,0,...,13,0,1,13,-1,1,13,6,1,0
3,-9223188534444851899,0.791203,0.002688,0.786647,0.794826,0.787945,0.0,0.0,0,0,...,13,3,2,13,-1,1,13,5,1,0
4,-9223173911659837606,0.115666,0.078554,0.038207,0.252421,0.040486,4.384615,6.144625,0,17,...,13,0,2,13,-1,1,13,6,2,1


# label encoding

In [14]:
def label_encoding(df):
    """Integer-encode the ``<feature>_last`` categorical columns.

    Each column gets its own freshly fitted ``LabelEncoder`` and is
    overwritten with the encoded values. ``df`` is modified in place and the
    same object is returned.
    """
    base_names = (
        "B_30", "B_38",
        "D_114", "D_116", "D_117", "D_120", "D_126",
        "D_63", "D_64", "D_66", "D_68",
    )

    for base in base_names:
        # Only the per-customer "last" value of each categorical is encoded.
        column = f"{base}_last"
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

In [15]:
# Integer-encode the *_last categorical columns (modifies `train` in place).
train = label_encoding(train)

# save files

In [16]:
# Persist the final training table as parquet.
# NOTE(review): this writes into ../input, next to the raw inputs -- confirm
# that directory is writable and that an output location isn't preferable.
train.to_parquet("../input/train.parquet")