In [1]:
import pandas as pd
import numpy as np

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column (strings, int8, unsigned ints, ...) is
    left untouched. Columns are replaced on ``df`` itself, and the same object
    is returned.

    Integer columns become the first of int8/int16/int32/int64 whose limits
    contain both the column minimum and maximum. Float columns become float16
    or float32 by the same test, otherwise float64. Limits are inclusive, so a
    column whose extreme value equals a dtype limit still gets that dtype.

    NOTE: the float test checks magnitude only, not precision. float16 carries
    roughly three significant decimal digits, so downcast values may be rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; modified in place.
    verbose : bool, default True
        If true, print the final memory use in Mb and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Narrowest first. If nothing matches (e.g. min/max are NaN for an
            # empty column) the column keeps its current dtype.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max not comparable (NaN/inf).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Keep the percentage defined even if the frame reported zero bytes.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df


Mem. usage decreased to 650.48 Mb (66.8% reduction)


In [15]:
def aggregate(df):
    """Append per-card group statistics for the numeric feature columns.

    Rows are grouped by ``card1`` .. ``card6``. For every column in C1..C14,
    D1..D15, V1..V339 and id_01..id_11, the group-wise count, sum, mean, std,
    max and min are broadcast back onto each row via ``transform`` and added
    as new columns named ``<agg>_<column>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain all of the feature and key columns listed above.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns followed by the aggregate
        columns, in the order count, sum, mean, std, max, min.
    """
    numeric_cols = (
        ["C{}".format(x) for x in range(1, 14 + 1)]
        + ["D{}".format(x) for x in range(1, 15 + 1)]
        + ["V{}".format(x) for x in range(1, 339 + 1)]
        + ["id_{:02d}".format(x) for x in range(1, 11 + 1)]  # zero-padded: id_01 .. id_11
    )
    key_cols = ["card1", "card2", "card3", "card4", "card5", "card6"]
    aggs = ["count", "sum", "mean", "std", "max", "min"]

    # Build the group-by once; each statistic reuses it.
    grouped = df[numeric_cols + key_cols].groupby(key_cols)

    # Collect the pieces and concatenate a single time instead of re-copying
    # the ever-growing frame on every iteration.
    pieces = [df]
    for agg in aggs:
        print(agg)  # progress marker, one line per statistic
        df_merge = grouped.transform(agg)
        df_merge.columns = ["{}_{}".format(agg, x) for x in df_merge.columns]
        pieces.append(df_merge)
    return pd.concat(pieces, axis=1)

In [None]:
# Build the training feature file: left-join identity rows onto transactions,
# downcast, add the per-card aggregates, downcast again, then write to disk.
df_train = pd.merge(
    pd.read_csv("../data/original/train_transaction.csv"),
    pd.read_csv("../data/original/train_identity.csv"),
    how="left", on="TransactionID"
)
df_train = reduce_mem_usage(df_train)
# Every step below must operate on df_train so that train.csv really holds
# the training data (and so the cell runs on a fresh kernel).
df_train = aggregate(df_train)
df_train = reduce_mem_usage(df_train)
df_train.to_csv("train.csv")

In [18]:
# Build the test feature file: left-join identity rows onto transactions,
# then downcast -> aggregate -> downcast, and write the result to disk.
df_test = (
    pd.read_csv("../data/original/test_transaction.csv")
    .merge(
        pd.read_csv("../data/original/test_identity.csv"),
        on="TransactionID",
        how="left",
    )
    .pipe(reduce_mem_usage)
    .pipe(aggregate)
    .pipe(reduce_mem_usage)
)
df_test.to_csv("test.csv")

Mem. usage decreased to 565.37 Mb (66.3% reduction)
count
sum
mean
std
max
min
Mem. usage decreased to 3249.16 Mb (21.1% reduction)
