In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Read the train table first, then the test table, from their Feather files.
df_train, df_test = (
    pd.read_feather(feather_path)
    for feather_path in (
        "../data/original/train_all.feather",
        "../data/original/test_all.feather",
    )
)

In [3]:
# Display the TransactionDT column of the training frame; in the recorded
# output the rows shown run from 86400 up to 15811131.
df_train["TransactionDT"]

0            86400
1            86401
2            86469
3            86499
4            86506
            ...   
590535    15811047
590536    15811049
590537    15811079
590538    15811088
590539    15811131
Name: TransactionDT, Length: 590540, dtype: int32

In [4]:
# Display the TransactionDT column of the test frame; in the recorded output
# the first row shown is 18403224, i.e. above the last training value shown
# (15811131), so the displayed ranges do not overlap.
df_test["TransactionDT"]

0         18403224
1         18403263
2         18403310
3         18403310
4         18403317
            ...   
506686    34214279
506687    34214287
506688    34214326
506689    34214337
506690    34214345
Name: TransactionDT, Length: 506691, dtype: int32

In [8]:
def _get_categorical_features(df):
    """Return the names of the columns of ``df`` that are treated as categorical.

    The result is built in two passes, both in the frame's column order:

    1. every column whose dtype is not one of int16/int32/int64 or
       float16/float32/float64 (so object columns, but also e.g. int8 ones);
    2. every remaining column whose name appears in the fixed list below
       (ProductCD, card1-6, addr1-2, the e-mail domains, M1-9, the device
       columns and id_12-38), even when it is stored numerically.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    list
        Column names, dtype-selected ones first, then the name-selected ones.
    """
    # Deliberately a sequence of dtype *names*: the membership test below
    # relies on ``dtype == "name"`` comparison rather than on hashing.
    numeric_dtype_names = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')

    # Columns declared categorical by name, regardless of how they are stored.
    declared = {
        "ProductCD",
        "addr1", "addr2",
        "P_emaildomain", "R_emaildomain",
        "DeviceType", "DeviceInfo",
    }
    declared.update("card{}".format(i) for i in range(1, 7))
    declared.update("M{}".format(i) for i in range(1, 10))
    declared.update("id_{}".format(i) for i in range(12, 39))

    # Pass 1: anything not stored with one of the listed numeric dtypes.
    feats = []
    for name in df.columns:
        if df[name].dtype not in numeric_dtype_names:
            feats.append(name)

    # Pass 2: declared columns that pass 1 did not already pick up.
    by_name = [name for name in df.columns if name in declared and name not in feats]
    feats.extend(by_name)
    return feats

In [None]:
def make_hist(dfs, label, col, output_dir, xscale="log", yscale="linear", is_output_jpg=True):
    """Plot overlaid histograms of several Series together with their NaN counts.

    The figure has two stacked panels: the upper one is a 20-bin histogram
    with one bar set per Series, the lower one is a bar chart showing how many
    missing values each Series contains.

    Parameters
    ----------
    dfs : list of pandas.Series
        Series to compare (despite the name these are Series, e.g. the same
        column taken from different frames).
    label : list of str
        Legend / tick label for each entry of ``dfs``, in the same order.
    col : str
        Column name, used in the title of both panels.
    output_dir : str
        Path of the image file handed to ``savefig``; its directory must
        already exist.
    xscale, yscale : str
        Axis scales applied to the histogram panel.
    is_output_jpg : bool
        When true, the figure is written to ``output_dir`` before being shown.
    """
    fig, axs = plt.subplots(2, 1, figsize=(20, 12))
    hist_ax, nan_ax = axs[0], axs[1]

    # Missing values are reported in the lower panel, so keep them out of the
    # binning; this also makes the histogram independent of how the installed
    # matplotlib / numpy versions treat NaN.
    observed = [series.dropna() for series in dfs]
    hist_ax.hist(observed, label=label, bins=20)
    hist_ax.set_xlabel("scale:{}".format(xscale))
    hist_ax.set_ylabel("scale:{}".format(yscale))
    hist_ax.set_xscale(xscale)
    hist_ax.set_yscale(yscale)
    hist_ax.set_title(col)
    # The legend belongs to the histogram panel: it is the only one with
    # labelled artists (plt.legend() would target the NaN panel instead).
    hist_ax.legend()

    nan_counts = [series.isnull().sum() for series in dfs]
    nan_ax.bar(np.arange(len(dfs)),
               nan_counts,
               tick_label=label)
    nan_ax.set_title("{} [nan_counts]".format(col))

    if is_output_jpg:
        fig.savefig(output_dir)
    plt.show()
    # This function is called once per column; release the figure so that
    # open figures do not pile up in memory.
    plt.close(fig)

In [None]:
def make_bar(dfs, label, col, output_dir, log=True, top_num=20, is_output_jpg=True):
    """Plot grouped bars comparing category frequencies across several Series.

    Every value is counted under a string label, with missing values counted
    as ``"NAN"``.  The ``top_num`` most frequent labels of each Series are
    collected; the union of those labels forms the x axis (sorted
    alphabetically) and each Series contributes one bar per label.  Counts of
    labels outside that union are folded into an ``"other"`` bar, which is
    only added when at least one Series actually has such labels.

    Parameters
    ----------
    dfs : list of pandas.Series
        Series to compare (despite the name these are Series, e.g. the same
        column taken from different frames).
    label : list of str
        Legend label for each entry of ``dfs``, in the same order.
    col : str
        Column name, used in the figure title.
    output_dir : str
        Path of the image file handed to ``savefig``; its directory must
        already exist.
    log : bool
        Whether the y axis is logarithmic.
    top_num : int
        Number of most frequent labels taken from each Series.
    is_output_jpg : bool
        When true, the figure is written to ``output_dir`` before being shown.
    """
    counts = []
    for series in dfs:
        freq = series.value_counts(dropna=False)
        # Work on string labels throughout so numeric categories, text
        # categories and the NaN placeholder can be sorted and matched together.
        freq.index = ["NAN" if pd.isna(value) else str(value) for value in freq.index]
        # Different raw values may share a label (e.g. NaN and a literal
        # "NAN"); merge them and restore most-frequent-first order.
        freq = freq.groupby(level=0).sum().sort_values(ascending=False, kind="mergesort")
        counts.append(freq)

    # Labels to display: the union of every Series' top_num labels.
    shown = set()
    for freq in counts:
        shown.update(freq.index[:top_num])
    classes = sorted(shown)

    # Whatever is not displayed individually ends up in a single "other" bar.
    overflow = [freq[~freq.index.isin(classes)].sum() for freq in counts]
    if any(overflow) and "other" not in shown:
        classes.append("other")

    n_series = len(counts)
    width = 0.8 / n_series
    positions = np.arange(len(classes))

    fig, ax = plt.subplots(figsize=(20, 8))
    for i, (freq, extra) in enumerate(zip(counts, overflow)):
        # Align on the shared label list; labels a Series never takes count 0.
        heights = freq.reindex(classes, fill_value=0)
        if extra:
            heights.loc["other"] += extra
        # Centre each group of n_series bars on its integer tick position.
        ax.bar(positions + (i - (n_series - 1) / 2) * width,
               heights.values,
               width=width,
               log=log,
               label=label[i])

    ax.set_xticks(positions)
    ax.set_xticklabels(["c_{}".format(name) for name in classes], rotation=45)
    ax.legend()
    ax.set_title("{}_y-axis=log:{}".format(col, log))
    if is_output_jpg:
        fig.savefig(output_dir)
    plt.show()
    # This function is called once per column; release the figure so that
    # open figures do not pile up in memory.
    plt.close(fig)

In [None]:
# Plot every test-set column against the same column of the training set:
# a grouped bar chart for categorical features, histograms for the rest.
PLOT_DIR = "split_dt_visualize/train-test"
# savefig does not create missing folders, so make sure the target exists
# (otherwise a fresh Restart & Run All fails on the first figure).
os.makedirs(PLOT_DIR, exist_ok=True)

cat_feats = _get_categorical_features(df_test)
for col in df_test.columns:
    dfs = [df_train[col], df_test[col]]
    label = ["train", "test"]
    output_path = "{}/{}.jpg".format(PLOT_DIR, col)
    if col in cat_feats:
        make_bar(dfs=dfs,
                 label=label,
                 col=col,
                 top_num=20,
                 log=False,
                 output_dir=output_path)
    else:
        make_hist(dfs=dfs,
                  label=label,
                  col=col,
                  xscale="linear",
                  yscale="linear",
                  output_dir=output_path)