In [8]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from statistics import mode
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

In [58]:
# Load the raw training table; later cells read its 'fraud_ind', 'cano',
# 'locdt', 'loctm' and 'conam' columns (among others).
raw_data = pd.read_csv("../data/train.csv")

In [60]:
def down_sampling(raw_data, min_history=17, drop_frac=0.3, n_clusters=5,
                  sampled_clusters=(0, 1), sample_frac=0.8, random_state=None):
    """Thin out non-fraud transactions while keeping every fraud row.

    Steps:
      1. Sort by ('locdt', 'loctm'). For every card ('cano') that has more than
         ``min_history`` non-fraud transactions, drop its earliest
         ``int(count * drop_frac)`` non-fraud transactions.
      2. Cluster cards with KMeans on their mean 'conam' (computed over all
         rows, fraud included) and keep only a random ``sample_frac`` of the
         remaining non-fraud rows belonging to the clusters listed in
         ``sampled_clusters``; other clusters are kept in full.
      3. Append all fraud rows.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain 'locdt', 'loctm', 'fraud_ind', 'cano' and 'conam'.
    min_history : int, default 17
        Cards with more non-fraud transactions than this get truncated.
    drop_frac : float, default 0.3
        Fraction of a long card's earliest non-fraud transactions to drop.
    n_clusters : int, default 5
        Number of KMeans clusters.
    sampled_clusters : iterable of int, default (0, 1)
        Cluster ids whose rows are randomly sub-sampled.
    sample_frac : float, default 0.8
        Fraction kept inside each sampled cluster.
    random_state : int or None, default None
        Seed forwarded to KMeans and ``DataFrame.sample``. ``None`` keeps the
        original non-reproducible behaviour.

    Returns
    -------
    pandas.DataFrame
        Non-fraud rows cluster by cluster (ids ascending), then fraud rows.

    Notes
    -----
    NOTE(review): KMeans cluster ids are arbitrary, so "cluster 0/1" does not
    designate any particular spending level; which cards get thinned can
    change from run to run unless ``random_state`` is fixed.
    """
    df = raw_data.sort_values(['locdt', 'loctm'])

    fraud = df[df["fraud_ind"] == 1]
    non_fraud = df[df["fraud_ind"] == 0]

    # Step 1 (vectorised replacement for the per-row iterrows() loop, so no
    # progress bar is needed): rank each transaction within its card and compare
    # that rank against the number of rows to discard for that card.
    per_card = non_fraud.groupby("cano", sort=False)
    position = per_card.cumcount()                 # 0-based chronological rank
    history = per_card["cano"].transform("size")   # non-fraud tx count of the card
    n_dropped = (history * drop_frac).astype(int)  # truncates like int(len * frac)
    keep = (history <= min_history) | (position >= n_dropped)

    # Emit rows card by card (cards in order of first appearance, rows
    # chronological inside a card); the stable sort keeps the inner order.
    card_order = per_card.ngroup()[keep]
    sample_df = non_fraud[keep].iloc[card_order.argsort(kind="mergesort").to_numpy()]

    # Step 2: cluster cards by their mean transaction amount.
    card_mean = df.loc[:, ['cano', 'conam']].groupby('cano').mean()
    kmeans = KMeans(n_clusters=n_clusters, max_iter=1000,
                    random_state=random_state).fit(card_mean)
    card_mean["cluster"] = kmeans.labels_

    parts = []
    for cluster_id in range(n_clusters):
        cards = card_mean.index[card_mean["cluster"] == cluster_id]
        part = sample_df[sample_df["cano"].isin(cards)]
        if cluster_id in sampled_clusters:
            part = part.sample(frac=sample_frac, random_state=random_state)
        parts.append(part)

    # axis must be passed by keyword: pandas 2.x rejects the positional form.
    return pd.concat([*parts, fraud], axis=0)

# NOTE: this rebinds `raw_data` to the down-sampled frame, so re-running this
# cell without re-running the load cell down-samples the reduced data again.
raw_data = down_sampling(raw_data)

0it [00:00, ?it/s]

  0%|          | 0/128439 [00:00<?, ?it/s]

In [27]:
def preprocessing(raw_data, max_categories=200):
    """Standardise continuous columns and one-hot encode discrete ones.

    A column with fewer than ``max_categories`` distinct values is treated as
    discrete; any other column is treated as continuous, unless every
    non-missing value is unique (diversity 100%), in which case it is dropped.
    A short summary of each column is printed while classifying.

    NOTE: a later cell of this notebook defines another function with the same
    name, which shadows this one once that cell has been executed.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Feature columns only (the caller keeps the label aside).
    max_categories : int, default 200
        Distinct-value threshold separating discrete from continuous columns.

    Returns
    -------
    (pandas.DataFrame, dict)
        The transformed frame (continuous columns first, then one indicator
        column per discrete category, named ``<column>_<category>``) on the
        input index, and a dict with the fitted objects under the keys
        ``"les"`` (column -> LabelEncoder), ``"scaler"`` and ``"ohe"``.
    """
    df = raw_data

    col_names_cont = []  # numeric columns to standardise
    col_names_disc = []  # categorical columns to one-hot encode
    for c in df.keys():
        uni = df[c].unique()
        n_na = pd.isna(df[c]).sum()  # number of missing values
        if len(uni) < max_categories:
            print(f"{c}: uni={uni}")
            col_names_disc.append(c)
            continue

        info = [df[c].max(), df[c].min(), df[c].mean(), df[c].std()]
        info = [round(x, 2) for x in info]
        # unique() reports NaN as one extra value; discount it.
        offset = 1 if n_na else 0
        diversity = (len(uni) - offset) / (len(df) - n_na)
        print(f"{c}: dtype={df[c].dtype}, n_na={n_na}")
        print("       max={}, min={}, mean={}, std={}, diversity={:.2f}%".format(*info, diversity*100 ) )
        if diversity == 1.0:
            print(f"       Delete col [{c}] due to diversity is 100% ")
        else:
            col_names_cont.append(c)

    # Continuous part. Missing values are deliberately NOT replaced by the
    # string "NA" here: that would turn the column into strings the scaler
    # cannot parse. scikit-learn's StandardScaler skips NaN while fitting and
    # leaves it in place when transforming.
    scaler = StandardScaler()
    X_cont = scaler.fit_transform(df[col_names_cont])
    df_cont = pd.DataFrame(data=X_cont, index=df.index, columns=col_names_cont)

    # Discrete part: missing values become their own "NA" category.
    df_disc = df[col_names_disc].fillna("NA")
    les = {}
    for c in col_names_disc:
        le = LabelEncoder()
        # Plain column assignment replaces the column (and its dtype) with the
        # integer codes; `.loc[:, c] = ...` may keep the old object dtype.
        df_disc[c] = le.fit_transform(df_disc[c])
        les[c] = le

    try:
        ohe = OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
    except TypeError:
        ohe = OneHotEncoder(sparse=False)  # older scikit-learn releases
    X_disc = ohe.fit_transform(df_disc)

    # Label codes are 0..k-1 in the order of `classes_`, so the one-hot
    # columns of each feature line up with that feature's classes.
    new_col_names_disc = []
    for c in col_names_disc:
        new_col_names_disc += [c + '_' + str(cl) for cl in les[c].classes_]
    assert len(new_col_names_disc) == X_disc.shape[1]

    df_disc = pd.DataFrame(data=X_disc, index=df.index, columns=new_col_names_disc)

    # axis must be passed by keyword: pandas 2.x rejects the positional form.
    return pd.concat((df_cont, df_disc), axis=1), {
        "les": les,
        "scaler": scaler,
        "ohe": ohe,
    }

# Feature columns handed to preprocessing(); the label column is kept aside
# and re-attached once the features have been transformed.
important_cols = ["acqic", "bacno", "cano", "conam", "contp", "ecfg", "flg_3dsmk", "hcefg", "insfg", "mcc", "mchno", "scity", "stocn"]
label = raw_data["fraud_ind"]
data, preprocessor = preprocessing(raw_data[important_cols])
# Column assignment aligns on the index, which preprocessing() carries over
# from its input frame.
data["fraud_ind"] = label

acqic: dtype=int64, n_na=0
       max=6884, min=0, mean=5994.93, std=1530.54, diversity=0.49%
bacno: dtype=int64, n_na=0
       max=163884, min=1, mean=82092.66, std=47337.04, diversity=8.31%
cano: dtype=int64, n_na=0
       max=213334, min=0, mean=108870.22, std=60988.91, diversity=11.26%
conam: dtype=float64, n_na=0
       max=7208.77, min=0.0, mean=658.0, std=412.02, diversity=5.45%
contp: uni=[5 2 1 4 6 3 0]
ecfg: uni=['N' 'Y']
flg_3dsmk: uni=['N' 'Y' nan]
hcefg: uni=[5 0 1 2 6 9 8 7 3]
insfg: uni=['N' 'Y']
mcc: dtype=int64, n_na=0
       max=459, min=0, mean=298.48, std=78.09, diversity=0.04%
mchno: dtype=int64, n_na=0
       max=103307, min=0, mean=56004.14, std=30785.11, diversity=6.88%
scity: dtype=int64, n_na=0
       max=6671, min=0, mean=4751.29, std=1980.53, diversity=0.44%
stocn: uni=[102  46  20  56 104  38  42  52  44  93  27  75  32   6  92  16  55  78
  49  36  98  62   5  25  68  34   1  45  73  61   2  76  26  60  67 101
  17  94  50  14  10  72  54  83  85  48  39 1

In [73]:
import numpy as np

def preprocessing(raw_data):
    """Label-encode, standardise and min-max scale every feature column.

    The 'fraud_ind' and 'txkey' columns are removed from the features;
    'fraud_ind' is re-attached, unscaled, as the last column of the result.
    Non-numeric columns get their missing values replaced by the string "NA"
    and are label-encoded; numeric columns (any numeric dtype, not only
    int64/float64) are passed to the scalers as they are.

    NOTE: this definition shadows the earlier `preprocessing` defined in a
    previous cell of this notebook.

    Parameters
    ----------
    raw_data : pandas.DataFrame
        Must contain the 'fraud_ind' and 'txkey' columns.

    Returns
    -------
    (pandas.DataFrame, dict)
        The scaled frame on the input index, and a dict with the fitted
        objects under the keys ``"scaler"`` (StandardScaler), ``"les"``
        (column -> LabelEncoder) and ``"mms"`` (MinMaxScaler).
    """
    labels = raw_data["fraud_ind"]
    # `columns=` instead of a positional axis: pandas 2.x rejects drop(labels, 1).
    features = raw_data.drop(columns=["fraud_ind", "txkey"])

    les = {}
    for c in features.columns:
        if pd.api.types.is_numeric_dtype(features[c]):
            continue
        le = LabelEncoder()
        # Only non-numeric columns get the "NA" placeholder, so numeric columns
        # are never turned into mixed number/string columns.
        features[c] = le.fit_transform(features[c].fillna("NA"))
        les[c] = le

    scaler = StandardScaler()
    standardised = scaler.fit_transform(features)

    # The min-max scaler is fitted on the standardised values, so inverting
    # the pipeline requires both fitted objects.
    min_max_scaler = MinMaxScaler()
    scaled = min_max_scaler.fit_transform(standardised)

    df = pd.DataFrame(data=scaled, index=features.index, columns=features.columns)
    df["fraud_ind"] = labels

    return df, {
        "scaler": scaler,
        "les": les,
        "mms": min_max_scaler,
    }

# NOTE: despite its name, `scaler` receives the whole dict of fitted objects
# returned by preprocessing() (keys "scaler", "les", "mms"), not a single scaler.
data, scaler = preprocessing(raw_data)

In [77]:
# Persist the normalised frame; the index is written as the first CSV column.
data.to_csv("../data/train_norm.csv")

In [22]:
# Rescale every column of `data` with a freshly fitted MinMaxScaler.
# NOTE(review): the rebuilt frame gets a default RangeIndex because no index=
# is passed, and a 'fraud_ind' column, if already attached, is scaled together
# with the features — confirm both are intended.
min_max_scaler = MinMaxScaler()
data = pd.DataFrame(min_max_scaler.fit_transform(data.values), columns=data.columns)

In [14]:
# Persist the min-max scaled frame; the index is written as the first CSV column.
data.to_csv("../data/preprocess_train.csv")

In [25]:
# Store the fitted MinMaxScaler next to the other fitted objects.
# NOTE(review): `preprocessor` is the dict returned by the first preprocessing()
# call, whereas the pickle cell dumps `scaler` — confirm which object is meant
# to be saved.
preprocessor['mms'] = min_max_scaler

In [78]:
import pickle

# Serialise the object bound to `scaler` (the dict returned by the most recent
# preprocessing() call) to disk.
with open("../preprocessor.pkt", "wb") as f:
    pickle.dump(scaler, f)