In [1]:
# 구글 드라이브 입력
from google.colab import drive
drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


설치 해야하는 것들 및 함수들

In [2]:
# pip install 요소들 정리
# annoy는 반드시 visual studio build tools 설치
! pip install -q annoy
! pip install -q FRUFS
! pip install -q pacmap

In [3]:
# import할 요소들 정리
import copy
import pandas as pd
import numpy as np
import warnings
import pacmap

from FRUFS import FRUFS
from lightgbm import LGBMRegressor

from sklearn.ensemble import IsolationForest

from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score

from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

import matplotlib.pyplot as plt

import os

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn import svm

from scipy.stats import ranksums

# 파일 위치 고정
os.chdir("/content/gdrive/MyDrive/creditcardfraud")

# Train dataset
train_df = pd.read_csv('train.csv')
train_df = train_df.iloc[:,1:]

# Validation dataset
val_df = pd.read_csv('val.csv')
ori_val_df = val_df.iloc[:,1:]
val_class = val_df.iloc[:,31]
val_df = val_df.iloc[:,1:31]

# Test dataset
test_df = pd.read_csv('test.csv')
test_df = test_df.iloc[:,1:]

warnings.filterwarnings(action='ignore')

In [4]:
# IsolationForest 모델 출력 (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
def get_pred_label(model_pred):    
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

# IsolationForest 예측만 하는 함수
def iso_for_model_prediction(the_contamination, trtrtr):
    # train dataset으로 isolationforest 모델 학습
    model_only_train = IsolationForest(n_estimators=1000, contamination=the_contamination, verbose=0)
    model_only_train.fit(trtrtr)
    
    # train dataset의 isolationforest 모델로 예측
    train_pred = model_only_train.predict(trtrtr)
    train_pred = get_pred_label(train_pred)
    
    return train_pred

# Pacmac + IsolationForest 예측과 비교할 수 있는 함수
def pac_iso_for_model_comparing(the_contamination, trtrtr, low_dim, compare_class):
    # train dataset으로 isolationforest 모델 학습
    dlatl_embedding = pacmap.PaCMAP(n_components=low_dim, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, num_iters = 1000, verbose = True)
    pac_mac_train = dlatl_embedding.fit_transform(np.array(trtrtr), init="pca")
    
    model_train_compare = IsolationForest(n_estimators=1000, contamination=the_contamination, verbose=0)
    model_train_compare.fit(pac_mac_train)
    
    # train dataset의 isolationforest 모델로 예측
    train_pred = model_train_compare.predict(pac_mac_train)
    train_pred = get_pred_label(train_pred)
    
    # train dataset의 예측치와 compare data의 수치 비교
    train_score = f1_score(compare_class, train_pred, average='macro')

    print(f'Compared Macro F1 Score : [{train_score}]')
    print(classification_report(compare_class, train_score))

# IQR Method에서 경계값을 나타낸 함수
def iqr_outlier(ddff):
    q1 = ddff.quantile(0.25)
    q3 = ddff.quantile(0.75)

    iqr = q3 - q1

    lower_bound = q1 - (1.5 * iqr) 
    upper_bound = q3 + (1.5 * iqr)

    return pd.concat([lower_bound, upper_bound], axis= 1).T

Validation dataset의 통계정보를 이용한 1차로 변수 선택합니다. 테스트 데이터 중 랜덤 샘플된 것이므로 Validation dataset의 통계정보로 충분히 정보를 얻을 수 있습니다.아래의 기준은 outlier의 중위값이 IQR method의 범위 내에 있는지 확인합니다. 저는 outlier의 중위값이 적어도 IQR method의 범위 밖에 있어야 outlier 판단하기 쉽다고 생각하였습니다.

In [5]:
## 기본적인 변수 선택(1)
def first_variation_selection(dfdfdf):
    # Validation dataset의 outlier들의 중앙값이 Validation dataset의 IQR Method에서 경계값 사이에 있으면 이상치를 판단하지 못하는 변수라고 정했다.
    # class 열도 있으므로 1개 제외
    how_many_var = (len(dfdfdf.columns) - 1)
    new_var = []

    for what_val in range(how_many_var):
        if (iqr_outlier(dfdfdf).iloc[0,what_val] < dfdfdf.iloc[np.where(dfdfdf['Class'] == 1)].quantile(0.5)[what_val] < iqr_outlier(dfdfdf).iloc[1,what_val]):
            continue
        else:
            new_var.append(what_val)

    return new_var

superior_var1 = first_variation_selection(ori_val_df)
print(superior_var1)

[1, 2, 3, 6, 8, 9, 10, 11, 13, 15, 16, 17]


이번엔 2차 변수 선택입니다. 두 가지 변수 방법을 사용하여 결과물을 합집합화 하였습니다. 첫번째는 Wilcoxon rank-sum test를 이용해 Validation dataset의 outlier들의 중앙값과 Validation dataset의 inlier들의 중앙값의 차이를 검정해보고 p-value가 가장 낮은 5개 변수를 뽑았습니다. 두번째는 FRUFS의 LGBMRegressor을 이용하여 변수 중요도를 판단하였고 중요도가 가장 높은 5개 변수를 뽑았습니다.

In [6]:
## 기본적인 변수 선택(2)
# 2차 변수 선택
def second_variation_selection(dfdfdf, trtrtr, nnn_var):
    ranksum_pval = []
    for what_val in nnn_var:
        ranksum_pval.append(ranksums(dfdfdf.iloc[np.where(dfdfdf['Class'] == 1)].iloc[:,what_val], dfdfdf.iloc[np.where(dfdfdf['Class'] == 0)].iloc[:,what_val]).pvalue)

    Wilcoxon_rank_sum_pval_var = list(pd.DataFrame({'pval':ranksum_pval, 'col':dfdfdf.columns[nnn_var]}).sort_values(by=['pval']).iloc[range(5),1])

    print(Wilcoxon_rank_sum_pval_var)

    # FRUFS를 이용하여 변수 선택
    # core가 많으면 n_jobs 조정을 하면 됨.
    model_frufs = FRUFS(model_r=LGBMRegressor(random_state=28), k=5, n_jobs=-1, verbose=1, random_state=28)
    df_train_pruned = model_frufs.fit_transform(trtrtr.iloc[:,nnn_var])
    FRUFS_LGBMRegressor_var = list(df_train_pruned.columns)

    #plt.figure(figsize=(5, 6), dpi=100)
    #model_frufs.feature_importance()
    print(FRUFS_LGBMRegressor_var)

    # 종합
    all_var_in_td = list(trtrtr.columns)
    new_var_set = list(set(Wilcoxon_rank_sum_pval_var + FRUFS_LGBMRegressor_var))
    new_var_list = []

    for nvs in new_var_set:
        new_var_list.append(all_var_in_td.index(nvs))

    new_var_list.sort()
    return new_var_list

superior_var2 = second_variation_selection(ori_val_df, train_df, superior_var1)
print(superior_var2)

['V10', 'V14', 'V11', 'V4', 'V12']


[Parallel(n_jobs=60)]: Using backend LokyBackend with 60 concurrent workers.
[Parallel(n_jobs=60)]: Done  10 out of  12 | elapsed:  1.0min remaining:   12.2s


['V10', 'V4', 'V14', 'V17', 'V16']
[3, 9, 10, 11, 13, 15, 16]


[Parallel(n_jobs=60)]: Done  12 out of  12 | elapsed:  1.0min finished


In [7]:
## 기본적인 변수 선택(3)
# 선택한 변수 제외 나머지 변수 모임
inferior_var2 = [x for x in range(30) if x not in superior_var2]
print(inferior_var2)

[0, 1, 2, 4, 5, 6, 7, 8, 12, 14, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


Baseline에 보시면 아시겠지만 validation set의 사기 거래 비율을 탐색하였는데 저는 이보다 조금 높은 수치를 사용하였습니다.

In [8]:
## validation set의 사기 거래 비율 탐색
ori_val_normal, ori_val_fraud = ori_val_df['Class'].value_counts()
ori_val_contamination = ori_val_fraud / ori_val_normal
print(f'Validation contamination:[{ori_val_contamination}]')
# 이대로 하지 않고 조정을 함

Validation contamination:[0.0010551491277433877]


본격적으로 pacmap과 isolation forest를 이용하여 예측하고자 합니다. pacmap은 차원축소를 하는 과정에 있어 랜덤하게 이동합니다. isolation forest도 랜덤하게 변수를 선택하기 때문에 결과가 불분명합니다. 따라서 저는 hhmm번 진행하고 voting을 통해 결과를 도출하였습니다.

In [10]:
## pacmap과 isolation forest 1차 이용(1)
# pacmap과 isolation forest를 이용한 1차 예측
hhmm = 3
what_val = superior_var2
for num in range(hhmm):
    embedding_1 = pacmap.PaCMAP(n_components=len(what_val), n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, num_iters = 1000, verbose = True)
    pacmac_train_1 = embedding_1.fit_transform(np.array(train_df.iloc[:,what_val]), init="pca")
    pacmac_val_1 = embedding_1.transform(np.array(val_df.iloc[:,what_val]), basis=np.array(train_df.iloc[:,what_val]))
    pacmac_test_1 = embedding_1.transform(np.array(test_df.iloc[:,what_val]), basis=np.array(train_df.iloc[:,what_val]))

    pac_model_1 = IsolationForest(n_estimators=1000, contamination=0.00121, verbose=0)
    pac_model_1.fit(pacmac_train_1[:,[1,2,3]]) 

    if num == 0:
        train_pred_set_1 = pac_model_1.predict(pacmac_train_1[:,[1,2,3]]) # model prediction
        train_pred_set_1 = get_pred_label(train_pred_set_1)

        val_pred_set_1 = pac_model_1.predict(pacmac_val_1[:,[1,2,3]]) # model prediction
        val_pred_set_1 = get_pred_label(val_pred_set_1)

        test_pred_set_1 = pac_model_1.predict(pacmac_test_1[:,[1,2,3]]) # model prediction
        test_pred_set_1 = get_pred_label(test_pred_set_1)
    else:
        train_pred_1 = pac_model_1.predict(pacmac_train_1[:,[1,2,3]]) # model prediction
        train_pred_1 = get_pred_label(train_pred_1)
        train_pred_set_1 = train_pred_set_1 + train_pred_1

        val_pred_1 = pac_model_1.predict(pacmac_val_1[:,[1,2,3]]) # model prediction
        val_pred_1 = get_pred_label(val_pred_1)
        val_pred_set_1 = val_pred_set_1 + val_pred_1

        test_pred_1 = pac_model_1.predict(pacmac_test_1[:,[1,2,3]]) # model prediction
        test_pred_1 = get_pred_label(test_pred_1)
        test_pred_set_1 = test_pred_set_1 + test_pred_1

train_pred_set_1 = train_pred_set_1/hhmm
val_pred_set_1 = val_pred_set_1/hhmm
test_pred_set_1 = test_pred_set_1/hhmm

X is normalized
PaCMAP(n_neighbors=26, n_MN=13, n_FP=52, distance=euclidean, lr=1.0, n_iters=1000, apply_pca=True, opt_method='adam', verbose=True, intermediate=False, seed=None)
Finding pairs
Found nearest neighbor
Calculated sigma
Found scaled dist
Pairs sampled successfully.
((2959892, 2), (1479946, 2), (5919784, 2))
Initial Loss: 3658542.0
Iteration:   10, Loss: 2403497.750000
Iteration:   20, Loss: 2149115.500000
Iteration:   30, Loss: 2015282.625000
Iteration:   40, Loss: 1911173.750000
Iteration:   50, Loss: 1811854.250000
Iteration:   60, Loss: 1706509.750000
Iteration:   70, Loss: 1588218.500000
Iteration:   80, Loss: 1448030.250000
Iteration:   90, Loss: 1271641.875000
Iteration:  100, Loss: 1005694.500000
Iteration:  110, Loss: 1315544.375000
Iteration:  120, Loss: 1293438.500000
Iteration:  130, Loss: 1283935.000000
Iteration:  140, Loss: 1280798.875000
Iteration:  150, Loss: 1280236.500000
Iteration:  160, Loss: 1280233.000000
Iteration:  170, Loss: 1280348.375000
Iteratio

In [12]:
## pacmap과 isolation forest 1차 이용(2)
val_score_1 = f1_score(ori_val_df['Class'], np.round(val_pred_set_1), average='macro')

print(f'Validation F1 Score : [{val_score_1}]')
print(classification_report(ori_val_df['Class'], np.round(val_pred_set_1)))
print(confusion_matrix(ori_val_df['Class'], np.round(val_pred_set_1)))

Validation F1 Score : [0.9209734995691702]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.89      0.80      0.84        30

    accuracy                           1.00     28462
   macro avg       0.94      0.90      0.92     28462
weighted avg       1.00      1.00      1.00     28462

[[28429     3]
 [    6    24]]


In [13]:
## pacmap과 isolation forest 1차 이용(3)
# 1차 저장
chujung_train_1 = pd.DataFrame({'Class':np.round(train_pred_set_1)})
chujung_val_1 = pd.DataFrame({'Class':np.round(val_pred_set_1)})
chujung_test_1 = pd.DataFrame({'Class':np.round(test_pred_set_1)})

result_train_1 = pd.concat([train_df,chujung_train_1], axis=1)
result_val_1 = pd.concat([val_df,chujung_val_1], axis=1)
result_test_1 = pd.concat([test_df,chujung_test_1], axis=1)

result_train_1.to_csv('result_train_1.csv', index=False)
result_val_1.to_csv('result_val_1.csv', index=False)
result_test_1.to_csv('result_test_1.csv', index=False)

In [9]:
## pacmap과 isolation forest 1차 이용(4)
# 1차 불러오기
train_pred_set_1 = np.array(pd.read_csv('result_train_1.csv')['Class'])
val_pred_set_1 = np.array(pd.read_csv('result_val_1.csv')['Class'])
test_pred_set_1 = np.array(pd.read_csv('result_test_1.csv')['Class'])

저는 이에 더해 변수를 좀 더 추가하면서 val score를 높일 수 있을까 생각하였습니다. 그래서 저는 설령 outlier들이 inlier안에 숨어있을지라도 inlier들이 모여있으면 isolation forest가 판단하는데 있어서 도움이 된다고 들었습니다. 이에 따라 저는 outlier들이 inlier안에 들어있지만 최대한 outlier들의 분산이 적으면서 최대한 inlier들의 분산을 넓은 변수를 찾고자 하였습니다. (i.e. (inlier들의 분산/outlier들의 분산)이 높은 것들). 역시 validation set의 통계정보를 이용하였습니다. 그 다음, outlier의 중앙값을 기준으로 inlier들을 나누어서, 각 inlier들의 중앙값이 outlier의 중앙값과 차이가 큰지(wilcoxon rank sum test), 각 inlier들의 분산이 작은지(해당 분산이 작으면 inlier와 구별할 수 있다고 생각하였습니다.)를 보았습니다. 변수를 최대한 적게 선택하고자 이번엔 해당 방법들에서 나온 변수들의 교집합을 선택하였습니다.

In [10]:
## pacmap과 isolation forest 2차 이용
inside_in_inlier= []

for what_val in range(30):
    if ori_val_df.iloc[np.where(ori_val_df['Class'] == 1)].max()[what_val] <  ori_val_df.iloc[np.where(ori_val_df['Class'] == 0)].max()[what_val]:
        if ori_val_df.iloc[np.where(ori_val_df['Class'] == 1)].min()[what_val] >  ori_val_df.iloc[np.where(ori_val_df['Class'] == 0)].min()[what_val]:
            inside_in_inlier.append(what_val)

print(inside_in_inlier)

var_chai = []
for what_val in inside_in_inlier:
    print(what_val , ori_val_df.iloc[np.where(ori_val_df['Class'] == 0)[0],what_val].var()/ori_val_df.iloc[np.where(ori_val_df['Class'] == 1)[0],what_val].var())
    var_chai.append(ori_val_df.iloc[np.where(ori_val_df['Class'] == 0)[0],what_val].var()/ori_val_df.iloc[np.where(ori_val_df['Class'] == 1)[0],what_val].var())

old_born_idx = np.argsort((-1)*np.array(var_chai))[:5]
old_born_idx = np.array(inside_in_inlier)[list(old_born_idx)]
old_born_idx = list(old_born_idx)
print(old_born_idx)

left_side_l = []
right_side_l = []
for jjkk in inside_in_inlier:
    the_one_the_one = ori_val_df.iloc[np.where(ori_val_df['Class'] == 1)[0],jjkk]
    median_the_one = the_one_the_one.median()
    the_zero_the_zero = ori_val_df.iloc[np.where(ori_val_df['Class'] == 0)[0],jjkk]
    left_zero_val_df = the_zero_the_zero.iloc[np.where(the_zero_the_zero < median_the_one)[0]]
    right_zero_val_df = the_zero_the_zero.iloc[np.where(the_zero_the_zero > median_the_one)[0]]
    left_ppp = (ranksums(left_zero_val_df, the_one_the_one).pvalue)
    right_ppp = (ranksums(right_zero_val_df, the_one_the_one).pvalue)
    
    left_side_l.append((left_ppp*1000)*((left_zero_val_df.var())))
    right_side_l.append((right_ppp*1000)*((right_zero_val_df.var())))

left_born_idx = np.argsort(np.array(left_side_l))[:5]
left_born_idx = np.array(inside_in_inlier)[list(left_born_idx)]
left_born_idx = list(left_born_idx)

right_born_idx = np.argsort(np.array(right_side_l))[:5]
right_born_idx = np.array(inside_in_inlier)[list(right_born_idx)]
right_born_idx = list(right_born_idx)

print(left_born_idx)
print(right_born_idx)

dhk_add_list = list(set(old_born_idx) & set(left_born_idx) & set(right_born_idx))
print(dhk_add_list)

[0, 4, 5, 12, 14, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29]
0 0.05661145916743179
4 0.04223885928210881
5 0.7704461524323427
12 0.7649192537628348
14 0.592450840972954
18 0.20398714518238045
19 0.3715199816175286
20 0.0668143847139044
21 0.47076015282831585
22 0.516526376951607
23 1.3711184741040796
24 0.4679685966190287
25 0.7132699704559123
26 0.037185842989192905
27 0.2518467335441023
29 0.782343614593578
[23, 29, 5, 12, 25]
[25, 21, 23, 24, 20]
[5, 0, 24, 23, 29]
[23]


지금까지 선택한 변수들을 바탕으로 pacmap과 isolation forest를 돌려 예측합니다.

In [11]:
## pacmap과 isolation forest 3차 이용(1)
# pacmap과 isolation forest를 이용한 2차 예측
hhmm = 7
vlskffo_var = superior_var2 + dhk_add_list
what_val = vlskffo_var

for num in range(hhmm):
    embedding_3 = pacmap.PaCMAP(n_components=len(what_val), n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, num_iters = 600, verbose = True)
    pacmac_train_3 = embedding_3.fit_transform(np.array(train_df.iloc[:,what_val]), init="pca")
    pacmac_val_3 = embedding_3.transform(np.array(val_df.iloc[:,what_val]), basis=np.array(train_df.iloc[:,what_val]))
    pacmac_test_3 = embedding_3.transform(np.array(test_df.iloc[:,what_val]), basis=np.array(train_df.iloc[:,what_val]))

    pac_model_3 = IsolationForest(n_estimators=1000, contamination=0.00121, verbose=0)
    pac_model_3.fit(pacmac_train_3[:,[1,2,3]]) 

    if num == 0:
        train_pred_set_3 = pac_model_3.predict(pacmac_train_3[:,[1,2,3]]) # model prediction
        train_pred_set_3 = get_pred_label(train_pred_set_3)

        val_pred_set_3 = pac_model_3.predict(pacmac_val_3[:,[1,2,3]]) # model prediction
        val_pred_set_3 = get_pred_label(val_pred_set_3)

        test_pred_set_3 = pac_model_3.predict(pacmac_test_3[:,[1,2,3]]) # model prediction
        test_pred_set_3 = get_pred_label(test_pred_set_3)
    else:
        train_pred_3 = pac_model_3.predict(pacmac_train_3[:,[1,2,3]]) # model prediction
        train_pred_3 = get_pred_label(train_pred_3)
        train_pred_set_3 = train_pred_set_3 + train_pred_3

        val_pred_3 = pac_model_3.predict(pacmac_val_3[:,[1,2,3]]) # model prediction
        val_pred_3 = get_pred_label(val_pred_3)
        val_pred_set_3 = val_pred_set_3 + val_pred_3

        test_pred_3 = pac_model_3.predict(pacmac_test_3[:,[1,2,3]]) # model prediction
        test_pred_3 = get_pred_label(test_pred_3)
        test_pred_set_3 = test_pred_set_3 + test_pred_3

train_pred_set_3 = train_pred_set_3/hhmm
val_pred_set_3 = val_pred_set_3/hhmm
test_pred_set_3 = test_pred_set_3/hhmm

X is normalized
PaCMAP(n_neighbors=26, n_MN=13, n_FP=52, distance=euclidean, lr=1.0, n_iters=600, apply_pca=True, opt_method='adam', verbose=True, intermediate=False, seed=None)
Finding pairs
Found nearest neighbor
Calculated sigma
Found scaled dist
Pairs sampled successfully.
((2959892, 2), (1479946, 2), (5919784, 2))
Initial Loss: 3658542.0
Iteration:   10, Loss: 2461965.750000
Iteration:   20, Loss: 2168983.250000
Iteration:   30, Loss: 2027381.000000
Iteration:   40, Loss: 1920832.500000
Iteration:   50, Loss: 1820802.500000
Iteration:   60, Loss: 1715505.375000
Iteration:   70, Loss: 1597789.750000
Iteration:   80, Loss: 1458907.625000
Iteration:   90, Loss: 1283804.875000
Iteration:  100, Loss: 1020497.750000
Iteration:  110, Loss: 1339413.250000
Iteration:  120, Loss: 1314553.250000
Iteration:  130, Loss: 1303085.125000
Iteration:  140, Loss: 1299084.250000
Iteration:  150, Loss: 1298269.625000
Iteration:  160, Loss: 1298299.000000
Iteration:  170, Loss: 1298458.125000
Iteration

In [12]:
## pacmap과 isolation forest 3차 이용(2)
val_score_3 = f1_score(ori_val_df['Class'], np.round(val_pred_set_3), average='macro')

print(f'Validation F1 Score : [{val_score_3}]')
print(classification_report(ori_val_df['Class'], np.round(val_pred_set_3)))
print(confusion_matrix(ori_val_df['Class'], np.round(val_pred_set_3)))

Validation F1 Score : [0.9309641419574388]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.89      0.83      0.86        30

    accuracy                           1.00     28462
   macro avg       0.95      0.92      0.93     28462
weighted avg       1.00      1.00      1.00     28462

[[28429     3]
 [    5    25]]


In [26]:
set(test_pred_set_3)

{0.0, 1.0}

In [14]:
## pacmap과 isolation forest 3차 이용(3)
# 2차 저장
chujung_train_2 = pd.DataFrame({'Class':np.round(train_pred_set_3)})
chujung_val_2 = pd.DataFrame({'Class':np.round(val_pred_set_3)})
chujung_test_2 = pd.DataFrame({'Class':np.round(test_pred_set_3)})

result_train_2 = pd.concat([train_df,chujung_train_2], axis=1)
result_val_2 = pd.concat([val_df,chujung_val_2], axis=1)
result_test_2 = pd.concat([test_df,chujung_test_2], axis=1)

result_train_2.to_csv('result_train_2.csv', index=False)
result_val_2.to_csv('result_val_2.csv', index=False)
result_test_2.to_csv('result_test_2.csv', index=False)

In [28]:
## pacmap과 isolation forest 3차 이용(4)
# 2차 불러오기 및 outlier된 dataset 모음
train_pred_set_3 = np.array(pd.read_csv('first_result_train.csv')['Class'])
val_pred_set_3 = np.array(pd.read_csv('first_result_val.csv')['Class'])
test_pred_set_3 = np.array(pd.read_csv('first_result_test.csv')['Class'])

one_train_df = train_df.iloc[np.where(np.round(train_pred_set_3) == 1)[0]]
one_val_df = ori_val_df.iloc[np.where(np.round(val_pred_set_3) == 1)[0]]
one_test_df = test_df.iloc[np.where(np.round(test_pred_set_3) == 1)[0]]

저는 이에 더해 판별된 outlier들 중에 가짜를 도출하기로 생각했습니다.
역시 저는 변수를 선택하였고 validation dataset의 통계정보를 이용하였습니다. 이번엔 다른 방식으로 판별하고자 합니다. 개인적으로 저는 isolation forest를 사용함에도 표출이 안 된것은 특정 변수하에서는 가짜 outlier들이 진짜 outlier안에 숨어있다고 생각하였습니다. 그에 따라 KernelPCA를 이용해서 안쪽에 있는 가짜 outlier을 빼내고, pacmap을 사용한 다음, pca를 통해 좌표를 바꾸고, 극단값 중 몇몇개를 뽑아서 가짜 outlier라고 생각하였습니다. 우선, Validation dataset 내 ouliter로 판별된 것 중에서 실제 outlier들의 중위값과 가짜 outlier들의 중위값 검정 비교하였고, Validation dataset 내 ouliter로 판별된 것 중에서 실제 outlier들의 분산와 가짜 outlier 비율 체크해서 변수들을 확인하였습니다.

In [29]:
## outlier 중 가짜 판별(1)
# 변수 선택

rrr_df = []
for what_val in range(30):
    rrr_df.append(ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 1)[0],what_val],one_val_df.iloc[np.where(one_val_df['Class'] == 0)[0],what_val]).pvalue)

rrr_idx = np.where(np.array(rrr_df) > 0.05)[0]

qqq_df = []
for what_val in rrr_idx:
    qqq_df.append(one_val_df.iloc[np.where(one_val_df['Class'] == 0)[0],what_val].var()/one_val_df.iloc[np.where(one_val_df['Class'] == 1)[0],what_val].var())

new_born_idx = np.argsort(qqq_df)[:5]

In [34]:
print(new_born_idx)

array([20, 23, 14,  3, 18])

다음으로 가짜 판별하는데 있어서 위에서 이야기한 그대로 바탕으로 진행하였습니다. 가짜 outlier인지 아닌지 판별하는데 있어서 Validation set의 통계정보와 wilcoxon rank sum test를 이용하였습니다. 

In [35]:
## outlier 중 가짜 판별(2)
# 판별
transformer = KernelPCA(n_components=5, kernel='rbf')
hhhhh = new_born_idx
can_sepearate = np.argsort(rrr_df)[0]

X_transformed = transformer.fit_transform(one_train_df.iloc[:,hhhhh])
X1_transformed = transformer.transform(one_val_df.iloc[:,hhhhh])
X2_transformed = transformer.transform(one_test_df.iloc[:,hhhhh])

# kernel pca 후 1에서 0 찾기 by pacmap 그후 판별
hhmm = 201
skip_count = 0

contamin = 0.0725
making_set = 0

for num in range(hhmm):
    embedding1010 = pacmap.PaCMAP(n_components=3, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0, num_iters = 1000, verbose = False)
    pacmac_train1010 = embedding1010.fit_transform(X_transformed, init="pca")
    pacmac_val1010 = embedding1010.transform(X1_transformed, basis=X_transformed)
    pacmac_test1010 = embedding1010.transform(X2_transformed, basis=X_transformed)

    # 그냥 pca
    pcawow = PCA(n_components = 3) # 주성분을 몇개로 할지 결정
    pca_train_wow = pcawow.fit_transform(pacmac_train1010)
    pca_train_wow = pd.DataFrame(data=pca_train_wow)

    pca_val_wow = pcawow.transform(pacmac_val1010)
    pca_val_wow = pd.DataFrame(data=pca_val_wow)

    pca_test_wow = pcawow.transform(pacmac_test1010)
    pca_test_wow = pd.DataFrame(data=pca_test_wow)

    rlwns = int(contamin*len(pca_train_wow))
    thtn = contamin*len(pca_train_wow) - rlwns
    
    if making_set == 0:
        max_part_zero = 0
        max_part_one = 0
        max_what = 100
        
        # PC 축을 설정하고 좌 또는 우 극단값을 찾고 해당하는 중위값이 실제 outlier들의 중위값인지 체크
        for jrj in range(6):
            val_pred_set_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_val_wow))})       

            if jrj % 2 == 0:                
                rlwns_val = pca_train_wow.iloc[:,(jrj//2)].nlargest(rlwns).iloc[(rlwns-1)]
                rlwns_val_Qkd = pca_train_wow.iloc[:,(jrj//2)].nlargest((rlwns+1)).iloc[(rlwns)]

                val_pred_set_1010.iloc[np.where(pca_val_wow.iloc[:,(jrj//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn) ))] = 0
            else:
                rlwns_val = pca_train_wow.iloc[:,(jrj//2)].nsmallest(rlwns).iloc[(rlwns-1)]
                rlwns_val_Qkd = pca_train_wow.iloc[:,(jrj//2)].nsmallest((rlwns+1)).iloc[(rlwns)]

                val_pred_set_1010.iloc[np.where(pca_val_wow.iloc[:,(jrj//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0

            ranksum_pval_ed_one = ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 1)[0],can_sepearate],one_val_df.iloc[np.where(val_pred_set_1010['Class'] == 1)[0],can_sepearate]).pvalue
            ranksum_pval_ed_zero = ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 0)[0],can_sepearate],one_val_df.iloc[np.where(val_pred_set_1010['Class'] == 0)[0],can_sepearate]).pvalue

            if np.isnan(ranksum_pval_ed_zero):
                continue
            
            if max_part_zero <= ranksum_pval_ed_zero:
                max_part_zero = ranksum_pval_ed_zero
            if max_part_one <= ranksum_pval_ed_one:
                max_part_one = ranksum_pval_ed_one

            if max_part_zero >= 0.5:
                if max_part_one >= 0.5:
                    max_what = jrj
                    break

        if max_what == 100:
            skip_count += 1
            continue
        making_set += 10
        
        # 체크 후 해당하는 축으로 가짜 outlier 판별
        val_pred_set_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_val_wow))})

        if max_what % 2 == 0:
            rlwns_val = pca_train_wow.iloc[:,(max_what//2)].nlargest(rlwns).iloc[(rlwns-1)]
            rlwns_val_Qkd = pca_train_wow.iloc[:,(max_what//2)].nlargest((rlwns+1)).iloc[(rlwns)]

            val_pred_set_1010.iloc[np.where(pca_val_wow.iloc[:,(max_what//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0

        else:
            rlwns_val = pca_train_wow.iloc[:,(max_what//2)].nsmallest(rlwns).iloc[(rlwns-1)]
            rlwns_val_Qkd = pca_train_wow.iloc[:,(max_what//2)].nsmallest((rlwns+1)).iloc[(rlwns)]

            val_pred_set_1010.iloc[np.where(pca_val_wow.iloc[:,(max_what//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0
        
        train_pred_set_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_train_wow))})

        if max_what % 2 == 0:
            train_pred_set_1010.iloc[np.where(pca_train_wow.iloc[:,(max_what//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0
        else:
            train_pred_set_1010.iloc[np.where(pca_train_wow.iloc[:,(max_what//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0   

        test_pred_set_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_test_wow))})

        if max_what % 2 == 0:
            test_pred_set_1010.iloc[np.where(pca_test_wow.iloc[:,(max_what//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0
        else:
            test_pred_set_1010.iloc[np.where(pca_test_wow.iloc[:,(max_what//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0 

    else:
        max_part_zero = 0
        max_part_one = 0
        max_what = 100
        
        # PC 축을 설정하고 좌 또는 우 극단값을 찾고 해당하는 중위값이 실제 outlier들의 중위값인지 체크
        for jrj in range(6):
            val_pred_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_val_wow))})

            if jrj % 2 == 0:
                rlwns_val = pca_train_wow.iloc[:,(jrj//2)].nlargest(rlwns).iloc[(rlwns-1)]
                rlwns_val_Qkd = pca_train_wow.iloc[:,(jrj//2)].nlargest((rlwns+1)).iloc[(rlwns)]

                val_pred_1010.iloc[np.where(pca_val_wow.iloc[:,(jrj//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn) ))] = 0
            else:
                rlwns_val = pca_train_wow.iloc[:,(jrj//2)].nsmallest(rlwns).iloc[(rlwns-1)]
                rlwns_val_Qkd = pca_train_wow.iloc[:,(jrj//2)].nsmallest((rlwns+1)).iloc[(rlwns)]

                val_pred_1010.iloc[np.where(pca_val_wow.iloc[:,(jrj//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0

            ranksum_pval_ed_one = ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 1)[0],can_sepearate],one_val_df.iloc[np.where(val_pred_1010['Class'] == 1)[0],can_sepearate]).pvalue
            ranksum_pval_ed_zero = ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 0)[0],can_sepearate],one_val_df.iloc[np.where(val_pred_1010['Class'] == 0)[0],can_sepearate]).pvalue

            if np.isnan(ranksum_pval_ed_zero):
                continue
            
            if max_part_zero <= ranksum_pval_ed_zero:
                max_part_zero = ranksum_pval_ed_zero
            if max_part_one <= ranksum_pval_ed_one:
                max_part_one = ranksum_pval_ed_one

            if max_part_zero >= 0.5:
                if max_part_one >= 0.5:
                    max_what = jrj
                    break
        
        if max_what == 100:
            skip_count += 1
            continue
        
        # 체크 후 해당하는 축으로 가짜 outlier 판별
        val_pred_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_val_wow))})

        if max_what % 2 == 0:
            rlwns_val = pca_train_wow.iloc[:,(max_what//2)].nlargest(rlwns).iloc[(rlwns-1)]
            rlwns_val_Qkd = pca_train_wow.iloc[:,(max_what//2)].nlargest((rlwns+1)).iloc[(rlwns)]

            val_pred_1010.iloc[np.where(pca_val_wow.iloc[:,(max_what//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0
        else:
            rlwns_val = pca_train_wow.iloc[:,(max_what//2)].nsmallest(rlwns).iloc[(rlwns-1)]
            rlwns_val_Qkd = pca_train_wow.iloc[:,(max_what//2)].nsmallest((rlwns+1)).iloc[(rlwns)]

            val_pred_1010.iloc[np.where(pca_val_wow.iloc[:,(max_what//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0

        ranksum_pval_ed_one = ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 1)[0],can_sepearate],one_val_df.iloc[np.where(val_pred_1010['Class'] == 1)[0],can_sepearate]).pvalue
        ranksum_pval_ed_zero = ranksums(one_val_df.iloc[np.where(one_val_df['Class'] == 0)[0],can_sepearate],one_val_df.iloc[np.where(val_pred_1010['Class'] == 0)[0],can_sepearate]).pvalue

        val_pred_set_1010 = val_pred_set_1010 + val_pred_1010

        train_pred_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_train_wow))})

        if max_what % 2 == 0:
            train_pred_1010.iloc[np.where(pca_train_wow.iloc[:,(max_what//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0
        else:
            train_pred_1010.iloc[np.where(pca_train_wow.iloc[:,(max_what//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0   

        train_pred_set_1010 = train_pred_set_1010 + train_pred_1010

        test_pred_1010 = pd.DataFrame({'Class':np.repeat(1,len(pca_test_wow))})

        if max_what % 2 == 0:
            test_pred_1010.iloc[np.where(pca_test_wow.iloc[:,(max_what//2)] >= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0
        else:
            test_pred_1010.iloc[np.where(pca_test_wow.iloc[:,(max_what//2)] <= (rlwns_val_Qkd*thtn  + rlwns_val*(1-thtn)))] = 0   

        test_pred_set_1010 = test_pred_set_1010 + test_pred_1010

train_pred_set_1010 = train_pred_set_1010/(hhmm- skip_count)
val_pred_set_1010 = val_pred_set_1010/(hhmm-skip_count)
test_pred_set_1010 = test_pred_set_1010/(hhmm-skip_count)

In [36]:
## 최종 확인
# voting해서 50% 이상 1이면 1로, 반대면 0으로
final_one_train_df = one_train_df.iloc[np.where(np.round(train_pred_set_1010['Class']) == 1)[0]]
final_one_val_df = one_val_df.iloc[np.where(np.round(val_pred_set_1010['Class']) == 1)[0]]
final_one_test_df = one_test_df.iloc[np.where(np.round(test_pred_set_1010['Class']) == 1)[0]]

# validation score 확인
chujung_train2 = pd.DataFrame({'Class':np.repeat(0,len(train_df))})
chujung_val2 = pd.DataFrame({'pre_Class':np.repeat(0,len(val_df))})
chujung_test2 = pd.DataFrame({'Class':np.repeat(0,len(test_df))})

chujung_train2.iloc[final_one_train_df.index,0] = 1
chujung_val2.iloc[final_one_val_df.index,0] = 1
chujung_test2.iloc[final_one_test_df.index,0] = 1

sec_result_train = pd.concat([train_df,chujung_train2], axis=1)
sec_result_val = pd.concat([val_df,chujung_val2], axis=1)
sec_result_test = pd.concat([test_df,chujung_test2], axis=1)

print('F1-score',f1_score(sec_result_val['pre_Class'], val_class, average='macro'))
print(confusion_matrix(sec_result_val['pre_Class'], val_class))
print(classification_report(sec_result_val['pre_Class'], val_class))

# 저장
submit = pd.read_csv('sample_submission.csv')
submit['Class'] = 0

submit.iloc[final_one_test_df.index,1] = 1
submit.iloc[final_one_test_df.index,1]

submit.to_csv('result_submit.csv', index=False)

F1-score 0.954501493863888
[[28432     5]
 [    0    25]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28437
           1       0.83      1.00      0.91        25

    accuracy                           1.00     28462
   macro avg       0.92      1.00      0.95     28462
weighted avg       1.00      1.00      1.00     28462

