In [1]:
from google.colab import drive
# Mount Google Drive into the Colab runtime; later cells read and write CSV files under ./drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


## 시드고정

In [2]:
import random
import os
import numpy as np
import torch
def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic execution.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Exported for any interpreter started after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms and may select different
    # kernels from run to run, so it has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(41) # Fix the random seed

## Import

In [3]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [4]:
# Read the train / validation / test CSVs from the "open" folder on Drive,
# then print the three shapes on one line.
train_df, val_df, test = (
    pd.read_csv(f'./drive/MyDrive/신용카드 사기 데이콘/open/{split}.csv')
    for split in ('train', 'val', 'test')
)

print(*(frame.shape for frame in (train_df, val_df, test)))

(113842, 31) (28462, 32) (142503, 31)


In [5]:
# Drop every column after the first one, leaving only the leading (ID) column.
train_df = train_df.drop(columns=train_df.columns[1:])
train_df

Unnamed: 0,ID
0,3
1,4
2,6
3,8
4,9
...,...
113837,284796
113838,284797
113839,284798
113840,284802


In [6]:
# Drop every column after the first one, leaving only the leading (ID) column.
test = test.drop(columns=test.columns[1:])
test

Unnamed: 0,ID
0,AAAA0x1
1,AAAA0x2
2,AAAA0x5
3,AAAA0x7
4,AAAA0xc
...,...
142498,0x4587f
142499,0x45880
142500,0x45884
142501,0x45885


In [7]:
# Drop everything between the first and the last column, keeping ID and Class.
val_df = val_df.drop(columns=val_df.columns[1:-1])
val_df

Unnamed: 0,ID,Class
0,10,0
1,22,0
2,63,0
3,69,0
4,83,0
...,...,...
28457,284769,0
28458,284779,0
28459,284790,0
28460,284801,0


In [8]:
# Read the precomputed t-SNE CSVs for the train / validation / test splits,
# then print the three shapes on one line.
train_tsne, val_tsne, test_tsne = (
    pd.read_csv(
        './drive/MyDrive/신용카드 사기 데이콘/sohyeon/t-sne files for train,test,valid/'
        f'tsne_{split}_standardscaled.csv'
    )
    for split in ('train', 'valid', 'test')
)

print(*(frame.shape for frame in (train_tsne, val_tsne, test_tsne)))

(113842, 3) (28462, 3) (142503, 3)


In [9]:
# Remove the first column of each t-SNE frame (presumably a saved index
# column; confirm against the CSV files).
train_tsne = train_tsne.drop(columns=train_tsne.columns[0])
val_tsne = val_tsne.drop(columns=val_tsne.columns[0])
test_tsne = test_tsne.drop(columns=test_tsne.columns[0])

In [10]:
# Display the train t-SNE frame; the rendered output shows columns new_V1 and new_V2.
train_tsne

Unnamed: 0,new_V1,new_V2
0,7.746193,13.506231
1,-13.503382,18.752907
2,-44.579536,-11.460706
3,1.150573,1.061861
4,-3.577145,23.782250
...,...,...
113837,-6.343715,7.567572
113838,-23.198074,-37.964420
113839,1.654565,8.885488
113840,8.184764,-23.187050


In [11]:
# Place the train IDs and the t-SNE features side by side (rows are aligned on the index).
merged_train = pd.concat((train_df, train_tsne), axis='columns')
merged_train

Unnamed: 0,ID,new_V1,new_V2
0,3,7.746193,13.506231
1,4,-13.503382,18.752907
2,6,-44.579536,-11.460706
3,8,1.150573,1.061861
4,9,-3.577145,23.782250
...,...,...,...
113837,284796,-6.343715,7.567572
113838,284797,-23.198074,-37.964420
113839,284798,1.654565,8.885488
113840,284802,8.184764,-23.187050


In [12]:
# Place the validation ID/Class columns and the t-SNE features side by side (rows are aligned on the index).
merged_val = pd.concat((val_df, val_tsne), axis='columns')
merged_val

Unnamed: 0,ID,Class,new_V1,new_V2
0,10,0,74.721570,0.658578
1,22,0,27.785915,4.205872
2,63,0,-2.855847,10.725785
3,69,0,27.560820,5.214900
4,83,0,52.191250,36.171192
...,...,...,...,...
28457,284769,0,-38.770730,-42.368740
28458,284779,0,-23.907887,27.588938
28459,284790,0,1.223432,-47.611904
28460,284801,0,-6.396180,2.821916


In [13]:
# Reorder the columns so that the label (Class) comes last.
val_column_order = ['ID', 'new_V1', 'new_V2', 'Class']
merged_val = merged_val[val_column_order]
merged_val


Unnamed: 0,ID,new_V1,new_V2,Class
0,10,74.721570,0.658578,0
1,22,27.785915,4.205872,0
2,63,-2.855847,10.725785,0
3,69,27.560820,5.214900,0
4,83,52.191250,36.171192,0
...,...,...,...,...
28457,284769,-38.770730,-42.368740,0
28458,284779,-23.907887,27.588938,0
28459,284790,1.223432,-47.611904,0
28460,284801,-6.396180,2.821916,0


In [14]:
# Place the test IDs and the t-SNE features side by side (rows are aligned on the index).
merged_test = pd.concat((test, test_tsne), axis='columns')
merged_test

Unnamed: 0,ID,new_V1,new_V2
0,AAAA0x1,1.246201,19.423101
1,AAAA0x2,23.918526,-12.792903
2,AAAA0x5,10.191815,-15.282252
3,AAAA0x7,15.172346,-24.813463
4,AAAA0xc,11.220246,32.764710
...,...,...,...
142498,0x4587f,-19.024464,-2.527078
142499,0x45880,13.558858,17.063470
142500,0x45884,16.588413,4.569261
142501,0x45885,27.502903,18.096945


In [15]:
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline


In [16]:
# The train dataset has no labels, so every column except ID is model input.
train_x = merged_train.drop('ID', axis=1)  # input data

In [17]:
def get_pred_label(model_pred):
    """Convert outlier-detector predictions to the competition's label scheme.

    The detector output uses 1 for normal and -1 for abnormal (fraud); the
    returned labels use 0 for normal and 1 for abnormal (fraud). Any other
    value is passed through unchanged.

    Args:
        model_pred: Array of raw model predictions.

    Returns:
        Array of the same shape with 1 mapped to 0 and -1 mapped to 1.
    """
    # First pass: normal (1) -> 0. Second pass: abnormal (-1) -> 1.
    normal_as_zero = np.where(model_pred == 1, 0, model_pred)
    return np.where(normal_as_zero == -1, 1, normal_as_zero)

## Model definition and fitting

In [18]:
# Validation inputs (the t-SNE features) and ground-truth labels.
val_x = merged_val.drop(columns=['ID', 'Class'])  # input data
val_y = merged_val['Class']  # label

# Fit the EllipticEnvelope detector on the unlabeled train features.
model = EllipticEnvelope(
    support_fraction=0.994,
    contamination=0.00112,
    random_state=42,
).fit(train_x)

# Predict on validation, convert to 0/1 labels, and report macro F1 plus the per-class report.
val_pred = get_pred_label(model.predict(val_x))
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

Validation F1 Score : [0.4316131365747393]
              precision    recall  f1-score   support

           0       1.00      0.76      0.86     28432
           1       0.00      0.27      0.00        30

    accuracy                           0.76     28462
   macro avg       0.50      0.51      0.43     28462
weighted avg       1.00      0.76      0.86     28462



## test

In [19]:
# Test inputs: every column except ID.
test_x = merged_test.drop('ID', axis=1)

In [20]:
# Best model: the fitted EllipticEnvelope held in `model`.
# Predict on the test inputs and convert the raw output to 0/1 labels.
test_pred = get_pred_label(model.predict(test_x))

## submission

In [21]:
# Load the sample submission file as the template for the final predictions.
sample_submission_path = './drive/MyDrive/신용카드 사기 데이콘/open/sample_submission.csv'
submit = pd.read_csv(sample_submission_path)
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,1
1,AAAA0x2,1
2,AAAA0x5,1
3,AAAA0x7,1
4,AAAA0xc,1


In [22]:
# Replace the Class column with the test predictions and write the submission CSV without the index.
submission_path = './drive/MyDrive/신용카드 사기 데이콘/open/elliptic_tsne_standardscaled_submit.csv'
submit['Class'] = test_pred
submit.to_csv(submission_path, index=False)