In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 시드 고정

In [9]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to every generator.

    Side effects:
        - Seeds ``random``, ``numpy.random`` and ``torch`` (CPU and all CUDA
          devices).
        - Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)``.
        - Puts cuDNN into deterministic mode and turns its autotuner off.
    """
    # Imported locally because this cell is executed before the notebook's
    # import cell, and `random`, `os` and `torch` are not imported in any of
    # the visible cells; without these the call raises NameError on a fresh
    # kernel.
    import os
    import random

    import numpy as np
    import torch

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one. The call is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may choose different kernels from run to run, so it
    # must be disabled for results to be reproducible.
    torch.backends.cudnn.benchmark = False

seed_everything(41) # Fix the seed

## Import

In [26]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
# Suppress every warning raised after this point (no category or module filter is given).
warnings.filterwarnings(action='ignore')

In [27]:
# Read the three competition splits from the mounted Drive folder, in the
# order train -> validation -> test.
train_df, val_df, test = map(
    pd.read_csv,
    (
        './drive/MyDrive/신용카드 사기 데이콘/open/train.csv',
        './drive/MyDrive/신용카드 사기 데이콘/open/val.csv',
        './drive/MyDrive/신용카드 사기 데이콘/open/test.csv',
    ),
)

# Quick sanity check of the (rows, columns) of each split.
print(train_df.shape, val_df.shape, test.shape)

(113842, 31) (28462, 32) (142503, 31)


In [28]:
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline


In [31]:
# The training split has no label column, so only the identifier is removed.
train_x = train_df.drop('ID', axis=1) # model input features

In [32]:
def get_pred_label(model_pred):
    """Convert outlier-detector predictions to 0/1 class labels.

    The outlier detector used in this notebook predicts ``1`` for inliers
    (normal) and ``-1`` for outliers (fraud). This maps them to ``0`` (normal)
    and ``1`` (fraud).

    Args:
        model_pred: Array-like of predictions containing ``1`` and ``-1``.

    Returns:
        numpy.ndarray of the same shape with ``1 -> 0`` and ``-1 -> 1``. Any
        other value is passed through unchanged.
    """
    # Coerce to ndarray first: comparing a plain Python list with `== 1`
    # yields a single False instead of an element-wise mask, which would leave
    # inliers unmapped.
    model_pred = np.asarray(model_pred)
    # Order matters: inliers are zeroed before outliers become 1, so the
    # freshly written 1s are never remapped to 0.
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

## Model definition and fitting

In [33]:
# Validation split: ground-truth labels, and features with the identifier and
# label columns removed.
val_y = val_df['Class']
val_x = val_df.drop(['ID', 'Class'], axis=1)

# Fit the elliptic-envelope outlier detector on the training features.
model = EllipticEnvelope(random_state=42, contamination=0.00112, support_fraction=0.994)
model.fit(train_x)

# Predict on the validation features, convert to 0/1 labels and report the
# macro-averaged F1 together with the per-class breakdown.
val_pred = get_pred_label(model.predict(val_x))
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

Validation F1 Score : [0.9236496787663914]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.86      0.83      0.85        30

    accuracy                           1.00     28462
   macro avg       0.93      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462



## test

In [34]:
# Test features: every column except the identifier.
test_x = test.drop('ID', axis=1)

In [35]:
# Best model: the fitted EllipticEnvelope. Predict on the test features and
# convert the detector's output straight to 0/1 labels.
test_pred = get_pred_label(model.predict(test_x))

## submission

In [37]:
# Load the sample submission to use as the template for the output file, then
# preview its first rows.
submit = pd.read_csv(filepath_or_buffer='./drive/MyDrive/신용카드 사기 데이콘/open/sample_submission.csv')
submit.head(n=5)

Unnamed: 0,ID,Class
0,AAAA0x1,1
1,AAAA0x2,1
2,AAAA0x5,1
3,AAAA0x7,1
4,AAAA0xc,1


In [38]:
# Replace the placeholder labels with the model's test predictions.
submit['Class'] = test_pred
# Write the submission file without the DataFrame index.
submit.to_csv(
    path_or_buf='./drive/MyDrive/신용카드 사기 데이콘/open/elliptic_submit.csv',
    index=False,
)