In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## 시드고정

In [27]:
import random
import os
import numpy as np
import torch
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(41) # Seed 고정

## Import

In [28]:
import pandas as pd
import numpy as np

from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

In [29]:
train_df = pd.read_csv('./drive/MyDrive/신용카드 사기 데이콘/open/train.csv')
val_df = pd.read_csv('./drive/MyDrive/신용카드 사기 데이콘/open/val.csv')
test = pd.read_csv('./drive/MyDrive/신용카드 사기 데이콘/open/test.csv')

print(train_df.shape, val_df.shape, test.shape)

(113842, 31) (28462, 32) (142503, 31)


In [30]:
from sklearn import svm
from sklearn.datasets import make_moons, make_blobs
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import SGDOneClassSVM
from sklearn.kernel_approximation import Nystroem
from sklearn.pipeline import make_pipeline


In [31]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.preprocessing import StandardScaler
from sklearn.covariance import EllipticEnvelope

In [32]:
# Train dataset은 Label이 존재하지 않음
train_df = train_df.drop(columns=['ID']) # Input Data

In [33]:
from sklearn.preprocessing import StandardScaler
# StandardScaler 선언 및 Fitting
sdscaler = StandardScaler()
sdscaler.fit(train_df)

# 데이터 변환
sdscaled_data = sdscaler.transform(train_df)

# 데이터 프레임으로 저장
sdscaled_data = pd.DataFrame(sdscaled_data)

In [34]:
sdscaled_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-0.696317,-0.812481,1.178089,0.271798,-0.368309,1.349308,0.652511,0.210988,-1.380804,0.188955,...,0.345210,1.066111,1.431834,-1.136246,-0.633230,-0.288586,-0.137969,-0.166355,1.188563,-1.990839
1,-0.495358,-0.112967,1.191305,-0.608173,-0.007886,0.933476,0.192541,0.320444,-1.264291,-0.056156,...,-0.148280,0.007285,-0.297148,-1.938909,1.241563,-0.460803,0.157587,0.173998,0.143876,-1.990839
2,-0.218427,0.580982,0.755819,-0.116154,0.307498,-0.026206,0.390690,0.221649,-0.517216,-0.351540,...,-0.286720,-0.773425,-0.039431,-0.611606,-0.450853,0.220818,0.635984,0.229098,-0.346737,-1.990818
3,-0.330318,0.858041,0.711241,-0.345477,0.693558,0.317890,0.925884,-3.209968,0.563699,1.161326,...,2.693508,-1.402900,0.092477,-1.070930,-0.801717,-0.106743,-3.020466,-3.046182,-0.194717,-1.990712
4,-0.458462,0.172537,-0.082109,-0.189262,1.951788,2.793282,0.302611,0.719980,-0.355977,-0.387965,...,-0.099976,-0.370382,-0.319020,1.671161,0.714378,-0.798093,0.029990,0.401295,0.019821,-1.990712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113837,-6.415477,6.169704,-5.669274,-1.774215,-3.354334,-1.051879,-3.021567,4.640268,4.468824,8.074160,...,-1.306812,-2.162159,1.402375,-2.067159,3.432320,0.667510,5.233857,3.463283,-0.321352,1.645815
113838,0.965967,-0.087719,-0.674497,1.069597,-0.026160,-0.465046,0.153202,-0.208020,0.610333,0.107995,...,0.201179,0.876793,-0.064140,-0.086360,0.605158,-0.958774,0.046304,-0.113892,-0.116108,1.645857
113839,-0.124097,0.430608,0.260596,-0.325094,0.178465,-1.013702,0.767042,-0.171877,0.098905,-0.270613,...,-0.315282,-0.710635,0.441651,0.614549,-1.078550,0.235849,0.329761,0.229617,-0.339285,1.645899
113840,0.061566,0.563104,-0.371251,-0.524503,0.826191,-0.181210,0.670168,0.099151,-0.184366,-0.618512,...,-0.433467,-1.117010,0.081219,0.171137,-0.841334,0.258583,0.546111,0.194625,-0.350749,1.645962


In [35]:
def get_pred_label(model_pred):
    # IsolationForest 모델 출력 (1:정상, -1:불량(사기)) 이므로 (0:정상, 1:불량(사기))로 Label 변환
    model_pred = np.where(model_pred == 1, 0, model_pred)
    model_pred = np.where(model_pred == -1, 1, model_pred)
    return model_pred

In [36]:
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data

In [37]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 선언 및 Fitting
StandardScaler = StandardScaler()
StandardScaler.fit(val_x)

# 데이터 변환
val_StandardScaled_data = StandardScaler.transform(val_x)

# 데이터 프레임으로 저장
val_StandardScaled_data = pd.DataFrame(val_StandardScaled_data)

## model define, fit

In [38]:
model = EllipticEnvelope(support_fraction = 0.994, contamination = 0.00112, random_state = 42)
model.fit(sdscaled_data)
val_x = val_df.drop(columns=['ID', 'Class']) # Input Data
val_y = val_df['Class'] # Label

val_pred = model.predict(val_x) # model prediction
val_pred = get_pred_label(val_pred)
val_score = f1_score(val_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_y, val_pred))

Validation F1 Score : [0.9165787375726882]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.83      0.83      0.83        30

    accuracy                           1.00     28462
   macro avg       0.92      0.92      0.92     28462
weighted avg       1.00      1.00      1.00     28462



## test

In [39]:
test_x = test.drop(columns=['ID'])

In [40]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 선언 및 Fitting
StandardScaler = StandardScaler()
StandardScaler.fit(test_x)

# 데이터 변환
test_StandardScaled_data = StandardScaler.transform(test_x)

# 데이터 프레임으로 저장
test_StandardScaled_data = pd.DataFrame(test_StandardScaled_data)

In [41]:
#best model : model "EllipticEnvelope"
test_pred = model.predict(test_StandardScaled_data) # model prediction
test_pred = get_pred_label(test_pred)

## submission

In [42]:
submit = pd.read_csv('./drive/MyDrive/신용카드 사기 데이콘/open/sample_submission.csv')
submit.head()

Unnamed: 0,ID,Class
0,AAAA0x1,1
1,AAAA0x2,1
2,AAAA0x5,1
3,AAAA0x7,1
4,AAAA0xc,1


In [43]:
submit['Class'] = test_pred
submit.to_csv('./drive/MyDrive/신용카드 사기 데이콘/open/elliptic_standardscaling_submit3.csv', index=False)