In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, average_precision_score

In [3]:
# Load the raw transaction table from a CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='creditcard.csv')

In [None]:
# Inspect the data structure.
print("--- Data Info ---")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\n--- Class Distribution (Original) ---")
# Absolute number of rows per label.
print(df['Class'].value_counts())
# Same counts as proportions ("비율" = ratio).
print("비율:\n", df['Class'].value_counts(normalize=True))

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float

## 2. 샘플링

In [5]:
# Separate the rows by label: fraud transactions (Class == 1) and
# normal transactions (Class == 0).
fraud = df.loc[df['Class'].eq(1)]
normal = df.loc[df['Class'].eq(0)]

In [7]:
# Undersample the normal class: draw 10,000 rows with a fixed seed so the
# sample is reproducible.
# NOTE(review): this undersampling happens before the train/test split done
# later in the notebook, so the held-out test set also has the resampled class
# ratio rather than the ratio of the original data — confirm this is intended.
normal_smple = normal.sample(
    n=10_000,
    random_state=42,
)

In [12]:
# Merge the datasets: stack every fraud row on top of the sampled normal rows.
# The original row index labels are kept (no re-indexing).
df_new = pd.concat(objs=[fraud, normal_smple], axis=0)

In [11]:
# Show how many rows of each label remain after sampling.
print(
    "\n--- Class Distribution (After Sampling) ---",
    df_new['Class'].value_counts(),
    sep="\n",
)


--- Class Distribution (After Sampling) ---
Class
0    10000
1      492
Name: count, dtype: int64


## 3. 데이터 전처리

In [14]:
# Standardize the Amount column.
# NOTE(review): the scaler is fit on every row of df_new, i.e. before the
# train/test split performed further down, so rows that end up in the test
# split contribute to the mean/std used for scaling — confirm this is acceptable.
scaler = StandardScaler().fit(df_new[['Amount']])

# Store the standardized values next to the raw column.
df_new['Amount_Scaled'] = scaler.transform(df_new[['Amount']])

In [15]:
# Remove the original Amount column (its standardized copy, Amount_Scaled,
# is added in an earlier cell).
# errors='ignore' keeps this cell idempotent: re-running it after Amount has
# already been dropped is a no-op instead of raising KeyError.
df_new = df_new.drop(columns='Amount', errors='ignore')

# Split into features X (every column except the label) and target y.
X = df_new.drop(columns='Class')
y = df_new['Class']

In [16]:
# Display df_new as the cell's output to inspect the frame after the steps above.
df_new

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Class,Amount_Scaled
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,1,-0.394650
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,1,1.976514
4920,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,1,0.680800
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,1,-0.130191
6329,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1,-0.390168
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
218680,141415.0,-0.762961,1.897243,1.931378,4.191413,0.103570,1.367957,-0.210296,0.799408,-1.949474,...,-0.217645,-0.639138,-0.096265,0.417441,0.001403,0.194527,0.236362,0.106503,0,-0.391244
239359,150069.0,-0.299711,1.079933,-0.500521,-0.571127,1.362166,-0.241336,1.061852,-0.055889,0.025168,...,-0.008621,0.287652,-0.302456,-0.025240,-0.037041,0.588618,0.369017,0.266397,0,-0.278154
262759,160634.0,2.129101,-0.873931,-1.635981,-1.176035,-0.073736,-0.412121,-0.289237,-0.223462,-0.776604,...,-0.034599,-0.262403,0.091163,-1.095939,-0.098260,-0.387646,-0.046397,-0.065703,0,-0.067259
62511,50297.0,1.127518,0.118124,0.339852,0.599886,-0.359735,-0.421149,-0.161974,0.141529,-0.043080,...,-0.201777,-0.683207,0.175118,0.147933,0.035407,0.097054,-0.021608,0.018880,0,-0.322977


In [24]:
# 4. Train/test split
# Hold out 20% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [25]:
# Report the size of each split and the label proportions in the training split.
print(
    "\n--- Train/Test Shape ---",
    f"Train: {X_train.shape}, Test: {X_test.shape}",
    sep="\n",
)
print("Train Class Ratio:\n", y_train.value_counts(normalize=True))


--- Train/Test Shape ---
Train: (8393, 30), Test: (2099, 30)
Train Class Ratio:
 Class
0    0.953056
1    0.046944
Name: proportion, dtype: float64


In [26]:
# 5. SMOTE
# Create the SMOTE oversampler with a fixed seed so the synthetic samples are
# reproducible.
smote = SMOTE(
    random_state=42,
)

In [28]:
# Apply oversampling to the training split only (X_train / y_train); the test
# split is not passed to SMOTE.
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Compare the number of fraud rows (Class == 1) before and after resampling,
# counting with the vectorized boolean-mask sum.
print("\n--- Before vs After SMOTE (Fraud counts) ---")
print(f"Before SMOTE (Class 1): {(y_train == 1).sum()}")
print(f"After SMOTE (Class 1): {(y_train_res == 1).sum()}")


--- Before vs After SMOTE (Fraud counts) ---
Before SMOTE (Class 1): 394
After SMOTE (Class 1): 7999


In [29]:
# 6. Model training and evaluation
# Train a random forest on the SMOTE-resampled training data; n_jobs=-1 lets
# it use all available cores and the seed keeps the fit reproducible.
model = RandomForestClassifier(n_jobs=-1, random_state=42)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(X=X_train_res, y=y_train_res)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [30]:
# Predict on the held-out test split.
# Column 1 of predict_proba is taken as the fraud score; with labels 0/1 this
# should be the Class == 1 column (order follows model.classes_).
y_pred_proba = model.predict_proba(X_test)[:, 1]
# Hard class labels for the classification report.
y_pred = model.predict(X_test)

In [31]:
# Print the evaluation metrics.
# NOTE(review): X_test / y_test come from the undersampled df_new, so these
# scores describe the resampled class mix, not the original class distribution
# — confirm before quoting them as real-world performance.
pr_auc = average_precision_score(y_true=y_test, y_score=y_pred_proba)

print("\n--- Classification Report ---")
print(classification_report(y_true=y_test, y_pred=y_pred))
print(f"PR-AUC Score: {pr_auc:.4f}")


--- Classification Report ---
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2001
           1       0.95      0.89      0.92        98

    accuracy                           0.99      2099
   macro avg       0.97      0.94      0.96      2099
weighted avg       0.99      0.99      0.99      2099

PR-AUC Score: 0.9538
