In [15]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, roc_auc_score
import matplotlib.pyplot as plt

# Step 1: read the two raw CSV tables (transactions and identity records).
train_transaction = pd.read_csv("train_transaction.csv")
train_identity = pd.read_csv("train_identity.csv")

# Step 2: left-join on TransactionID so every transaction row is retained,
# whether or not a matching identity row exists.
df = train_transaction.merge(train_identity, on="TransactionID", how="left")

# Step 3: crude imputation - every missing value in the merged frame becomes 0.
df.fillna(0, inplace=True)

# Step 4: build the feature matrix from the numeric columns only, leaving out
# the row identifier and the target column.
numeric_part = df.select_dtypes(include=[np.number])
X = numeric_part.drop(columns=["TransactionID", "isFraud"])

# Step 5: keep the fraud label (0 or 1) aside; it is used for evaluation only,
# never for fitting.
y = df["isFraud"].values

# Step 6: standardize the features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 7: project the standardized features onto 5 principal components.
pca = PCA(n_components=5, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Step 8: unsupervised Local Outlier Factor, fitted on the same points it
# scores (novelty=False). The fitted attribute is negated so that a larger
# score means a more anomalous point.
lof = LocalOutlierFactor(n_neighbors=20, novelty=False)
lof.fit(X_pca)
lof_decision_scores = np.negative(lof.negative_outlier_factor_)

# Step 9: flag as outliers the points whose score lies strictly above the
# 95th percentile of all scores (roughly the top 5%).
threshold = np.percentile(lof_decision_scores, 95)
y_pred = np.greater(lof_decision_scores, threshold).astype(int)

# Step 10: compare the flags and the raw scores against the true labels.
print("📊 Classification Report:")
print(classification_report(y, y_pred, target_names=["Not Fraud", "Fraud"]))
print(f"🎯 ROC AUC Score: {roc_auc_score(y, lof_decision_scores):.4f}")


📊 Classification Report:
              precision    recall  f1-score   support

   Not Fraud       0.98      0.95      0.97      1469
       Fraud       0.05      0.13      0.08        31

    accuracy                           0.93      1500
   macro avg       0.52      0.54      0.52      1500
weighted avg       0.96      0.93      0.95      1500

🎯 ROC AUC Score: 0.6512
