### Baseline Model (Ham Veri)
Feature engineering yapmadan, ham veriyle ve stratified %70-15-15 ayrımıyla LGBM modeli kurdum.

0.886 ROC-AUC skoru elde ederek, ilerideki iyileştirmeler için temel bir referans noktası (baseline) oluşturdum.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, classification_report

In [2]:
df = pd.read_csv('../data/raw/creditcard.csv')

In [3]:
X = df.drop(['Class', 'Time'], axis=1)

In [4]:
y = df['Class']

In [5]:
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, 
    test_size=0.3,   # %30'u kenara ayiriyoruz
    random_state=42, 
    stratify=y       # Fraud oranini koru
)

In [6]:
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, 
    test_size=0.5,   # Kalanin yarisi Test olsun
    random_state=42, 
    stratify=y_temp  # Burada da Fraud oranini koru
)

In [7]:
print(f"Train Seti Boyutu : {X_train.shape[0]} satır")
print(f"Val Seti Boyutu   : {X_val.shape[0]} satır")
print(f"Test Seti Boyutu  : {X_test.shape[0]} satır")

Train Seti Boyutu : 199364 satır
Val Seti Boyutu   : 42721 satır
Test Seti Boyutu  : 42722 satır


In [8]:
model = LGBMClassifier(
    n_estimators=1000,  # Maksimum ağaç sayisi (Erken durdurma yapacagimiz icin yuksek verdik)
    learning_rate=0.05,
    random_state=42,
    verbose=-1          # Gereksiz uyarilari gizle
)

In [9]:
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",          
)

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,1000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
y_prob = model.predict_proba(X_test)[:, 1]
test_score = roc_auc_score(y_test, y_prob)

print(f"\n--- Final Test ROC-AUC Skoru: {test_score:.5f} ---")


--- Final Test ROC-AUC Skoru: 0.88575 ---
