In [4]:
#k-fold cross-validation
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import joblib
X_train_bal_scaled, y_train_bal = joblib.load('C:/Users/kamil/Documents/pythonProject1/fraud_detection/data/processed/train.pkl')
rf = RandomForestClassifier(random_state=42)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    rf,
    X_train_bal_scaled,
    y_train_bal,
    scoring='f1',
    cv=cv,
    n_jobs=1
)

print("Cross-val F1-scores:", scores)
print("Mean F1:", scores.mean())

Cross-val F1-scores: [0.99999217 0.99998826 0.99997652 0.99998043 0.99997261]
Mean F1: 0.9999819981599998


In [12]:
#NN with dropout, earlystopping
'''
Dropout randomly resets some neurons during training to 
prevent the model from getting used to the training set
'''
from tensorflow.keras import layers, models
import joblib
X_val_scaled, y_val = joblib.load('C:/Users/kamil/Documents/pythonProject1/fraud_detection/data/processed/val.pkl')

model = models.Sequential()
model.add(layers.Dense(16, activation='relu', input_shape=(X_train_bal_scaled.shape[1],)))
model.add(layers.Dropout(0.2))  # 20% dropout
model.add(layers.Dense(8, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

'''
EarlyStopping callback
stops training when the model stops improving on the validation set.
'''
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

history = model.fit(
    X_train_bal_scaled, y_train_bal,
    validation_data=(X_val_scaled, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[es]
)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m39928/39928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 1ms/step - accuracy: 0.9511 - loss: 0.1307 - val_accuracy: 0.9836 - val_loss: 0.0388
Epoch 2/100
[1m39928/39928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9844 - loss: 0.0459 - val_accuracy: 0.9893 - val_loss: 0.0264
Epoch 3/100
[1m39928/39928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9862 - loss: 0.0412 - val_accuracy: 0.9872 - val_loss: 0.0303
Epoch 4/100
[1m39928/39928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9870 - loss: 0.0391 - val_accuracy: 0.9885 - val_loss: 0.0285
Epoch 5/100
[1m39928/39928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9873 - loss: 0.0387 - val_accuracy: 0.9880 - val_loss: 0.0314
Epoch 6/100
[1m39928/39928[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1ms/step - accuracy: 0.9875 - loss: 0.0384 - val_accuracy: 0.9891 - val_loss: 0.0273
Epoc

In [18]:
#anomaly / novelty detection / isolation forest
'''
used isolation forest because model will treat "unusual" 
transactions like anomaly
'''
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, classification_report

iso_forest = IsolationForest(random_state=42, n_estimators=100)
iso_forest.fit(X_train_bal_scaled)

#prediction: +1 = normal, -1 =anomaly
y_val_pred_if = iso_forest.predict(X_val_scaled)

# changing -1 to 1 (fraud) and +1 to 0 (normal), to compare to y_val
y_val_pred_if = [1 if x == -1 else 0 for x in y_val_pred_if]

print(confusion_matrix(y_val, y_val_pred_if))
print(classification_report(y_val, y_val_pred_if))

[[144730  37771]
 [ 14815   2664]]
              precision    recall  f1-score   support

         0.0       0.91      0.79      0.85    182501
         1.0       0.07      0.15      0.09     17479

    accuracy                           0.74    199980
   macro avg       0.49      0.47      0.47    199980
weighted avg       0.83      0.74      0.78    199980



In [5]:
#building final model// rf was 1.0 so :) 
import joblib

X_train_bal_scaled, y_train_bal = joblib.load('C:/Users/kamil/Documents/pythonProject1/fraud_detection/data/processed/train.pkl')

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators':[50,100,200],'max_depth':[None,5,10]}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf, param_grid=param_grid, scoring='f1', cv=3, n_jobs=-1)
grid_search.fit(X_train_bal_scaled, y_train_bal)
print(grid_search.best_params_, grid_search.best_score_)

{'max_depth': None, 'n_estimators': 200} 0.9999569498123181


In [6]:
best_params = grid_search.best_params_
best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf.fit(X_train_bal_scaled, y_train_bal)

In [8]:
import joblib
joblib.dump(best_rf, "C:/Users/kamil/Documents/pythonProject1/fraud_detection/models/best_random_forest.pkl")

['C:/Users/kamil/Documents/pythonProject1/fraud_detection/models/best_random_forest.pkl']