In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
%pip install mlflow
%pip install dagshub

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
import mlflow
import mlflow.sklearn
import time
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, confusion_matrix, classification_report,
    precision_recall_curve, roc_curve, average_precision_score
)
import dagshub
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [4]:
# Initialize MLflow tracking
try:
    # Initialize Dagshub
    dagshub.init(repo_owner='konstantine25b', repo_name='IEEE-CIS-Fraud-Detection', mlflow=True)
    print("DagsHub initialized successfully.")
    mlflow.set_experiment("IEEE-CIS-Fraud-Detection_xgboost")
    print(f"MLflow experiment set to: {mlflow.get_experiment_by_name('IEEE-CIS-Fraud-Detection_xgboost').name}")
    mlflow_active = True
except Exception as e:
    print(f"Could not initialize DagsHub or set MLflow experiment: {e}")
    print("Proceeding without MLflow tracking.")
    mlflow_active = False

DagsHub initialized successfully.
MLflow experiment set to: IEEE-CIS-Fraud-Detection_xgboost


In [5]:
# Start MLflow run
run_name = f"xgboost_{time.strftime('%Y%m%d_%H%M%S')}"
if mlflow_active:
    mlflow.start_run(run_name=run_name)
    print(f"MLflow run started with name: {run_name}")

MLflow run started with name: xgboost_20250420_131942


In [None]:
print("\n--- Loading Original Data from Kaggle ---")
try:
    identity_df = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
    transaction_df = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
    print(f"Loaded identity data shape: {identity_df.shape}")
    print(f"Loaded transaction data shape: {transaction_df.shape}")
except FileNotFoundError:
    print("Error: One or both of the CSV files were not found. Please make sure the file paths are correct.")
    if mlflow_active:
        mlflow.end_run()
    exit()


--- Loading Original Data from Kaggle ---
Loaded identity data shape: (144233, 41)
Loaded transaction data shape: (590540, 394)


In [None]:
# Load the preprocessing pipeline from MLflow
print("\n--- Loading Preprocessing Pipeline from MLflow ---")
# Replace with your actual run ID from the preprocessing pipeline
preprocessing_run_id = '962cdbe1451f4abe864ff349e123e7de'  # Example run ID
try:
    # Load the transaction pipeline model
    transaction_pipeline = mlflow.sklearn.load_model(f'runs:/{preprocessing_run_id}/transaction_pipeline_model')
    print("Loaded transaction preprocessing pipeline from MLflow.")
    
    # Load the identity pipeline model if it exists
    try:
        identity_pipeline = mlflow.sklearn.load_model(f'runs:/{preprocessing_run_id}/identity_pipeline_model')
        print("Loaded identity preprocessing pipeline from MLflow.")
        identity_pipeline_exists = True
    except:
        print("Identity preprocessing pipeline not found. Will only use transaction pipeline.")
        identity_pipeline_exists = False
    
    # Load the feature selection information
    try:
        selected_features = mlflow.artifacts.load_text(f'runs:/{preprocessing_run_id}/selected_features.txt').strip().split('\n')
        print(f"Loaded {len(selected_features)} selected features from MLflow.")
    except:
        print("Selected features list not found. Will use all features after preprocessing.")
        selected_features = None
except Exception as e:
    print(f"Error loading preprocessing pipeline from MLflow: {e}")
    print("Please ensure you have the correct run ID and the pipeline is properly saved.")
    if mlflow_active:
        mlflow.end_run()
    exit()

# Split the data
X_transaction = transaction_df.drop('isFraud', axis=1)
y_transaction = transaction_df['isFraud']

X_train, X_test, y_train, y_test = train_test_split(
    X_transaction, y_transaction, test_size=0.2, random_state=42, stratify=y_transaction
)
print(f"Train set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")



--- Loading Preprocessing Pipeline from MLflow ---


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Loaded transaction preprocessing pipeline from MLflow.


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Loaded identity preprocessing pipeline from MLflow.


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Loaded 50 selected features from MLflow.
Train set shape: (472432, 393)
Test set shape: (118108, 393)


In [None]:
# Get the original feature names before preprocessing
original_feature_names = X_train.columns.tolist()
print(f"Original feature count: {len(original_feature_names)}")

# Apply the transaction preprocessing pipeline
print("Applying transaction preprocessing pipeline...")
X_train_processed = transaction_pipeline.transform(X_train)
X_test_processed = transaction_pipeline.transform(X_test)

# Convert to DataFrame with feature names that match the preprocessing pipeline
# First, use generic names
feature_names = [f"feature_{i}" for i in range(X_train_processed.shape[1])]
X_train_final = pd.DataFrame(X_train_processed, columns=feature_names)
X_test_final = pd.DataFrame(X_test_processed, columns=feature_names)


Original feature count: 393
Applying transaction preprocessing pipeline...


In [9]:
# Apply feature selection if available
if selected_features is not None:
    print(f"Applying feature selection to keep {len(selected_features)} features...")
    
    # Print some of the selected feature names to understand their format
    print(f"Sample selected feature names: {selected_features[:5]}")
    
    # Check if any of the selected features match the original feature names
    original_matches = [f for f in selected_features if f in original_feature_names]
    if original_matches:
        print(f"Found {len(original_matches)} features that match original feature names.")
        
        # Create a mapping from original feature names to processed feature indices
        # This is a simplified approach - in reality, the mapping might be more complex
        feature_mapping = {}
        for i, feature in enumerate(original_feature_names):
            if i < len(feature_names):
                feature_mapping[feature] = feature_names[i]
        
        # Map selected features to their corresponding processed features
        mapped_features = []
        for feature in selected_features:
            if feature in feature_mapping:
                mapped_features.append(feature_mapping[feature])
        
        if mapped_features:
            print(f"Mapped {len(mapped_features)} selected features to processed features.")
            
            # Apply the selection
            X_train_final = X_train_final[mapped_features]
            X_test_final = X_test_final[mapped_features]
        else:
            print("Could not map any selected features to processed features.")
            print("Using all processed features.")
    else:
        # If selected features don't match original names, they might be indices or have a different format
        print("Selected features don't match original feature names.")
        
        # Print all selected features to understand their format
        print(f"First 5 selected features: {selected_features[:5]}")
        print(f"Last 5 selected features: {selected_features[-5:]}")
        
        # Check what columns are actually available in the processed data
        print(f"Available columns in processed data: {X_train_final.columns[:10]}...")
        
        # Extract the indices from the selected feature names
        feature_indices = []
        for feature in selected_features:
            # Try different patterns to extract indices
            if '_x' in feature or '_y' in feature:
                # Format like '0_x' or '1_y'
                parts = feature.split('_')
                if parts[0].isdigit():
                    feature_indices.append(int(parts[0]))
            elif feature.isdigit():
                # Format like '68'
                feature_indices.append(int(feature))
        
        # Get the available columns in the processed data
        available_columns = X_train_final.columns.tolist()
        
        # Create a new DataFrame with the original feature names
        X_train_renamed = pd.DataFrame()
        X_test_renamed = pd.DataFrame()
        
        # Map each selected feature to a column in the processed data
        for i, feature in enumerate(selected_features):
            if i < len(available_columns):
                # Use the i-th column from the processed data for the i-th selected feature
                # Extract as a Series using .iloc to avoid the DataFrame issue
                X_train_renamed[feature] = X_train_final.iloc[:, i].values
                X_test_renamed[feature] = X_test_final.iloc[:, i].values
            else:
                print(f"Warning: Not enough columns in processed data for feature {feature}")
        
        # Use the renamed DataFrames
        X_train_final = X_train_renamed
        X_test_final = X_test_renamed
        
        print(f"Final train set shape after preprocessing: {X_train_final.shape}")
        print(f"Final test set shape after preprocessing: {X_test_final.shape}")
        print(f"Feature names in final dataset: {X_train_final.columns.tolist()[:5]}...")
else:
    print("No selected features provided. Using all features from the preprocessing pipeline.")

print(f"Final train set shape after preprocessing: {X_train_final.shape}")
print(f"Final test set shape after preprocessing: {X_test_final.shape}")


Applying feature selection to keep 50 features...
Sample selected feature names: ['0_x', '1_x', '2_x', '3_x', '4_x']
Selected features don't match original feature names.
First 5 selected features: ['0_x', '1_x', '2_x', '3_x', '4_x']
Last 5 selected features: ['6_y', '7_y', '8_y', '30_y', '31_y']
Available columns in processed data: Index(['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4',
       'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9'],
      dtype='object')...
Final train set shape after preprocessing: (472432, 50)
Final test set shape after preprocessing: (118108, 50)
Feature names in final dataset: ['0_x', '1_x', '2_x', '3_x', '4_x']...
Final train set shape after preprocessing: (472432, 50)
Final test set shape after preprocessing: (118108, 50)


In [10]:

# Check for NaN values in the processed data
print("\n--- Checking for NaN Values in Processed Data ---")
train_nan_count = X_train_final.isna().sum().sum()
test_nan_count = X_test_final.isna().sum().sum()

print(f"Number of NaN values in training data: {train_nan_count}")
print(f"Number of NaN values in testing data: {test_nan_count}")

if train_nan_count > 0 or test_nan_count > 0:
    print("WARNING: NaN values found in processed data!")
    
    # Get columns with NaN values
    train_nan_cols = X_train_final.columns[X_train_final.isna().any()].tolist()
    test_nan_cols = X_test_final.columns[X_test_final.isna().any()].tolist()
    
    if train_nan_cols:
        print(f"Training data columns with NaN values: {train_nan_cols}")
        print(f"NaN counts per column: \n{X_train_final[train_nan_cols].isna().sum()}")
    
    if test_nan_cols:
        print(f"Testing data columns with NaN values: {test_nan_cols}")
        print(f"NaN counts per column: \n{X_test_final[test_nan_cols].isna().sum()}")
    
    # Fill NaN values with median as a quick fix
    print("Filling NaN values with column medians...")
    for col in train_nan_cols:
        median_val = X_train_final[col].median()
        X_train_final[col] = X_train_final[col].fillna(median_val)
    
    for col in test_nan_cols:
        if col in X_train_final.columns:
            median_val = X_train_final[col].median()
        else:
            median_val = X_test_final[col].median()
        X_test_final[col] = X_test_final[col].fillna(median_val)
    
    # Verify NaN values are gone
    train_nan_count_after = X_train_final.isna().sum().sum()
    test_nan_count_after = X_test_final.isna().sum().sum()
    print(f"NaN values after filling - training data: {train_nan_count_after}")
    print(f"NaN values after filling - testing data: {test_nan_count_after}")
else:
    print("No NaN values found in processed data. Preprocessing pipeline handled missing values correctly.")



--- Checking for NaN Values in Processed Data ---
Number of NaN values in training data: 0
Number of NaN values in testing data: 0
No NaN values found in processed data. Preprocessing pipeline handled missing values correctly.


In [11]:
# Log preprocessing information
if mlflow_active:
    mlflow.log_param("original_features", X_train.shape[1])
    mlflow.log_param("final_features", X_train_final.shape[1])


# Setting up Cross-Validation and Hyperparameter Tuning

In [26]:
# Define the pipeline with scaler and XGBoost classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(
        objective='binary:logistic',
        random_state=42,
        use_label_encoder=False,
        eval_metric='auc'
    ))
])
param_grid = {
    'classifier__n_estimators': [200, 300],  # More trees for better learning
    'classifier__max_depth': [4, 8],  # Try deeper trees to capture complex patterns
    'classifier__learning_rate': [0.05, 0.01],  # Slower learning rate for better convergence
    'classifier__scale_pos_weight': [25, 45],  # Better balance for positive class
    'classifier__min_child_weight': [3, 5],  # Better control of overfitting
    'classifier__subsample': [0.7],  # Prevent overfitting
    'classifier__colsample_bytree': [0.8],  # Feature subsampling
    'classifier__gamma': [0.1],  # Minimum loss reduction for split
    'classifier__reg_alpha': [0.1, 1.0],  # L1 regularization
    'classifier__reg_lambda': [1.0, 10.0]  # L2 regularization
}

In [27]:
# Reduce cross-validation folds
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sample 10% of the training data for hyperparameter tuning (increased from 7%)
sample_size = int(0.10 * len(X_train_final))
indices = np.random.choice(len(X_train_final), sample_size, replace=False)
X_train_sample = X_train_final.iloc[indices]
y_train_sample = y_train.iloc[indices]

# Use the sampled data for grid search
print(f"Using {len(X_train_sample)} samples for hyperparameter tuning...")
grid_search = GridSearchCV(
    pipeline, 
    param_grid, 
    cv=cv, 
    scoring='f1',  # Optimize for F1 score which balances precision and recall
    verbose=1,
    n_jobs=-1
)
grid_search.fit(X_train_sample, y_train_sample)

Using 47243 samples for hyperparameter tuning...
Fitting 5 folds for each of 128 candidates, totalling 640 fits


In [28]:
# After finding best parameters, train final model on full dataset
print("Training final model with best parameters on full dataset...")
best_params = grid_search.best_params_

# Create a new XGBoost classifier with the best parameters
best_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=best_params['classifier__n_estimators'],
    max_depth=best_params['classifier__max_depth'],
    learning_rate=best_params['classifier__learning_rate'],
    scale_pos_weight=best_params['classifier__scale_pos_weight'],
    min_child_weight=best_params.get('classifier__min_child_weight', 1),
    subsample=best_params.get('classifier__subsample', 0.8),
    colsample_bytree=best_params.get('classifier__colsample_bytree', 0.8),
    gamma=best_params.get('classifier__gamma', 0),
    reg_alpha=best_params.get('classifier__reg_alpha', 0),
    reg_lambda=best_params.get('classifier__reg_lambda', 1),
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc',
    n_jobs=-1
)

Training final model with best parameters on full dataset...


In [29]:
# Use a validation set for early stopping in the final model
X_train_fit, X_val, y_train_fit, y_val = train_test_split(
    X_train_final, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Apply StandardScaler to the data
scaler = StandardScaler()
X_train_fit_scaled = scaler.fit_transform(X_train_fit)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_final)

# Train the model with more early stopping rounds
best_xgb.fit(
    X_train_fit_scaled, y_train_fit,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True,  # Show progress
    early_stopping_rounds=100  # More patience for early stopping
)



[0]	validation_0-auc:0.85134
[1]	validation_0-auc:0.86962
[2]	validation_0-auc:0.87397
[3]	validation_0-auc:0.87770
[4]	validation_0-auc:0.88075
[5]	validation_0-auc:0.88248
[6]	validation_0-auc:0.88291
[7]	validation_0-auc:0.88320
[8]	validation_0-auc:0.88374
[9]	validation_0-auc:0.88407
[10]	validation_0-auc:0.88466
[11]	validation_0-auc:0.88515
[12]	validation_0-auc:0.88535
[13]	validation_0-auc:0.88572
[14]	validation_0-auc:0.88614
[15]	validation_0-auc:0.88662
[16]	validation_0-auc:0.88727
[17]	validation_0-auc:0.88908
[18]	validation_0-auc:0.88912
[19]	validation_0-auc:0.88917
[20]	validation_0-auc:0.89050
[21]	validation_0-auc:0.89084
[22]	validation_0-auc:0.89091
[23]	validation_0-auc:0.89125
[24]	validation_0-auc:0.89163
[25]	validation_0-auc:0.89195
[26]	validation_0-auc:0.89215
[27]	validation_0-auc:0.89324
[28]	validation_0-auc:0.89390
[29]	validation_0-auc:0.89477
[30]	validation_0-auc:0.89539
[31]	validation_0-auc:0.89587
[32]	validation_0-auc:0.89616
[33]	validation_0-au

In [30]:
# Find optimal threshold for better precision-recall balance
print("\n--- Finding Optimal Classification Threshold ---")
y_val_pred_proba = best_xgb.predict_proba(X_val_scaled)[:, 1]
precision_curve, recall_curve, thresholds = precision_recall_curve(y_val, y_val_pred_proba)

# Calculate F1 score for each threshold
f1_scores = 2 * (precision_curve * recall_curve) / (precision_curve + recall_curve + 1e-10)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5
print(f"Optimal threshold: {optimal_threshold:.4f}")
print(f"At optimal threshold - Precision: {precision_curve[optimal_idx]:.4f}, Recall: {recall_curve[optimal_idx]:.4f}, F1: {f1_scores[optimal_idx]:.4f}")

# Use this model for predictions with optimal threshold
model = best_xgb
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_proba >= optimal_threshold).astype(int)



--- Finding Optimal Classification Threshold ---
Optimal threshold: 0.7960
At optimal threshold - Precision: 0.6772, Recall: 0.5913, F1: 0.6314


In [31]:

# Log best parameters and cross-validation scores
if mlflow_active:
    mlflow.log_params(best_params)
    mlflow.log_metric("best_cv_score", grid_search.best_score_)
    
    # Log all CV results
    cv_results = pd.DataFrame(grid_search.cv_results_)
    mlflow.log_text(cv_results.to_string(), "cv_results.txt")


RestException: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}

In [32]:
# Evaluate the model
print("\n--- Evaluating Model Performance ---")

# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
avg_precision = average_precision_score(y_test, y_pred_proba)

# Print metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")



--- Evaluating Model Performance ---
Accuracy: 0.9762
Precision: 0.6805
Recall: 0.6039
F1 Score: 0.6399
ROC AUC: 0.9436
Average Precision: 0.6762


In [33]:
# Log metrics
if mlflow_active:
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("avg_precision", avg_precision)

# Print classification report
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Log classification report
if mlflow_active:
    mlflow.log_text(report, "classification_report.txt")



Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    113975
           1       0.68      0.60      0.64      4133

    accuracy                           0.98    118108
   macro avg       0.83      0.80      0.81    118108
weighted avg       0.98      0.98      0.98    118108



In [34]:
# End MLflow run
if mlflow_active:
    mlflow.end_run()
    print("MLflow run completed and artifacts logged.")

🏃 View run xgboost_20250420_131942 at: https://dagshub.com/konstantine25b/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/8/runs/229c272df1ec46f9b7e97ded0942dc53
🧪 View experiment at: https://dagshub.com/konstantine25b/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/8
MLflow run completed and artifacts logged.
