In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [4]:
%pip install mlflow
%pip install dagshub

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [5]:
import xgboost as xgb
import mlflow
import mlflow.xgboost
import mlflow.sklearn
import time
import warnings
import gc
import os
import dagshub
from sklearn.metrics import roc_auc_score, average_precision_score
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Remember when the notebook started so the total runtime can be reported later.
start_time = time.time()

print("=== IEEE-CIS Fraud Detection - Test Prediction ===")

# Connect MLflow tracking through DagsHub. Any failure here is reported and the
# notebook carries on with tracking switched off (mlflow_active = False).
try:
    dagshub.init(repo_owner='konstantine25b', repo_name='IEEE-CIS-Fraud-Detection', mlflow=True)
    print("DagsHub initialized successfully.")
    mlflow.set_experiment("IEEE-CIS Fraud Detection_Test_Prediction")
    # Read the experiment back so the printed name is the one MLflow actually holds.
    print(f"MLflow experiment set to: {mlflow.get_experiment_by_name('IEEE-CIS Fraud Detection_Test_Prediction').name}")
except Exception as init_error:
    print(f"Could not initialize DagsHub or set MLflow experiment: {init_error}")
    print("Proceeding without MLflow tracking.")
    mlflow_active = False
else:
    mlflow_active = True

=== IEEE-CIS Fraud Detection - Test Prediction ===


Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=563c9efb-b3f4-442a-a93b-526ba60be81b&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b279f1369eb140a0b74a8b2d5667191ac3392e630f839eae0d1826ff2789c0c1




DagsHub initialized successfully.


2025/04/21 11:23:24 INFO mlflow.tracking.fluent: Experiment with name 'IEEE-CIS Fraud Detection_Test_Prediction' does not exist. Creating a new experiment.


MLflow experiment set to: IEEE-CIS Fraud Detection_Test_Prediction


In [7]:
# Name the run after the current local time, then open it if tracking is enabled.
run_timestamp = time.strftime('%Y%m%d_%H%M%S')
run_name = f"test_prediction_{run_timestamp}"

if mlflow_active:
    try:
        mlflow.start_run(run_name=run_name)
    except Exception as start_error:
        # A run that cannot be started disables tracking for the rest of the notebook.
        print(f"Could not start MLflow run: {start_error}")
        mlflow_active = False
    else:
        print(f"MLflow run started with name: {run_name}")

MLflow run started with name: test_prediction_20250421_112340


# Loading Test Data

In [23]:
print("\n--- Loading Test Data ---")

# Both CSV files are read from the competition input folder, relative to the working directory.
test_transaction_path = '../input/ieee-fraud-detection/test_transaction.csv'
test_identity_path = '../input/ieee-fraud-detection/test_identity.csv'

# Transaction table
print("Loading test transaction data...")
test_transaction = pd.read_csv(test_transaction_path)
print(f"Test transaction data loaded: {test_transaction.shape}")

# Identity table
print("Loading test identity data...")
test_identity = pd.read_csv(test_identity_path)
print(f"Test identity data loaded: {test_identity.shape}")



--- Loading Test Data ---
Loading test transaction data...
Test transaction data loaded: (506691, 393)
Loading test identity data...
Test identity data loaded: (141907, 41)


# Load preprocessing pipelines and models from MLflow

In [44]:
print("\n--- Loading Models and Pipelines from MLflow ---")

# MLflow run whose logged artifacts are loaded in this cell.
run_id = "e75d3cbbcd19426cbe1403e0816c2a80"
print(f"Using run ID: {run_id}")

print("Loading preprocessing pipelines...")

# Identity-table preprocessing pipeline (logged with the sklearn flavor).
identity_pipeline_uri = f"runs:/{run_id}/identity_preprocessing_pipeline"
identity_preprocessing_pipeline = mlflow.sklearn.load_model(identity_pipeline_uri)
print("Identity preprocessing pipeline loaded")

# Transaction-table preprocessing pipeline (logged with the sklearn flavor).
transaction_pipeline_uri = f"runs:/{run_id}/transaction_preprocessing_pipeline"
transaction_preprocessing_pipeline = mlflow.sklearn.load_model(transaction_pipeline_uri)
print("Transaction preprocessing pipeline loaded")



--- Loading Models and Pipelines from MLflow ---
Using run ID: e75d3cbbcd19426cbe1403e0816c2a80
Loading preprocessing pipelines...


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Identity preprocessing pipeline loaded


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Transaction preprocessing pipeline loaded


In [45]:
# The three identity helpers below are loaded from the same MLflow run (run_id).

# Flagger: used later to add the has_identity column.
identity_flagger_uri = f"runs:/{run_id}/identity_flagger"
identity_flagger = mlflow.sklearn.load_model(identity_flagger_uri)
print("Identity flagger loaded")

# Splitter: loaded here; its use is not visible in this part of the notebook.
identity_splitter_uri = f"runs:/{run_id}/identity_splitter"
identity_splitter = mlflow.sklearn.load_model(identity_splitter_uri)
print("Identity splitter loaded")

# Merger: used later to attach identity columns to the flagged transactions.
identity_merger_uri = f"runs:/{run_id}/identity_merger"
identity_merger = mlflow.sklearn.load_model(identity_merger_uri)
print("Identity merger loaded")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Identity flagger loaded


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Identity splitter loaded


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Identity merger loaded


In [46]:
# Feature pipelines: one per data split (with / without identity information).
print("Loading feature pipelines...")

with_identity_pipeline_uri = f"runs:/{run_id}/with_identity_feature_pipeline"
with_identity_feature_pipeline = mlflow.sklearn.load_model(with_identity_pipeline_uri)
print("With identity feature pipeline loaded")

without_identity_pipeline_uri = f"runs:/{run_id}/without_identity_feature_pipeline"
without_identity_feature_pipeline = mlflow.sklearn.load_model(without_identity_pipeline_uri)
print("Without identity feature pipeline loaded")

# XGBoost models: again one per split, loaded through the xgboost flavor.
print("Loading XGBoost models...")

with_identity_model_uri = f"runs:/{run_id}/with_identity_model"
with_identity_model = mlflow.xgboost.load_model(with_identity_model_uri)
print("With identity model loaded")

without_identity_model_uri = f"runs:/{run_id}/without_identity_model"
without_identity_model = mlflow.xgboost.load_model(without_identity_model_uri)
print("Without identity model loaded")


Loading feature pipelines...


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

With identity feature pipeline loaded


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Without identity feature pipeline loaded
Loading XGBoost models...


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

With identity model loaded


Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Without identity model loaded


# Applying Preprocessing Pipelines

In [50]:
print("Standardizing identity column names...")

# Map every column that starts with 'id-' to its 'id_' spelling.
identity_columns = test_identity.columns.tolist()
renamed_identity_columns = {
    col: col.replace('id-', 'id_')
    for col in identity_columns
    if col.startswith('id-')
}

if not renamed_identity_columns:
    print("No identity columns need renaming")
else:
    print(f"Renaming {len(renamed_identity_columns)} identity columns to match training data format")
    test_identity = test_identity.rename(columns=renamed_identity_columns)

# Run the loaded identity pipeline; if it raises, drop high-null columns by hand instead.
print("Preprocessing identity data...")
try:
    test_identity_preprocessed = identity_preprocessing_pipeline.transform(test_identity)
    print(f"Preprocessed identity data shape: {test_identity_preprocessed.shape}")
except Exception as pipeline_error:
    print(f"Error applying identity preprocessing pipeline: {pipeline_error}")
    print("Falling back to manual preprocessing...")
    # NOTE(review): the fallback picks columns from the TEST set's null rates (>= 20%),
    # so the surviving columns may differ from what the loaded pipeline keeps - confirm
    # the downstream steps accept that.
    identity_null_percentage = test_identity.isnull().mean() * 100
    high_null_mask = identity_null_percentage >= 20
    identity_high_null_cols = identity_null_percentage[high_null_mask].index.tolist()
    test_identity_preprocessed = test_identity.drop(columns=identity_high_null_cols, errors='ignore')
    print(f"Manually preprocessed identity data shape: {test_identity_preprocessed.shape}")


Standardizing identity column names...
Renaming 38 identity columns to match training data format
Preprocessing identity data...
Dropped 19 high-null columns. New shape: (141907, 22)
Preprocessed identity data shape: (141907, 22)


In [52]:
print("Standardizing transaction column names...")

# Same 'id-' -> 'id_' normalization as for the identity table.
# Add other column name standardizations here if needed.
transaction_columns = test_transaction.columns.tolist()
renamed_transaction_columns = {
    col: col.replace('id-', 'id_')
    for col in transaction_columns
    if col.startswith('id-')
}

if not renamed_transaction_columns:
    print("No transaction columns need renaming")
else:
    print(f"Renaming {len(renamed_transaction_columns)} transaction columns to match training data format")
    test_transaction = test_transaction.rename(columns=renamed_transaction_columns)


Standardizing transaction column names...
No transaction columns need renaming


In [53]:
# Run the loaded transaction pipeline; if it raises, drop high-null columns by hand instead.
print("Preprocessing transaction data...")
try:
    test_transaction_preprocessed = transaction_preprocessing_pipeline.transform(test_transaction)
    print(f"Preprocessed transaction data shape: {test_transaction_preprocessed.shape}")
except Exception as pipeline_error:
    print(f"Error applying transaction preprocessing pipeline: {pipeline_error}")
    print("Falling back to manual preprocessing...")
    # NOTE(review): the fallback picks columns from the TEST set's null rates (>= 60%),
    # so the surviving columns may differ from what the loaded pipeline keeps - confirm
    # the downstream steps accept that.
    transaction_null_percentage = test_transaction.isnull().mean() * 100
    high_null_mask = transaction_null_percentage >= 60
    transaction_high_null_cols = transaction_null_percentage[high_null_mask].index.tolist()
    test_transaction_preprocessed = test_transaction.drop(columns=transaction_high_null_cols, errors='ignore')
    print(f"Manually preprocessed transaction data shape: {test_transaction_preprocessed.shape}")


Preprocessing transaction data...
Dropped 168 high-null columns. New shape: (506691, 225)
Preprocessed transaction data shape: (506691, 225)


In [58]:
print("Adding identity flag...")


def _manual_identity_flag(transactions):
    """Return a 1/0 Series: 1 where the row's TransactionID occurs in the preprocessed test identity table."""
    return transactions['TransactionID'].isin(
        test_identity_preprocessed['TransactionID']).astype(int)


try:
    # First choice: the flagger loaded from MLflow.
    test_transaction_with_flag = identity_flagger.transform(test_transaction_preprocessed)
    flagger_matches = test_transaction_with_flag['has_identity'].sum()
    print(f"Added has_identity flag. Transactions with identity: {flagger_matches}")

    # A flagger that marks no rows at all is overridden with a flag computed
    # directly from the test identity table.
    if flagger_matches == 0:
        print("Identity flagger found no matches. Adding flag manually...")
        test_transaction_with_flag['has_identity'] = _manual_identity_flag(test_transaction_with_flag)
        manual_matches = test_transaction_with_flag['has_identity'].sum()
        print(f"Added identity flag manually. Transactions with identity: {manual_matches}")
except Exception as flagger_error:
    # The flagger itself failed: build the flag on a copy of the preprocessed transactions.
    print(f"Error applying identity flagger: {flagger_error}")
    print("Adding identity flag manually...")
    test_transaction_with_flag = test_transaction_preprocessed.copy()
    test_transaction_with_flag['has_identity'] = _manual_identity_flag(test_transaction_with_flag)
    manual_matches = test_transaction_with_flag['has_identity'].sum()
    print(f"Added identity flag manually. Transactions with identity: {manual_matches}")


Adding identity flag...
Added has_identity flag. Transactions with identity: 0
Added has_identity flag. Transactions with identity: 0
Identity flagger found no matches. Adding flag manually...
Added identity flag manually. Transactions with identity: 141907


In [59]:
print("Splitting data based on identity presence...")

# Split on the has_identity flag: rows flagged 1 go to the WITH-identity frame,
# rows flagged 0 to the WITHOUT-identity frame. (The previous try/except repeated
# the exact same statements in its fallback, so it could never recover from an
# error; the split is now done once and any error surfaces directly.)
has_identity_flag = test_transaction_with_flag['has_identity']
test_with_identity = test_transaction_with_flag[has_identity_flag == 1].copy()
test_without_identity = test_transaction_with_flag[has_identity_flag == 0].copy()

# Every transaction must land in exactly one split; a flag value other than 0/1
# (including NaN) would otherwise silently drop rows from the final submission.
unassigned_rows = (
    len(test_transaction_with_flag) - len(test_with_identity) - len(test_without_identity)
)
if unassigned_rows != 0:
    raise ValueError(
        f"{unassigned_rows} transactions have a has_identity flag that is neither 0 nor 1"
    )

print(f"Split data manually. WITH identity: {test_with_identity.shape}, WITHOUT identity: {test_without_identity.shape}")


Splitting data based on identity presence...
Split data manually. WITH identity: (141907, 226), WITHOUT identity: (364784, 226)


In [61]:
print("Merging identity data...")

# Identity columns that a correct merge should fill with values from the TEST identity table.
expected_identity_cols = [
    col for col in test_identity_preprocessed.columns if col != 'TransactionID'
]

try:
    # The loaded merger only receives the transaction frame, so whatever identity
    # table it joins against comes from inside the merger itself.
    test_with_identity_merged = identity_merger.transform(test_with_identity)
    print(f"Merged data using identity merger. Shape: {test_with_identity_merged.shape}")

    # Sanity check: if the identity columns are present but contain no values at all,
    # the merger did not match any test TransactionID (the identity flagger loaded from
    # the same run showed exactly this symptom). Treat that as a failure so the explicit
    # merge against the test identity table below is used instead.
    merged_identity_cols = [
        col for col in expected_identity_cols if col in test_with_identity_merged.columns
    ]
    if merged_identity_cols and test_with_identity_merged[merged_identity_cols].isna().all().all():
        raise ValueError(
            "identity merger left every identity column empty for the test transactions"
        )
except Exception as e:
    print(f"Error applying identity merger: {e}")
    print("Merging data manually...")
    # Left join keeps every WITH-identity transaction and attaches its test identity row.
    test_with_identity_merged = pd.merge(test_with_identity, test_identity_preprocessed,
                                         on='TransactionID', how='left')
    print(f"Merged data manually. Shape: {test_with_identity_merged.shape}")


Merging identity data...
Merged identity data. New shape: (141907, 247)
Merged data using identity merger. Shape: (141907, 247)


In [62]:
print("\n--- Preparing Features for Prediction ---")

# Extract TransactionID for the final submission.
# The WITH-identity IDs are read from the merged frame whenever it still carries the
# column: the features below are built from that same frame, so IDs and predictions
# stay row-aligned even if the merge step changed the row order or row count.
if 'TransactionID' in test_with_identity_merged.columns:
    transaction_ids_with_identity = test_with_identity_merged['TransactionID'].values
else:
    transaction_ids_with_identity = test_with_identity['TransactionID'].values
transaction_ids_without_identity = test_without_identity['TransactionID'].values

# Prepare features for WITH identity data: drop the ID and the split flag
# (errors='ignore' tolerates either column being absent).
print("Preparing features for WITH identity data...")
X_with_identity_test = test_with_identity_merged.drop(columns=['TransactionID', 'has_identity'], errors='ignore')

# Fail early, before the expensive prediction step, if IDs and feature rows disagree.
if len(transaction_ids_with_identity) != len(X_with_identity_test):
    raise ValueError(
        f"WITH identity row mismatch: {len(transaction_ids_with_identity)} TransactionIDs "
        f"vs {len(X_with_identity_test)} feature rows"
    )
print(f"WITH identity features shape: {X_with_identity_test.shape}")



--- Preparing Features for Prediction ---
Preparing features for WITH identity data...
WITH identity features shape: (141907, 245)


In [63]:
print("Preparing features for WITHOUT identity data...")

# The ID and the split flag are not model features; errors='ignore' tolerates either being absent.
non_feature_columns = ['TransactionID', 'has_identity']
X_without_identity_test = test_without_identity.drop(columns=non_feature_columns, errors='ignore')
print(f"WITHOUT identity features shape: {X_without_identity_test.shape}")


Preparing features for WITHOUT identity data...
WITHOUT identity features shape: (364784, 224)


# Apply feature pipelines

In [64]:
# Transform the WITH-identity features with the loaded pipeline; on failure, fall back
# to a simple manual encoding.
print("Applying WITH identity feature pipeline...")
try:
    X_with_identity_test_processed = with_identity_feature_pipeline.transform(X_with_identity_test)
    print(f"Processed WITH identity features shape: {X_with_identity_test_processed.shape}")
except Exception as pipeline_error:
    print(f"Error applying WITH identity feature pipeline: {pipeline_error}")
    print("Falling back to manual feature processing...")

    # Missing values become the sentinel -999.
    X_with_identity_test = X_with_identity_test.fillna(-999)

    # Object-typed columns are replaced by integer codes.
    # NOTE(review): the codes come from factorizing the TEST data, so they are not
    # guaranteed to match whatever encoding the model was trained with - confirm.
    object_columns = X_with_identity_test.select_dtypes(include=['object']).columns
    for object_col in object_columns:
        codes, _uniques = pd.factorize(X_with_identity_test[object_col])
        X_with_identity_test[object_col] = codes

    X_with_identity_test_processed = X_with_identity_test.values
    print(f"Manually processed WITH identity features shape: {X_with_identity_test_processed.shape}")


Applying WITH identity feature pipeline...
Processed WITH identity features shape: (141907, 239)


In [65]:
# Transform the WITHOUT-identity features with the loaded pipeline; on failure, fall back
# to a simple manual encoding.
print("Applying WITHOUT identity feature pipeline...")
try:
    X_without_identity_test_processed = without_identity_feature_pipeline.transform(X_without_identity_test)
    print(f"Processed WITHOUT identity features shape: {X_without_identity_test_processed.shape}")
except Exception as pipeline_error:
    print(f"Error applying WITHOUT identity feature pipeline: {pipeline_error}")
    print("Falling back to manual feature processing...")

    # Missing values become the sentinel -999.
    X_without_identity_test = X_without_identity_test.fillna(-999)

    # Object-typed columns are replaced by integer codes.
    # NOTE(review): the codes come from factorizing the TEST data, so they are not
    # guaranteed to match whatever encoding the model was trained with - confirm.
    object_columns = X_without_identity_test.select_dtypes(include=['object']).columns
    for object_col in object_columns:
        codes, _uniques = pd.factorize(X_without_identity_test[object_col])
        X_without_identity_test[object_col] = codes

    X_without_identity_test_processed = X_without_identity_test.values
    print(f"Manually processed WITHOUT identity features shape: {X_without_identity_test_processed.shape}")


Applying WITHOUT identity feature pipeline...
Processed WITHOUT identity features shape: (364784, 236)


In [66]:
print("\n--- Creating DMatrix Objects ---")

# Wrap each processed feature set in an xgboost DMatrix for prediction.
dtest_with_identity = xgb.DMatrix(X_with_identity_test_processed)
print("DMatrix WITH identity created")

dtest_without_identity = xgb.DMatrix(X_without_identity_test_processed)
print("DMatrix WITHOUT identity created")



--- Creating DMatrix Objects ---
DMatrix WITH identity created
DMatrix WITHOUT identity created


# Make predictions

In [67]:
print("\n--- Making Predictions ---")

# Score the WITH-identity split using the model loaded for that split.
print("Generating predictions for WITH identity data...")
with_identity_preds = with_identity_model.predict(dtest_with_identity)
with_identity_preds_shape = with_identity_preds.shape
print(f"WITH identity predictions shape: {with_identity_preds_shape}")



--- Making Predictions ---
Generating predictions for WITH identity data...
WITH identity predictions shape: (141907,)


In [68]:
# Score the WITHOUT-identity split using the model loaded for that split.
print("Generating predictions for WITHOUT identity data...")
without_identity_preds = without_identity_model.predict(dtest_without_identity)
without_identity_preds_shape = without_identity_preds.shape
print(f"WITHOUT identity predictions shape: {without_identity_preds_shape}")


Generating predictions for WITHOUT identity data...
WITHOUT identity predictions shape: (364784,)


In [69]:
print("\n--- Creating Submission File ---")

# One two-column frame (TransactionID, isFraud) per split; they are combined in a later cell.
with_identity_columns = {
    'TransactionID': transaction_ids_with_identity,
    'isFraud': with_identity_preds,
}
with_identity_submission = pd.DataFrame(with_identity_columns)

without_identity_columns = {
    'TransactionID': transaction_ids_without_identity,
    'isFraud': without_identity_preds,
}
without_identity_submission = pd.DataFrame(without_identity_columns)


--- Creating Submission File ---


In [70]:
# Stack both prediction frames and order the rows by TransactionID.
submission_parts = [with_identity_submission, without_identity_submission]
submission = pd.concat(submission_parts).sort_values('TransactionID')

# Write the submission CSV to the working directory, without the index column.
submission_file = 'xgboost_submission.csv'
submission.to_csv(submission_file, index=False)
print(f"Submission file saved to {submission_file}")
print(f"Final submission shape: {submission.shape}")


Submission file saved to xgboost_submission.csv
Final submission shape: (506691, 2)


In [71]:
# Attach the submission CSV to the active MLflow run, but only when tracking is enabled.
if mlflow_active:
    mlflow.log_artifact(submission_file)
    print("Submission file logged to MLflow")

Submission file logged to MLflow


In [72]:
# Report the wall-clock time elapsed since start_time was recorded.
finish_time = time.time()
execution_time = finish_time - start_time
print(f"\nTotal execution time: {execution_time:.2f} seconds")

# With tracking enabled, store the runtime as a metric and close the run.
if mlflow_active:
    mlflow.log_metric("execution_time", execution_time)
    mlflow.end_run()
    print("MLflow run completed successfully.")

print("\n--- Test Prediction Complete ---")


Total execution time: 3125.49 seconds
🏃 View run test_prediction_20250421_112340 at: https://dagshub.com/konstantine25b/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/11/runs/8188aa4042a84ba88273678c8f59b5ed
🧪 View experiment at: https://dagshub.com/konstantine25b/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/11
MLflow run completed successfully.

--- Test Prediction Complete ---
