In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
%pip install mlflow
%pip install dagshub

Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.3->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc, roc_curve, roc_auc_score
from sklearn.feature_selection import RFE
import xgboost as xgb
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import dagshub
import warnings
import time
from scipy import stats
import category_encoders as ce

In [6]:
# Initialize MLflow tracking
try:
    # Initialize Dagshub only if the repo info is correct
    dagshub.init(repo_owner='konstantine25b', repo_name='IEEE-CIS-Fraud-Detection', mlflow=True)
    print("DagsHub initialized successfully.")
    mlflow.set_experiment("IEEE-CIS Fraud Detection_Transaction_Only")
    print(f"MLflow experiment set to: {mlflow.get_experiment_by_name('IEEE-CIS Fraud Detection_Transaction_Only').name}")
except Exception as e:
    print(f"Could not initialize DagsHub or set MLflow experiment: {e}")
    print("Proceeding without MLflow tracking.")
    # Set a dummy client to avoid errors if tracking fails
    mlflow_active = False
else:
    mlflow_active = True

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=02a38a44-baa6-4e50-93cc-cd4f21ec07ad&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=6a3f11f274b9add921a127d1a6f1ef06aea2df0940bd9b0e27b3241a109dc20c




DagsHub initialized successfully.


2025/04/20 14:19:15 INFO mlflow.tracking.fluent: Experiment with name 'IEEE-CIS Fraud Detection_Transaction_Only' does not exist. Creating a new experiment.


MLflow experiment set to: IEEE-CIS Fraud Detection_Transaction_Only


In [7]:
run_name = f"transaction_preprocessing_and_modeling_{time.strftime('%Y%m%d_%H%M%S')}"
if mlflow_active:
    mlflow.start_run(run_name=run_name)
    print(f"MLflow run started with name: {run_name}")

# Log start time
start_time = time.time()

MLflow run started with name: transaction_preprocessing_and_modeling_20250420_142008


In [8]:
try:
    print("Loading data...")
    if mlflow_active:
        mlflow.log_param("data_source", "transaction_data_only")
    
    # Load transaction data
    transaction_df = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
    
    # Log data shape
    if mlflow_active:
        mlflow.log_param("transaction_data_shape", str(transaction_df.shape))
    print(f"Transaction data shape: {transaction_df.shape}")
except FileNotFoundError:
    print("Error: Transaction CSV file was not found. Please make sure the file path is correct.")
    if mlflow_active:
        mlflow.end_run()
    exit()

Loading data...
Transaction data shape: (590540, 394)


In [9]:
y = transaction_df['isFraud']
X = transaction_df.drop('isFraud', axis=1)

# Log class distribution
class_distribution = y.value_counts().to_dict()
if mlflow_active:
    mlflow.log_param("class_distribution", str(class_distribution))
print(f"Class distribution: {class_distribution}")

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

Class distribution: {0: 569877, 1: 20663}


In [10]:
if mlflow_active:
    mlflow.log_param("train_test_split", "80/20 with stratification")

# Preprocessing
print("\n--- Preprocessing ---")

# Function to preprocess d


--- Preprocessing ---


#  Preprocessing

In [16]:

def preprocess_data(df, dataset_name="unknown"):
    # Make a copy to avoid modifying the original dataframe
    df_processed = df.copy()
    
    # Log preprocessing steps with unique key
    if mlflow_active:
        mlflow.log_param(f"preprocessing_steps_{dataset_name}", "handle_missing_values, handle_categorical, handle_numerical")
    
    # 1. Handle TransactionID - drop it as it's just an identifier
    if 'TransactionID' in df_processed.columns:
        df_processed = df_processed.drop('TransactionID', axis=1)
        if mlflow_active:
            mlflow.log_param(f"drop_columns_{dataset_name}", "TransactionID")
    
    # 2. Check null percentages for all columns
    null_percentages = (df_processed.isnull().sum() / len(df_processed)) * 100
    high_null_cols = null_percentages[null_percentages >= 60].index.tolist()
    
    # Drop columns with 60% or more nulls
    if high_null_cols:
        print(f"Dropping columns with ≥60% nulls: {high_null_cols}")
        df_processed = df_processed.drop(columns=high_null_cols)
        if mlflow_active:
            # Don't log this parameter as it's too large and causing errors
            # Instead, log the count of dropped columns
            mlflow.log_param(f"dropped_high_null_columns_count_{dataset_name}", len(high_null_cols))
    
    # 3. Handle remaining missing values
    # For numerical columns, fill with median
    numerical_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns
    for col in numerical_cols:
        if df_processed[col].isnull().sum() > 0:
            median_value = df_processed[col].median()
            df_processed[col] = df_processed[col].fillna(median_value)
    
    # For categorical columns, fill with most frequent value
    categorical_cols = df_processed.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df_processed[col].isnull().sum() > 0:
            # Get most frequent value
            most_frequent = df_processed[col].mode()[0]
            df_processed[col] = df_processed[col].fillna(most_frequent)
    
    if mlflow_active:
        mlflow.log_param(f"missing_value_strategy_{dataset_name}", "median for numerical, most frequent for categorical")
    
    return df_processed

# Apply preprocessing with unique dataset names
print("Preprocessing training data...")
X_train_processed = preprocess_data(X_train, "train")
print("Preprocessing test data...")
X_test_processed = preprocess_data(X_test, "test")


Preprocessing training data...
Dropping columns with ≥60% nulls: ['dist2', 'R_emaildomain', 'D6', 'D7', 'D8', 'D9', 'D12', 'D13', 'D14', 'V138', 'V139', 'V140', 'V141', 'V142', 'V143', 'V144', 'V145', 'V146', 'V147', 'V148', 'V149', 'V150', 'V151', 'V152', 'V153', 'V154', 'V155', 'V156', 'V157', 'V158', 'V159', 'V160', 'V161', 'V162', 'V163', 'V164', 'V165', 'V166', 'V167', 'V168', 'V169', 'V170', 'V171', 'V172', 'V173', 'V174', 'V175', 'V176', 'V177', 'V178', 'V179', 'V180', 'V181', 'V182', 'V183', 'V184', 'V185', 'V186', 'V187', 'V188', 'V189', 'V190', 'V191', 'V192', 'V193', 'V194', 'V195', 'V196', 'V197', 'V198', 'V199', 'V200', 'V201', 'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208', 'V209', 'V210', 'V211', 'V212', 'V213', 'V214', 'V215', 'V216', 'V217', 'V218', 'V219', 'V220', 'V221', 'V222', 'V223', 'V224', 'V225', 'V226', 'V227', 'V228', 'V229', 'V230', 'V231', 'V232', 'V233', 'V234', 'V235', 'V236', 'V237', 'V238', 'V239', 'V240', 'V241', 'V242', 'V243', 'V244', 'V245',

In [18]:
print(f"Processed training data shape: {X_train_processed.shape}")

# After preprocessing, add this code to display null percentages before and after
print("\n--- Null Value Analysis ---")
# Calculate null percentages in original data
null_percentages_original = (X_train.isnull().sum() / len(X_train)) * 100
null_cols_original = null_percentages_original[null_percentages_original > 0].sort_values(ascending=False)


Processed training data shape: (472432, 224)

--- Null Value Analysis ---


In [19]:
if len(null_cols_original) > 0:
    print("\nColumns with null values in original data (% nulls):")
    for col, pct in null_cols_original.items():
        print(f"{col}: {pct:.2f}%")
    
    # Log columns with high null percentages
    high_null_cols = null_cols_original[null_cols_original >= 60].index.tolist()
    if high_null_cols:
        print(f"\nColumns with ≥60% nulls (dropped): {len(high_null_cols)} columns")
        # Don't log the full list as it's too large
        if mlflow_active:
            mlflow.log_param("high_null_columns_count", len(high_null_cols))
else:
    print("No null values found in original data.")



Columns with null values in original data (% nulls):
dist2: 93.62%
D7: 93.40%
D13: 89.47%
D14: 89.43%
D12: 89.00%
D6: 87.56%
D9: 87.27%
D8: 87.27%
V157: 86.10%
V163: 86.10%
V162: 86.10%
V161: 86.10%
V158: 86.10%
V153: 86.10%
V138: 86.10%
V155: 86.10%
V154: 86.10%
V149: 86.10%
V148: 86.10%
V147: 86.10%
V146: 86.10%
V142: 86.10%
V141: 86.10%
V140: 86.10%
V156: 86.10%
V139: 86.10%
V152: 86.10%
V159: 86.10%
V143: 86.10%
V144: 86.10%
V145: 86.10%
V150: 86.10%
V151: 86.10%
V166: 86.10%
V160: 86.10%
V165: 86.10%
V164: 86.10%
V327: 86.03%
V338: 86.03%
V337: 86.03%
V336: 86.03%
V324: 86.03%
V335: 86.03%
V334: 86.03%
V333: 86.03%
V332: 86.03%
V325: 86.03%
V326: 86.03%
V323: 86.03%
V331: 86.03%
V330: 86.03%
V329: 86.03%
V328: 86.03%
V322: 86.03%
V339: 86.03%
V248: 77.85%
V268: 77.85%
V261: 77.85%
V240: 77.85%
V262: 77.85%
V263: 77.85%
V223: 77.85%
V264: 77.85%
V247: 77.85%
V265: 77.85%
V266: 77.85%
V267: 77.85%
V224: 77.85%
V225: 77.85%
V226: 77.85%
V228: 77.85%
V242: 77.85%
V229: 77.85%
V230: 7

In [20]:
# Check for nulls after preprocessing
null_counts_after = X_train_processed.isnull().sum()
null_cols_after = null_counts_after[null_counts_after > 0]
if len(null_cols_after) > 0:
    print("\nColumns with null values after preprocessing:")
    print(null_cols_after)
    if mlflow_active:
        mlflow.log_param("columns_with_nulls_after_preprocessing", "Yes")
else:
    print("\nNo null values found after preprocessing.")
    if mlflow_active:
        mlflow.log_param("columns_with_nulls_after_preprocessing", "None")



No null values found after preprocessing.


In [21]:
X_train_processed

Unnamed: 0,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
40809,1008491,100.000,R,6177,399.0,150.0,american express,150.0,credit,264.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
285886,7008212,29.990,W,7900,345.0,150.0,mastercard,224.0,debit,143.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104256,2071522,107.950,W,11690,111.0,150.0,visa,226.0,credit,191.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
507860,13299752,241.950,W,2616,327.0,150.0,discover,102.0,credit,330.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
196382,4412283,117.000,W,13780,298.0,150.0,visa,226.0,debit,441.0,...,117.0,117.0,117.0,117.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324259,8019613,59.000,W,10493,455.0,150.0,mastercard,126.0,debit,123.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
532790,14047023,20.903,C,5595,545.0,185.0,visa,138.0,debit,299.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29747,754797,400.000,W,9335,373.0,150.0,visa,226.0,debit,177.0,...,0.0,59.0,344.0,144.0,0.0,711.0,196.0,0.0,0.0,0.0
573130,15215188,16.354,C,3154,408.0,185.0,mastercard,224.0,debit,299.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Display data information
print("\nData types in processed data:")
print(X_train_processed.dtypes.value_counts())

# Display summary statistics for numerical columns
print("\nSummary statistics for numerical columns (sample of 5):")
numerical_cols = X_train_processed.select_dtypes(include=['int64', 'float64']).columns
print(X_train_processed[numerical_cols[:5]].describe())



Data types in processed data:
float64    209
object      13
int64        2
Name: count, dtype: int64

Summary statistics for numerical columns (sample of 5):
       TransactionDT  TransactionAmt          card1          card2  \
count   4.724320e+05   472432.000000  472432.000000  472432.000000   
mean    7.373394e+06      135.071756    9903.027720     362.504034   
std     4.616510e+06      241.371497    4902.685441     156.585474   
min     8.640000e+04        0.251000    1001.000000     100.000000   
25%     3.028537e+06       43.140000    6019.000000     215.000000   
50%     7.309639e+06       68.911000    9689.000000     361.000000   
75%     1.124764e+07      125.000000   14203.000000     512.000000   
max     1.581113e+07    31937.391000   18396.000000     600.000000   

               card3  
count  472432.000000  
mean      153.195139  
std        11.339529  
min       100.000000  
25%       150.000000  
50%       150.000000  
75%       150.000000  
max       231.000000  


In [23]:
# Display unique values for categorical columns (sample)
print("\nUnique values for categorical columns (sample of 5):")
categorical_cols = X_train_processed.select_dtypes(include=['object']).columns
for col in categorical_cols[:5]:
    unique_values = X_train_processed[col].nunique()
    print(f"{col}: {unique_values} unique values")
    # Show sample of values if not too many
    if unique_values <= 10:
        print(f"Sample values: {X_train_processed[col].unique()[:5]}")
    else:
        print(f"Sample values: {X_train_processed[col].value_counts().head(5).to_dict()}")

# Display class distribution again for reference
print("\nClass distribution:")
print(y_train.value_counts())
print(f"Fraud ratio: {y_train.mean():.4f}")

# Display categorical column information
print("\n--- Categorical Column Analysis ---")
categorical_cols = X_train_processed.select_dtypes(include=['object']).columns
print(f"Number of categorical columns: {len(categorical_cols)}")

if len(categorical_cols) > 0:
    print("\nTop 10 categorical columns by unique value count:")
    cat_unique_counts = {col: X_train_processed[col].nunique() for col in categorical_cols}
    for col, count in sorted(cat_unique_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
        print(f"{col}: {count} unique values")
        # Show top 3 most frequent values
        top_values = X_train_processed[col].value_counts().head(3)
        print(f"  Top values: {dict(top_values)}")



Unique values for categorical columns (sample of 5):
ProductCD: 5 unique values
Sample values: ['R' 'W' 'C' 'H' 'S']
card4: 4 unique values
Sample values: ['american express' 'mastercard' 'visa' 'discover']
card6: 4 unique values
Sample values: ['credit' 'debit' 'charge card' 'debit or credit']
P_emaildomain: 59 unique values
Sample values: {'gmail.com': 258203, 'yahoo.com': 80629, 'hotmail.com': 36283, 'anonymous.com': 29774, 'aol.com': 22572}
M1: 2 unique values
Sample values: ['T' 'F']

Class distribution:
isFraud
0    455902
1     16530
Name: count, dtype: int64
Fraud ratio: 0.0350

--- Categorical Column Analysis ---
Number of categorical columns: 13

Top 10 categorical columns by unique value count:
P_emaildomain: 59 unique values
  Top values: {'gmail.com': 258203, 'yahoo.com': 80629, 'hotmail.com': 36283}
ProductCD: 5 unique values
  Top values: {'W': 351450, 'C': 54987, 'R': 30251}
card4: 4 unique values
  Top values: {'visa': 308954, 'mastercard': 151547, 'american express':

# Feature Engineering

In [24]:
# Identify categorical columns
categorical_cols = X_train_processed.select_dtypes(include=['object']).columns.tolist()
if mlflow_active:
    mlflow.log_param("categorical_columns", str(categorical_cols))
print(f"Categorical columns: {len(categorical_cols)}")

# Identify numerical columns
numerical_cols = X_train_processed.select_dtypes(include=['int64', 'float64']).columns.tolist()
if mlflow_active:
    mlflow.log_param("numerical_columns", str(len(numerical_cols)))
print(f"Numerical columns: {len(numerical_cols)}")

# Apply Weight of Evidence encoding for categorical features
print("Applying WOE encoding for categorical features...")
woe_encoder = ce.WOEEncoder(cols=categorical_cols)
X_train_woe = woe_encoder.fit_transform(X_train_processed, y_train)
X_test_woe = woe_encoder.transform(X_test_processed)


Categorical columns: 13
Numerical columns: 211
Applying WOE encoding for categorical features...


In [25]:
if mlflow_active:
    mlflow.log_param("encoding_method", "WOE")

# Feature Selection based on correlation

In [32]:
print("\n--- Feature Selection: Correlation Filter ---")

# Calculate correlation with target
correlation_with_target = pd.DataFrame()
for col in X_train_woe.columns:
    correlation = np.abs(X_train_woe[col].corr(y_train))
    correlation_with_target = pd.concat([correlation_with_target, 
                                       pd.DataFrame({'Feature': [col], 'Correlation': [correlation]})], 
                                      ignore_index=True)

# Sort by correlation
correlation_with_target = correlation_with_target.sort_values('Correlation', ascending=False)

# Select features with correlation above threshold
correlation_threshold = 0.05
selected_features_corr = correlation_with_target[correlation_with_target['Correlation'] > correlation_threshold]['Feature'].tolist()

print(f"Selected {len(selected_features_corr)} features with correlation > {correlation_threshold}")
# Filter features based on correlation
X_train_corr = X_train_woe[selected_features_corr]
X_test_corr = X_test_woe[selected_features_corr]

# Log top correlated features
top_correlated = correlation_with_target.head(20).to_dict()




--- Feature Selection: Correlation Filter ---
Selected 96 features with correlation > 0.05


# Feature Selection using RFE

In [33]:
print("\n--- Feature Selection: Recursive Feature Elimination ---")

# Initialize XGBoost classifier for RFE
base_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc'
)

# Number of features to select with RFE
n_features_to_select = min(50, len(selected_features_corr))

# Initialize RFE
rfe = RFE(estimator=base_model, n_features_to_select=n_features_to_select, step=0.1)

# Fit RFE
rfe.fit(X_train_corr, y_train)

# Get selected features
selected_features_rfe = X_train_corr.columns[rfe.support_].tolist()

print(f"Selected {len(selected_features_rfe)} features with RFE")


--- Feature Selection: Recursive Feature Elimination ---
Selected 50 features with RFE


In [36]:
if mlflow_active:
    mlflow.log_param("features_after_rfe", len(selected_features_rfe))
    mlflow.log_param("rfe_features", str(selected_features_rfe))

# Filter features based on RFE
X_train_rfe = X_train_corr[selected_features_rfe]
X_test_rfe = X_test_corr[selected_features_rfe]

In [38]:
X_train_rfe.head()
X_train_rfe.columns

Index(['V45', 'V86', 'V87', 'V44', 'V52', 'V40', 'V79', 'V38', 'ProductCD',
       'V94', 'V74', 'V15', 'card3', 'V37', 'V73', 'V58', 'V123', 'V47', 'V72',
       'V71', 'V78', 'V63', 'V64', 'V303', 'V283', 'V125', 'V112', 'V46',
       'V29', 'card6', 'V70', 'V91', 'V108', 'V48', 'V124', 'D15', 'V281',
       'V67', 'V62', 'M6', 'D1', 'P_emaildomain', 'V83', 'D2', 'V109', 'V115',
       'D4', 'V36', 'V56', 'V61'],
      dtype='object')

In [40]:
mlflow.log_param("features_after_rfe", len(selected_features_rfe))
mlflow.log_param("rfe_features", str(selected_features_rfe))

"['V45', 'V86', 'V87', 'V44', 'V52', 'V40', 'V79', 'V38', 'ProductCD', 'V94', 'V74', 'V15', 'card3', 'V37', 'V73', 'V58', 'V123', 'V47', 'V72', 'V71', 'V78', 'V63', 'V64', 'V303', 'V283', 'V125', 'V112', 'V46', 'V29', 'card6', 'V70', 'V91', 'V108', 'V48', 'V124', 'D15', 'V281', 'V67', 'V62', 'M6', 'D1', 'P_emaildomain', 'V83', 'D2', 'V109', 'V115', 'D4', 'V36', 'V56', 'V61']"

# Model Training with XGBoost

In [60]:
print("\n--- Model Training: XGBoost ---")

# Define pipeline with scaler and XGBoost classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', xgb.XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='auc',
        random_state=42,
        n_jobs=-1
    ))
])

# Define hyperparameter grid
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [4, 6, 8],
    'classifier__learning_rate': [0.01, 0.05],
    'classifier__scale_pos_weight': [25, 35],  # To handle class imbalance
    'classifier__min_child_weight': [1, 3],
    'classifier__subsample': [0.8],
    'classifier__colsample_bytree': [0.8],
    'classifier__gamma': [0.1],
    'classifier__reg_alpha': [0.1],
    'classifier__reg_lambda': [1.0]
}






--- Model Training: XGBoost ---


In [61]:
# Set up cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Sample a subset of data for hyperparameter tuning to speed up the process
sample_size = int(0.1 * len(X_train_rfe))  # Use 10% of data
indices = np.random.choice(len(X_train_rfe), sample_size, replace=False)
X_train_sample = X_train_rfe.iloc[indices]
y_train_sample = y_train.iloc[indices]

print(f"Using {sample_size} samples for hyperparameter tuning...")


Using 47243 samples for hyperparameter tuning...


In [62]:
# Set up GridSearchCV
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=cv,
    scoring='f1',  # Optimize for F1 score
    n_jobs=-1,
    verbose=1
)

# Fit GridSearchCV
grid_search.fit(X_train_sample, y_train_sample)


Fitting 3 folds for each of 48 candidates, totalling 144 fits


In [63]:
best_params = grid_search.best_params_
print(f"\nBest parameters: {best_params}")

# Safe MLflow logging
try:
    if mlflow_active:
        for param, value in best_params.items():
            param_name = f"best_{param.replace('classifier__', '')}"
            mlflow.log_param(param_name, value)
        mlflow.log_metric("best_cv_f1", grid_search.best_score_)
except Exception as e:
    print(f"Warning: Could not log hyperparameters to MLflow: {e}")



Best parameters: {'classifier__colsample_bytree': 0.8, 'classifier__gamma': 0.1, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 8, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 200, 'classifier__reg_alpha': 0.1, 'classifier__reg_lambda': 1.0, 'classifier__scale_pos_weight': 25, 'classifier__subsample': 0.8}


In [64]:
best_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=best_params.get('classifier__n_estimators', 100),
    max_depth=best_params.get('classifier__max_depth', 6),
    learning_rate=best_params.get('classifier__learning_rate', 0.01),
    scale_pos_weight=best_params.get('classifier__scale_pos_weight', 30),
    min_child_weight=best_params.get('classifier__min_child_weight', 1),
    subsample=best_params.get('classifier__subsample', 0.8),
    colsample_bytree=best_params.get('classifier__colsample_bytree', 0.8),
    gamma=best_params.get('classifier__gamma', 0),
    reg_alpha=best_params.get('classifier__reg_alpha', 0),
    reg_lambda=best_params.get('classifier__reg_lambda', 1),
    random_state=42,
    use_label_encoder=False,
    eval_metric='auc',
    n_jobs=-1
)

# Use a validation set for early stopping
X_train_fit, X_val, y_train_fit, y_val = train_test_split(
    X_train_rfe, y_train, test_size=0.2, random_state=42, stratify=y_train
)

In [65]:
# Apply StandardScaler
scaler = StandardScaler()
X_train_fit_scaled = scaler.fit_transform(X_train_fit)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_rfe)

# Train the model
print("\nTraining final model with best parameters...")
best_xgb.fit(
    X_train_fit_scaled, y_train_fit,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True,
    early_stopping_rounds=50
)


Training final model with best parameters...




[0]	validation_0-auc:0.82108
[1]	validation_0-auc:0.83021
[2]	validation_0-auc:0.83642
[3]	validation_0-auc:0.83783
[4]	validation_0-auc:0.83928
[5]	validation_0-auc:0.83950
[6]	validation_0-auc:0.83956
[7]	validation_0-auc:0.84102
[8]	validation_0-auc:0.84112
[9]	validation_0-auc:0.84255
[10]	validation_0-auc:0.84282
[11]	validation_0-auc:0.84338
[12]	validation_0-auc:0.84397
[13]	validation_0-auc:0.84465
[14]	validation_0-auc:0.84519
[15]	validation_0-auc:0.84571
[16]	validation_0-auc:0.84593
[17]	validation_0-auc:0.84675
[18]	validation_0-auc:0.84757
[19]	validation_0-auc:0.84822
[20]	validation_0-auc:0.84874
[21]	validation_0-auc:0.84964
[22]	validation_0-auc:0.85020
[23]	validation_0-auc:0.85056
[24]	validation_0-auc:0.85096
[25]	validation_0-auc:0.85183
[26]	validation_0-auc:0.85222
[27]	validation_0-auc:0.85308
[28]	validation_0-auc:0.85359
[29]	validation_0-auc:0.85371
[30]	validation_0-auc:0.85386
[31]	validation_0-auc:0.85433
[32]	validation_0-auc:0.85425
[33]	validation_0-au

In [66]:
# Find optimal threshold
print("\n--- Finding Optimal Classification Threshold ---")
y_val_pred_proba = best_xgb.predict_proba(X_val_scaled)[:, 1]
precision_curve, recall_curve, thresholds = precision_recall_curve(y_val, y_val_pred_proba)

# Calculate F1 score for each threshold
f1_scores = 2 * (precision_curve * recall_curve) / (precision_curve + recall_curve + 1e-10)
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx] if optimal_idx < len(thresholds) else 0.5

print(f"Optimal threshold: {optimal_threshold:.4f}")
print(f"At optimal threshold - Precision: {precision_curve[optimal_idx]:.4f}, Recall: {recall_curve[optimal_idx]:.4f}, F1: {f1_scores[optimal_idx]:.4f}")



--- Finding Optimal Classification Threshold ---
Optimal threshold: 0.8045
At optimal threshold - Precision: 0.5419, Recall: 0.4032, F1: 0.4624


In [67]:
# Safe MLflow logging
try:
    if mlflow_active:
        mlflow.log_param("optimal_threshold", optimal_threshold)
        mlflow.log_metric("optimal_precision", precision_curve[optimal_idx])
        mlflow.log_metric("optimal_recall", recall_curve[optimal_idx])
        mlflow.log_metric("optimal_f1", f1_scores[optimal_idx])
except Exception as e:
    print(f"Warning: Could not log threshold metrics to MLflow: {e}")

# Model Evaluation
print("\n--- Model Evaluation ---")


The cause of this error is typically due to repeated calls
to an individual run_id event logging.

Incorrect Example:
---------------------------------------
with mlflow.start_run():
    mlflow.log_param("depth", 3)
    mlflow.log_param("depth", 5)
---------------------------------------

Which will throw an MlflowException for overwriting a
logged parameter.

Correct Example:
---------------------------------------
with mlflow.start_run():
    with mlflow.start_run(nested=True):
        mlflow.log_param("depth", 3)
    with mlflow.start_run(nested=True):
        mlflow.log_param("depth", 5)
---------------------------------------

Which will create a new nested run for each individual
model and prevent parameter key collisions within the
tracking store.

--- Model Evaluation ---


In [68]:
# Make predictions with optimal threshold
y_pred_proba = best_xgb.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_proba >= optimal_threshold).astype(int)

# Classification report
report = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

# Log metrics
if mlflow_active:
    mlflow.log_metric("accuracy", report['accuracy'])
    mlflow.log_metric("precision_class_0", report['0']['precision'])
    mlflow.log_metric("recall_class_0", report['0']['recall'])
    mlflow.log_metric("f1_class_0", report['0']['f1-score'])
    mlflow.log_metric("precision_class_1", report['1']['precision'])
    mlflow.log_metric("recall_class_1", report['1']['recall'])
    mlflow.log_metric("f1_class_1", report['1']['f1-score'])


              precision    recall  f1-score   support

           0       0.98      0.99      0.98    113975
           1       0.55      0.40      0.47      4133

    accuracy                           0.97    118108
   macro avg       0.77      0.70      0.72    118108
weighted avg       0.96      0.97      0.97    118108



In [69]:

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

# ROC Curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)
if mlflow_active:
    mlflow.log_metric("roc_auc", roc_auc)

# Feature Importance
feature_importances = pd.DataFrame({
    'Feature': X_train_rfe.columns,
    'Importance': best_xgb.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nTop 10 Important Features:")
print(feature_importances.head(10))

# Log feature importances
if mlflow_active:
    mlflow.log_param("top_features", str(feature_importances.head(10).to_dict()))

# Log model
if mlflow_active:
    mlflow.xgboost.log_model(best_xgb, "xgboost_model")

# Log execution time
execution_time = time.time() - start_time
print(f"\nExecution time: {execution_time:.2f} seconds")
if mlflow_active:
    mlflow.log_metric("execution_time", execution_time)



Top 10 Important Features:
      Feature  Importance
30        V70    0.190915
31        V91    0.150288
8   ProductCD    0.051909
0         V45    0.045423
12      card3    0.039899
9         V94    0.034884
24       V283    0.032726
29      card6    0.030336
2         V87    0.029235
28        V29    0.020129


MlflowException: INVALID_PARAMETER_VALUE: Response: {'error_code': 'INVALID_PARAMETER_VALUE'}

The cause of this error is typically due to repeated calls
to an individual run_id event logging.

Incorrect Example:
---------------------------------------
with mlflow.start_run():
    mlflow.log_param("depth", 3)
    mlflow.log_param("depth", 5)
---------------------------------------

Which will throw an MlflowException for overwriting a
logged parameter.

Correct Example:
---------------------------------------
with mlflow.start_run():
    with mlflow.start_run(nested=True):
        mlflow.log_param("depth", 3)
    with mlflow.start_run(nested=True):
        mlflow.log_param("depth", 5)
---------------------------------------

Which will create a new nested run for each individual
model and prevent parameter key collisions within the
tracking store.

In [70]:
if mlflow_active:
    print("\nMLflow tracking completed. Run ID:", mlflow.active_run().info.run_id)
    mlflow.end_run()
else:
    print("\nMLflow tracking was not active.") 


MLflow tracking completed. Run ID: 7b4b0905b5464487a50094ef5398a509
🏃 View run transaction_preprocessing_and_modeling_20250420_142008 at: https://dagshub.com/konstantine25b/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/9/runs/7b4b0905b5464487a50094ef5398a509
🧪 View experiment at: https://dagshub.com/konstantine25b/IEEE-CIS-Fraud-Detection.mlflow/#/experiments/9
