In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import joblib
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("‚úÖ Libraries imported successfully")

‚úÖ Libraries imported successfully


In [3]:
# Set up paths
data_path = Path("../app/artifacts/eda_processed_data.csv")
output_dir = Path("../app/artifacts")

# Create output directory if it doesn't exist
output_dir.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Data path: {data_path}")
print(f"üìÅ Output directory: {output_dir}")

üìÅ Data path: ..\app\artifacts\eda_processed_data.csv
üìÅ Output directory: ..\app\artifacts


In [4]:
# Load Data
print("üìä Loading processed data...")
df = pd.read_csv(data_path)
print(f"‚úÖ Data loaded successfully! Shape: {df.shape}")
print(f"üìä Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


üìä Loading processed data...
‚úÖ Data loaded successfully! Shape: (118108, 434)
üìä Memory usage: 502.80 MB


In [5]:
# Display basic info
print("\nÔøΩÔøΩ Data Info:")
print(f"Columns: {df.shape[1]}")
print(f"Rows: {df.shape[0]}")
print(f"Target distribution:\n{df['isFraud'].value_counts(normalize=True).round(3)}")


ÔøΩÔøΩ Data Info:
Columns: 434
Rows: 118108
Target distribution:
isFraud
0    0.965
1    0.035
Name: proportion, dtype: float64


In [6]:
# Feature Preprocessing
print("üîß Starting feature preprocessing...")

# Separate features and target
target_col = 'isFraud'
X = df.drop(columns=[target_col])
y = df[target_col]

print(f"‚úÖ Features (X): {X.shape}")
print(f"‚úÖ Target (y): {y.shape}")
print(f"‚úÖ Target distribution: {y.value_counts().to_dict()}")

üîß Starting feature preprocessing...
‚úÖ Features (X): (118108, 433)
‚úÖ Target (y): (118108,)
‚úÖ Target distribution: {0: 113975, 1: 4133}


In [7]:
# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"üìä Categorical columns: {len(categorical_cols)}")
print(f" Numerical columns: {len(numerical_cols)}")
print(f"\nüî§ Categorical columns: {categorical_cols[:10]}{'...' if len(categorical_cols) > 10 else ''}")
print(f" Numerical columns: {numerical_cols[:10]}{'...' if len(numerical_cols) > 10 else ''}")

üìä Categorical columns: 31
 Numerical columns: 402

üî§ Categorical columns: ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'M1', 'M2', 'M3', 'M4', 'M5']...
 Numerical columns: ['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1']...


In [8]:
# Handle missing values in numerical columns
print("üîß Handling missing values in numerical columns...")
X_numerical = X[numerical_cols].copy()
X_numerical = X_numerical.fillna(-999)

missing_counts = X_numerical.isnull().sum()
print(f"‚úÖ Missing values filled with -999")
print(f"üìä Missing value summary: {missing_counts.sum()} total missing values")

üîß Handling missing values in numerical columns...
‚úÖ Missing values filled with -999
üìä Missing value summary: 0 total missing values


In [9]:
# Frequency encoding for categorical columns
print(" Applying frequency encoding to categorical columns...")
X_categorical = X[categorical_cols].copy()

# Fill missing values with 'MISSING' first
X_categorical = X_categorical.fillna('MISSING')

# Apply frequency encoding
for col in categorical_cols:
    value_counts = X_categorical[col].value_counts()
    X_categorical[col] = X_categorical[col].map(value_counts)
    # Fill any remaining NaN with 0 (for unseen values)
    X_categorical[col] = X_categorical[col].fillna(0)

print(f"‚úÖ Frequency encoding completed for {len(categorical_cols)} categorical columns")

 Applying frequency encoding to categorical columns...
‚úÖ Frequency encoding completed for 31 categorical columns


In [10]:
# Combine numerical and categorical features
print("üîß Combining all features...")
X_processed = pd.concat([X_numerical, X_categorical], axis=1)

print(f"‚úÖ Final feature matrix shape: {X_processed.shape}")
print(f"‚úÖ Feature types: {X_processed.dtypes.value_counts().to_dict()}")
print(f"‚úÖ Memory usage: {X_processed.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

üîß Combining all features...
‚úÖ Final feature matrix shape: (118108, 433)
‚úÖ Feature types: {dtype('float64'): 399, dtype('int64'): 34}
‚úÖ Memory usage: 390.17 MB


In [11]:
# Data Split
print("‚úÇÔ∏è Splitting data into train/validation sets...")
X_train, X_val, y_train, y_val = train_test_split(
    X_processed, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"‚úÖ Training set: {X_train.shape[0]} samples ({X_train.shape[0]/len(X_processed)*100:.1f}%)")
print(f"‚úÖ Validation set: {X_val.shape[0]} samples ({X_val.shape[0]/len(X_processed)*100:.1f}%)")
print(f"‚úÖ Training fraud rate: {y_train.mean():.4f}")
print(f"‚úÖ Validation fraud rate: {y_val.mean():.4f}")

‚úÇÔ∏è Splitting data into train/validation sets...
‚úÖ Training set: 94486 samples (80.0%)
‚úÖ Validation set: 23622 samples (20.0%)
‚úÖ Training fraud rate: 0.0350
‚úÖ Validation fraud rate: 0.0350


In [12]:
# Train Model
print("üöÄ Training XGBoost baseline model...")

# Initialize XGBoost classifier
model = xgb.XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=6,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='auc',
    early_stopping_rounds=10,
    verbose=100
)

print("‚úÖ XGBoost model initialized")
print(f"üìä Model parameters: {model.get_params()}")

üöÄ Training XGBoost baseline model...
‚úÖ XGBoost model initialized
üìä Model parameters: {'objective': 'binary:logistic', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.8, 'device': None, 'early_stopping_rounds': 10, 'enable_categorical': False, 'eval_metric': 'auc', 'feature_types': None, 'feature_weights': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.1, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 6, 'max_leaves': None, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 1000, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': None, 'reg_lambda': None, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.8, 'tree_method': None, 'validate_parameters': None, 'verbosity': 

In [13]:
# Train the model with early stopping
print("üî• Starting training...")
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

print("‚úÖ Training completed!")
print(f"üìä Best iteration: {model.best_iteration}")
print(f"üìä Best validation AUC: {model.best_score:.4f}")

üî• Starting training...
[0]	validation_0-auc:0.79738
[1]	validation_0-auc:0.84279
[2]	validation_0-auc:0.85431
[3]	validation_0-auc:0.86164
[4]	validation_0-auc:0.86270
[5]	validation_0-auc:0.86548
[6]	validation_0-auc:0.86907
[7]	validation_0-auc:0.87041
[8]	validation_0-auc:0.87311
[9]	validation_0-auc:0.87465
[10]	validation_0-auc:0.87504
[11]	validation_0-auc:0.87940
[12]	validation_0-auc:0.87917
[13]	validation_0-auc:0.88039
[14]	validation_0-auc:0.88051
[15]	validation_0-auc:0.88273
[16]	validation_0-auc:0.88341
[17]	validation_0-auc:0.88373
[18]	validation_0-auc:0.88455
[19]	validation_0-auc:0.88547
[20]	validation_0-auc:0.88595
[21]	validation_0-auc:0.88631
[22]	validation_0-auc:0.88664
[23]	validation_0-auc:0.88679
[24]	validation_0-auc:0.88671
[25]	validation_0-auc:0.88728
[26]	validation_0-auc:0.88788
[27]	validation_0-auc:0.88920
[28]	validation_0-auc:0.89058
[29]	validation_0-auc:0.89170
[30]	validation_0-auc:0.89234
[31]	validation_0-auc:0.89253
[32]	validation_0-auc:0.

In [14]:
# Evaluate Performance
print("üìä Evaluating model performance...")

# Make predictions
y_pred_proba = model.predict_proba(X_val)[:, 1]
y_pred = model.predict(X_val)

# Calculate metrics
roc_auc = roc_auc_score(y_val, y_pred_proba)

print("üéØ PERFORMANCE RESULTS:")
print("=" * 50)
print(f"ROC-AUC Score: {roc_auc:.4f}")
print("=" * 50)

# This is our official baseline to beat!
print(f"\n BASELINE TARGET: {roc_auc:.4f}")
print("This ROC-AUC score is our official target to beat with graph features!")

üìä Evaluating model performance...
üéØ PERFORMANCE RESULTS:
ROC-AUC Score: 0.9213

 BASELINE TARGET: 0.9213
This ROC-AUC score is our official target to beat with graph features!


In [15]:
# Print detailed classification report
print("üìã Detailed Classification Report:")
print("=" * 50)
print(classification_report(y_val, y_pred, target_names=['Legitimate', 'Fraud']))
print("=" * 50)

üìã Detailed Classification Report:
              precision    recall  f1-score   support

  Legitimate       0.98      1.00      0.99     22795
       Fraud       0.88      0.42      0.57       827

    accuracy                           0.98     23622
   macro avg       0.93      0.71      0.78     23622
weighted avg       0.98      0.98      0.97     23622



In [16]:
# Feature importance analysis
print("üîç Analyzing feature importance...")

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X_processed.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

print("üèÜ Top 20 Most Important Features:")
print(feature_importance.head(20))

üîç Analyzing feature importance...
üèÜ Top 20 Most Important Features:
    feature  importance
296    V257    0.177193
262    V223    0.137105
228    V189    0.063457
240    V201    0.049679
370    V331    0.021664
109     V70    0.019399
130     V91    0.018180
18       C8    0.017562
297    V258    0.016975
129     V90    0.015661
392   id_17    0.013994
229    V190    0.009198
24      C14    0.008831
356    V317    0.007783
401   id_32    0.007172
22      C12    0.005891
177    V138    0.005140
11       C1    0.005013
347    V308    0.004949
8     addr2    0.004523


In [17]:
# Plot top 15 features
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 8))
top_features = feature_importance.head(15)
plt.barh(range(len(top_features)), top_features['importance'])
plt.yticks(range(len(top_features)), top_features['feature'])
plt.xlabel('Feature Importance')
plt.title('Top 15 Most Important Features (Baseline XGBoost)')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.savefig('baseline_feature_importance.png', dpi=100, bbox_inches='tight')
plt.close()
print("‚úÖ Feature importance plot saved as 'baseline_feature_importance.png'")

‚úÖ Feature importance plot saved as 'baseline_feature_importance.png'


In [18]:
# Save Artifacts
print("üíæ Saving model artifacts...")

# Save the trained model
model_path = output_dir / "baseline_model.pkl"
joblib.dump(model, model_path)
print(f"‚úÖ Baseline model saved to: {model_path}")

üíæ Saving model artifacts...
‚úÖ Baseline model saved to: ..\app\artifacts\baseline_model.pkl


In [19]:
# Save feature names for later use
feature_names_path = output_dir / "baseline_feature_names.pkl"
joblib.dump(X_processed.columns.tolist(), feature_names_path)
print(f"‚úÖ Feature names saved to: {feature_names_path}")


‚úÖ Feature names saved to: ..\app\artifacts\baseline_feature_names.pkl


In [20]:
# Save feature importance
importance_path = output_dir / "baseline_feature_importance.csv"
feature_importance.to_csv(importance_path, index=False)
print(f"‚úÖ Feature importance saved to: {importance_path}")

print("\n BASELINE MODEL TRAINING COMPLETED!")
print(f"üèÜ Your baseline ROC-AUC target: {roc_auc:.4f}")
print("\nNext step: Build graph features and try to beat this baseline!")

‚úÖ Feature importance saved to: ..\app\artifacts\baseline_feature_importance.csv

 BASELINE MODEL TRAINING COMPLETED!
üèÜ Your baseline ROC-AUC target: 0.9213

Next step: Build graph features and try to beat this baseline!
