In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# These two imports were fused onto one line, which is a SyntaxError; each needs its own statement.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import (roc_auc_score, average_precision_score, precision_score, recall_score, classification_report, f1_score)
import matplotlib.pyplot as plt

from xgboost import XGBClassifier, plot_importance
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install dagshub
!pip install mlflow
import dagshub
import mlflow

dagshub.init(repo_owner='kechik21',repo_name='ML_HW2',mlflow=True)

In [None]:
# Training data: read the transaction and identity tables, then attach the
# identity columns to every transaction (left join keeps transactions that
# have no identity row).
train_tr = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
train_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')

train = pd.merge(train_tr, train_id, how='left', on='TransactionID')
print(train_tr.shape, train_id.shape, sep='\n')

In [None]:
# Show the column index of the merged training frame under a heading line.
print("Train Data Columns:", train.columns, sep="\n")

In [None]:
# Target is the isFraud column; features are everything except the row id and the target.
y = train['isFraud']
X = train.drop(columns=['TransactionID', 'isFraud'])

In [None]:
# Differentiate categorical and numerical columns.
# Categorical names are listed by hand / by prefix; every remaining column of X
# is treated as numerical.
cat_coluka = [
    'userId', 'P_emaildomain', 'R_emaildomain', 'DeviceType',
    'DeviceInfo', 'ProductCD', 'addr1', 'addr2',
]
cat_cards = [f'card{i}' for i in range(1, 7)]    # card1 .. card6
cat_ms = [f'M{i}' for i in range(1, 10)]         # M1 .. M9
cat_ids = [f'id_{i}' for i in range(12, 39)]     # id_12 .. id_38
cat_cols = [*cat_coluka, *cat_cards, *cat_ms, *cat_ids]

# Each report is a heading line followed by the value on its own line.
print("Categorical Columns:", cat_cols, sep="\n")
print("\nCard Columns:", cat_cards, sep="\n")
print("\nM Columns:", cat_ms, sep="\n")
print("\nID Columns:", cat_ids, sep="\n")
print("\nAll Categorical Features:", cat_cols, sep="\n")
print("\nCategorical Columns Amount:", len(cat_cols), sep="\n")


num_cols = [name for name in X.columns if not (name in cat_cols or name == 'isFraud')]

print("\nNumerical Columns Amount:", len(num_cols), sep="\n")

# Data Cleaning

In [None]:
class FillNa(BaseEstimator, TransformerMixin):
    """Impute missing values: median for numerical columns, 'NotAv' for categorical ones.

    Parameters
    ----------
    num_cols : list
        Names of numerical columns. Columns absent from the frame are skipped.
    cat_cols : list
        Names of categorical columns. Columns absent from the frame are skipped.
        Numeric-typed categorical columns are converted to strings so that
        downstream steps see them as object columns.

    Attributes
    ----------
    values : dict
        Column name -> median learned during ``fit``.
    """

    def __init__(self, num_cols, cat_cols):
        self.num_cols = num_cols
        self.cat_cols = cat_cols
        self.values = {}

    def fit(self, X, y=None):
        """Learn the median of every listed numerical column present in ``X``.

        ``X`` is only read, never modified. Returns ``self``.
        """
        # Rebuild the dict on every fit so a refit does not keep medians of
        # columns that are no longer present.
        self.values = {
            col: X[col].median() for col in self.num_cols if col in X.columns
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled.

        The input frame is left untouched.
        """
        X = X.copy()

        for col, fill_value in self.values.items():
            if col in X.columns:
                X[col] = X[col].fillna(fill_value)

        for col in self.cat_cols:
            if col in X.columns:
                X[col] = self._fill_categorical(X[col])

        return X

    @staticmethod
    def _fill_categorical(series):
        """Stringify a numeric categorical column and mark missing entries as 'NotAv'."""
        # Record the missing positions BEFORE the string cast: astype(str) turns
        # NaN into the literal text 'nan', after which fillna() finds nothing.
        missing = series.isna()
        if pd.api.types.is_numeric_dtype(series):
            series = series.astype(str)
        return series.mask(missing, 'NotAv')

In [None]:
class MissingValueDropper(BaseEstimator, TransformerMixin):
    """Drop columns whose percentage of missing values is above ``threshold``.

    Parameters
    ----------
    threshold : number, default 90
        Percentage (0-100 scale). A column is dropped when its missing share
        is strictly greater than this value.

    Attributes
    ----------
    columns_to_drop : list or None
        Columns selected during ``fit``; ``None`` until ``fit`` has run.
    """

    def __init__(self, threshold=90):
        self.columns_to_drop = None
        self.threshold = threshold

    def fit(self, X, y=None):
        """Record which columns of ``X`` exceed the missing-value threshold."""
        flagged = []
        for name in X.columns:
            null_flags = X[name].isna()
            # Columns without any missing value are never candidates.
            if not null_flags.any():
                continue
            if (null_flags.mean() * 100) > self.threshold:
                flagged.append(name)

        self.columns_to_drop = flagged
        print(f'Identified {len(flagged)} columns to drop')
        return self

    def transform(self, X):
        """Return ``X`` without the columns chosen in ``fit``.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.columns_to_drop is not None:
            return X.drop(columns=self.columns_to_drop)
        raise ValueError('Needs Fitting !')

# Feature Engineering

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import WOEEncoder
import pandas as pd

class CatEncoder(BaseEstimator, TransformerMixin):
    """Encode object-typed columns: dummies for low-cardinality, WOE for the rest.

    Parameters
    ----------
    threshold : int, default 2
        Object columns with at most this many distinct values are dummy-encoded
        (first level dropped); all other object columns go through ``WOEEncoder``.

    Attributes
    ----------
    binary_cols : list
        Object columns that are dummy-encoded.
    multi_cols : list
        Object columns that are WOE-encoded.
    encoder : WOEEncoder or None
        Fitted encoder for ``multi_cols``; ``None`` when there are none.
    binary_categories_ : dict
        Column name -> list of category levels seen during ``fit``.
    """

    def __init__(self, threshold=2):
        self.threshold = threshold
        self.binary_cols = []
        self.multi_cols = []
        self.encoder = None
        self.binary_categories_ = {}

    def fit(self, X, y):
        """Classify the object columns of ``X`` and fit the WOE encoder on ``y``."""
        # Start from a clean state so that calling fit() twice does not append
        # every column name a second time.
        self.binary_cols = []
        self.multi_cols = []
        self.binary_categories_ = {}
        self.encoder = None

        for col in X.columns:
            if X[col].dtype == 'object':
                if X[col].nunique() <= self.threshold:
                    self.binary_cols.append(col)
                    # Remember the levels (in pandas' own category order) so
                    # transform() emits the same dummy columns for any data.
                    self.binary_categories_[col] = (
                        X[col].astype('category').cat.categories.tolist()
                    )
                else:
                    self.multi_cols.append(col)

        # Only build the encoder when there is something to encode.
        if self.multi_cols:
            self.encoder = WOEEncoder(cols=self.multi_cols)
            self.encoder.fit(X[self.multi_cols], y)

        return self

    def transform(self, X):
        """Return an encoded copy of ``X``; the input frame is not modified."""
        X = X.copy()
        if self.multi_cols:
            X[self.multi_cols] = self.encoder.transform(X[self.multi_cols])

        if self.binary_cols:
            # Pin each column to the levels learned in fit(). Without this,
            # get_dummies derives the columns from the values present in THIS
            # frame, so a validation set missing one level (or containing a new
            # one) would produce a different set of output columns.
            for col in self.binary_cols:
                X[col] = pd.Categorical(X[col], categories=self.binary_categories_[col])
            X = pd.get_dummies(X, columns=self.binary_cols, drop_first=True, dtype=int)

        return X

In [None]:
class HighCorrelationDropper(BaseEstimator, TransformerMixin):
    """Drop features that are highly correlated with a later column.

    For every pair of columns whose absolute correlation exceeds ``threshold``
    the column that appears first in the frame is dropped.

    Parameters
    ----------
    threshold : float, default 0.85
        Absolute-correlation cut-off (strictly greater than).

    Attributes
    ----------
    high_corr_cols_ : list or None
        Columns selected for removal, in frame order; ``None`` until fitted.
    """

    def __init__(self, threshold=0.85):
        self.threshold = threshold
        self.high_corr_cols_ = None

    def fit(self, X, y=None):
        """Find the columns of ``X`` to drop.

        Raises
        ------
        TypeError
            If ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Input must be a pandas DataFrame")

        # Correlation is only defined for numeric data; restricting the frame
        # avoids the error recent pandas raises on non-numeric columns.
        numeric = X.select_dtypes(include=['number', 'bool'])
        corr_matrix = numeric.corr().abs()

        # Keep only entries above the diagonal: cell (i, j) with i < j pairs an
        # earlier column i with a later column j.
        upper_triangle = np.triu(corr_matrix.to_numpy(), k=1)

        # A row is flagged when it correlates with any later column. NaN
        # correlations (e.g. constant columns) compare False and are kept.
        has_later_partner = (upper_triangle > self.threshold).any(axis=1)

        # Index-based selection keeps frame order, so the list is deterministic.
        self.high_corr_cols_ = corr_matrix.index[has_later_partner].tolist()
        print(f"Identified {len(self.high_corr_cols_)} highly correlated features to remove")

        return self

    def transform(self, X):
        """Return ``X`` without the columns chosen in ``fit``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.high_corr_cols_ is None:
            raise RuntimeError("Need Fitting ! ")

        return X.drop(columns=self.high_corr_cols_)

# Feature Selection

In [None]:
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class BoostedFeatureSelector(BaseEstimator, TransformerMixin):
    """Two-phase recursive feature elimination (RFE) ranked by an XGBoost regressor.

    Phase 1 (only for wide inputs) removes features in large steps; phase 2
    removes them one at a time until ``target_feature_count`` remain.

    Parameters
    ----------
    target_feature_count : int, default 150
        Number of features to keep. Capped at the number of input features.
        Non-integer values are passed to ``RFE`` unchanged.
    xgb_trees, tree_depth, lr, row_sampling, col_sampling, parallel_jobs, seed
        Forwarded to ``XGBRegressor`` as ``n_estimators``, ``max_depth``,
        ``learning_rate``, ``subsample``, ``colsample_bytree``, ``n_jobs`` and
        ``random_state`` respectively.
    silent : bool, default True
        ``True`` sets XGBoost ``verbosity=0``, otherwise ``1``.

    Attributes
    ----------
    kept_features_ : list or None
        Selected column names (DataFrame input) or column positions (array input).
    feature_mask_ : ndarray of bool or None
        Selection mask over the columns of the matrix passed to ``fit``.
    feature_ranks_ : ndarray of int or None
        Rank per input column; selected features have rank 1.

    Notes
    -----
    NOTE(review): the ranking model is a squared-error regressor; confirm this
    is intended when the target is a binary label.
    """

    # Size of the coarse phase-1 cut; raised automatically when the requested
    # feature count is larger, so phase 1 never removes features phase 2 needs.
    _ROUGH_CUT = 200

    def __init__(
        self,
        target_feature_count=150,
        xgb_trees=100,
        tree_depth=6,
        lr=0.05,
        row_sampling=0.8,
        col_sampling=0.8,
        parallel_jobs=-1,
        seed=42,
        silent=True
    ):
        self.target_feature_count = target_feature_count
        self.xgb_trees = xgb_trees
        self.tree_depth = tree_depth
        self.lr = lr
        self.row_sampling = row_sampling
        self.col_sampling = col_sampling
        self.parallel_jobs = parallel_jobs
        self.seed = seed
        self.silent = silent
        self.kept_features_ = None
        self.feature_mask_ = None
        self.feature_ranks_ = None
        self._fitted_with_names = False

    @staticmethod
    def _select_columns(X, mask):
        """Return the columns of ``X`` where ``mask`` is True (DataFrame or array)."""
        if isinstance(X, pd.DataFrame):
            return X.loc[:, mask]
        return X[:, mask]

    def fit(self, X, y):
        """Run the elimination and store the selection in input-column space."""
        print("Starting feature selection...")

        self._fitted_with_names = isinstance(X, pd.DataFrame)
        n_total = X.shape[1]

        target = self.target_feature_count
        if isinstance(target, (int, np.integer)) and not isinstance(target, bool):
            # Never ask for more features than exist, and never let the coarse
            # cut go below the requested count.
            target = min(int(target), n_total)
            rough_limit = max(self._ROUGH_CUT, target)
        else:
            rough_limit = self._ROUGH_CUT

        xgb_model = XGBRegressor(
            n_estimators=self.xgb_trees,
            max_depth=self.tree_depth,
            learning_rate=self.lr,
            subsample=self.row_sampling,
            colsample_bytree=self.col_sampling,
            n_jobs=self.parallel_jobs,
            random_state=self.seed,
            verbosity=0 if self.silent else 1,
            tree_method='hist',
            objective='reg:squarederror'
        )

        # Positions (in the original matrix) of the columns still in play.
        survivors = np.arange(n_total)
        rough_ranks = None
        current = X

        if n_total > rough_limit:
            print(" Phase 1: Fast elimination of weak features...")
            rough_selector = RFE(
                estimator=xgb_model,
                n_features_to_select=rough_limit,
                step=max(1, n_total // 20)
            )
            rough_selector.fit(current, y)
            rough_ranks = rough_selector.ranking_
            survivors = np.flatnonzero(rough_selector.support_)
            current = self._select_columns(current, rough_selector.support_)

        print(" Phase 2: Fine-tuning feature set...")
        final_selector = RFE(
            estimator=xgb_model,
            n_features_to_select=target,
            step=1
        )
        final_selector.fit(current, y)

        # Map the phase-2 result back onto the original column positions so the
        # mask and ranks line up with whatever is later passed to transform().
        mask = np.zeros(n_total, dtype=bool)
        mask[survivors[final_selector.support_]] = True

        ranks = np.empty(n_total, dtype=int)
        ranks[survivors] = final_selector.ranking_
        if rough_ranks is not None:
            dropped_early = np.ones(n_total, dtype=bool)
            dropped_early[survivors] = False
            # Phase-1 casualties rank after every phase-2 feature, keeping
            # their relative phase-1 order (their phase-1 ranks start at 2).
            ranks[dropped_early] = (
                rough_ranks[dropped_early] - 1 + final_selector.ranking_.max()
            )

        self.feature_mask_ = mask
        self.feature_ranks_ = ranks
        if self._fitted_with_names:
            self.kept_features_ = X.columns[mask].tolist()
        else:
            self.kept_features_ = np.flatnonzero(mask).tolist()

        print(f"Done! Selected {len(self.kept_features_)} features.")
        return self

    def transform(self, X):
        """Return only the selected columns of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.kept_features_ is None:
            raise RuntimeError("Fit the selector first!")

        if isinstance(X, pd.DataFrame):
            if self._fitted_with_names:
                return X[self.kept_features_]
            # Fitted on an unnamed array: fall back to positional selection.
            return X.loc[:, self.feature_mask_]
        return X[:, self.feature_mask_]

    def get_feature_indices(self, as_indices=False):
        """Return the boolean selection mask, or integer positions if ``as_indices``."""
        return np.where(self.feature_mask_)[0] if as_indices else self.feature_mask_

In [None]:
# Progress marker: in a top-to-bottom run this prints after the class-definition cells above have executed.
print('The Classes I created are working so far')

# Training

In [None]:

# Features (everything except the row id and the label) and the label itself.
kva = train.drop(['TransactionID', 'isFraud'], axis=1)
target = train['isFraud']

# 70/30 split, stratified on the label so both parts keep the same fraud rate.
# The labels argument was the undefined name `ta` (NameError); it must be `target`.
X_train, X_val, y_train, y_val = train_test_split(kva,target,test_size=0.3,stratify=target,random_state=42)

In [None]:

# Send MLflow runs to the tracking URI below and group them under one experiment.
mlflow.set_tracking_uri('https://dagshub.com/kechik21/ML_HW2.mlflow')
mlflow.set_experiment("Fraud_Detection_XGBoost")

# (count - sum) / sum of the training labels; this is the negative-to-positive
# ratio assuming y_train holds 0/1 labels — TODO confirm.
fraud_class_weight = (len(y_train) - sum(y_train)) / sum(y_train)

# Steps run in the listed order; each transformer is fitted on the output of the previous one.
fraud_pipeline = Pipeline([
    # Data Cleaning
    ('missing_value_handler', MissingValueDropper(threshold=90)),
    ('data_imputer', FillNa(num_cols=num_cols, cat_cols=cat_cols)),
    
    # Feature Engineering
    ('categorical_processor', CatEncoder(threshold=2)),
    ('correlation_filter', HighCorrelationDropper(threshold=0.85)),
    # NOTE(review): the target count is 80% of the RAW column count of X_train,
    # i.e. computed before the earlier steps drop or add columns.
    ('feature_selector', BoostedFeatureSelector(
        target_feature_count=int(len(X_train.columns)*0.8),
        xgb_trees=100,
        tree_depth=6,
        lr=0.05
    )),
    
    # Model Training
    ('fraud_classifier', XGBClassifier(
        scale_pos_weight=fraud_class_weight,
        eval_metric='aucpr',
        use_label_encoder=False,
        random_state=42
    ))
])


# Everything logged inside this block is attached to a single MLflow run.
with mlflow.start_run(run_name="Production_Fraud_Model"):
    # Log pipeline configuration
    mlflow.log_params({
        "pipeline_steps": [name for name, _ in fraud_pipeline.steps],
        "class_imbalance_ratio": fraud_class_weight,
        "feature_selection": "XGBoost_RFE"
    })
    
    print("Starting pipeline execution...")
    fraud_pipeline.fit(X_train, y_train)
    print("Training completed")
    
    # Column 1 of predict_proba is used as the fraud score; predict() gives hard labels.
    # Each call pushes X_val through every pipeline step again.
    fraud_probabilities = fraud_pipeline.predict_proba(X_val)[:, 1]
    fraud_predictions = fraud_pipeline.predict(X_val)
    
    # Score-based metrics use the probabilities; F1/precision/recall use the hard labels.
    pipeline_metrics = {
        "validation_auc": roc_auc_score(y_val, fraud_probabilities),
        "validation_ap": average_precision_score(y_val, fraud_probabilities),
        "validation_f1": f1_score(y_val, fraud_predictions),
        "fraud_precision": precision_score(y_val, fraud_predictions),
        "fraud_recall": recall_score(y_val, fraud_predictions)
    }
    
    mlflow.log_metrics(pipeline_metrics)
    # Store the full per-class report as a JSON artifact of the run.
    mlflow.log_dict(
        classification_report(y_val, fraud_predictions, output_dict=True),
        "classification_report.json"
    )
    
    # Plot the 20 most important features of the fitted classifier and attach the figure to the run.
    importance_plot = plt.figure(figsize=(12, 8))
    plot_importance(
        fraud_pipeline.named_steps['fraud_classifier'],
        ax=importance_plot.gca(),
        max_num_features=20
    )
    plt.tight_layout()
    mlflow.log_figure(importance_plot, "feature_importance.png")
    plt.close()
    
    # Register model
    # The whole fitted pipeline (preprocessing + classifier) is logged, not just the classifier.
    mlflow.sklearn.log_model(
        fraud_pipeline,
        "fraud_detection_model",
        registered_model_name="Production_Fraud_Pipeline"
    )
    
    # The value printed is the validation F1 of this single run.
    print(f" Pipeline deployed | Best F1: {pipeline_metrics['validation_f1']:.4f}")