# 2.0 - Data Preprocessing and Feature Engineering

This notebook demonstrates the application of the data processing pipeline defined in `src/data_processing.py`. The goal is to transform the raw transaction data into a model-ready format by extracting features, creating aggregates, handling missing values, encoding categorical variables (including Weight of Evidence), and scaling numerical features.


### 2.1 Setup and Load Data

First, import the necessary libraries. Our custom preprocessing function is imported from `src/data_processing` in the next cell, and the raw data is loaded further below.


In [1]:
import pandas as pd
import numpy as np
import sys
import os

### Add the project root to the Python path so the `src` package can be imported

In [2]:
# The notebook is expected to run from a directory two levels below the
# project root; resolve that root to an absolute, normalised path.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))

# Register the root on the import path exactly once so re-running this cell
# does not pile up duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

from src.data_processing import run_preprocessing

#### Define the `PreprocessingRunner` class, which wraps the core preprocessing function imported from the src script

In [3]:
class PreprocessingRunner:
    """
    Load the raw transaction data, run the preprocessing pipeline, and print
    sanity checks on the transformed result.

    The raw data is read as soon as the object is constructed. If the CSV is
    missing, a synthetic 1000-row frame is generated instead so the rest of the
    notebook can still be executed; ``using_dummy_data`` records that this
    happened.

    Attributes:
        data_path (str): Location of the raw CSV file.
        target_column (str): Name of the target column passed to the pipeline.
        random_state (int | None): Seed for the dummy-data generator.
        using_dummy_data (bool): True when ``df_raw`` is synthetic.
        df_raw (pd.DataFrame): The raw (or dummy) input frame.
        X_processed: Processed features; ``None`` until the pipeline has run.
        y: Target values; ``None`` until the pipeline has run.
    """

    # Column names the verification step looks for in the processed frame.
    # All checks are name-based: if the pipeline emits generic names
    # (e.g. ``feature_0``), they will simply report the columns as not found.
    _TIME_FEATURE_COLS = (
        "transaction_hour",
        "transaction_day_of_week",
        "transaction_day_of_month",
        "transaction_month",
        "transaction_year",
    )
    # These should retain their names but be numerical
    _WOE_COLS = ("ProductCategory", "ChannelId")
    # Add others you expect to be scaled
    _SCALED_CHECK_COLS = (
        "Amount",
        "Value",
        "CountryCode",
        "PricingStrategy",
        "transaction_hour",
        "amount_total_accountid",
        "value_avg_accountid",
    )

    def __init__(self, data_path, target_column="FraudResult", random_state=42):
        """
        Initializes the runner by loading the raw data.

        Args:
            data_path (str): The file path to the raw dataset (e.g., 'data/raw/transactions.csv').
            target_column (str): The name of the target variable column.
            random_state (int | None): Seed used only when dummy data has to be
                generated. The fixed default keeps the fallback reproducible
                across kernel restarts; pass ``None`` for fresh random data.
        """
        self.data_path = data_path
        self.target_column = target_column
        self.random_state = random_state
        # Must be set before loading: _load_raw_data flips it on fallback.
        self.using_dummy_data = False
        self.df_raw = self._load_raw_data()
        self.X_processed = None
        self.y = None

    def _load_raw_data(self):
        """
        Loads the raw dataset or creates dummy data if not found.

        Only ``FileNotFoundError`` triggers the fallback; any other read error
        (e.g. a malformed CSV) propagates to the caller.

        Returns:
            pd.DataFrame: The CSV contents, or the synthetic frame.
        """
        try:
            df = pd.read_csv(self.data_path)
        except FileNotFoundError:
            print(f"Error: Raw data file '{self.data_path}' not found.")
            print(
                "Creating dummy data for demonstration purposes as the file was not found."
            )
            df = self._make_dummy_data()
            self.using_dummy_data = True
            print("Using dummy data for demonstration.")
            return df
        print("Raw data loaded successfully.")
        return df

    def _make_dummy_data(self, n_rows=1000):
        """
        Builds a synthetic transactions frame with the 16 raw column names.

        A local generator seeded with ``self.random_state`` is used, so the
        global NumPy random state is left untouched and the result is
        reproducible for a given seed.

        NOTE(review): the ID columns here are plain integers; confirm the
        pipeline treats them the same way as the ID values in the real CSV.
        """
        rng = np.random.default_rng(self.random_state)
        seconds_in_year = 365 * 24 * 60 * 60
        start_offsets = rng.integers(0, seconds_in_year, n_rows)
        dummy_data = {
            "TransactionId": range(1, n_rows + 1),
            "BatchId": rng.integers(1, 50, n_rows),
            "AccountId": rng.integers(1000, 5000, n_rows),
            "SubscriptionId": rng.integers(100, 300, n_rows),
            "CustomerId": rng.integers(1000, 5000, n_rows),
            "CurrencyCode": rng.choice(
                ["KES", "USD", "EUR"], size=n_rows, p=[0.7, 0.2, 0.1]
            ),
            "CountryCode": rng.integers(254, 300, n_rows),
            "ProviderId": rng.integers(1, 10, n_rows),
            "ProductId": rng.integers(10000, 10010, n_rows),
            "ProductCategory": rng.choice(
                ["Bills", "Airtime", "Data", "Other"],
                size=n_rows,
                p=[0.4, 0.3, 0.2, 0.1],
            ),
            "ChannelId": rng.choice(
                ["Web", "Android", "IOS", "Pay Later", "Checkout"], size=n_rows
            ),
            "Amount": rng.uniform(-10000, 50000, n_rows),
            "Value": rng.uniform(0, 50000, n_rows),
            "TransactionStartTime": pd.to_datetime("2024-01-01")
            + pd.to_timedelta(start_offsets, unit="s"),
            "PricingStrategy": rng.choice([1, 2, 3, 4, 5], size=n_rows),
            "FraudResult": rng.choice([0, 1], size=n_rows, p=[0.95, 0.05]),
        }
        return pd.DataFrame(dummy_data)

    def display_raw_data_info(self):
        """Displays initial information (shape, head, info) about the raw data."""
        print(f"\nInitial raw data shape: {self.df_raw.shape}")
        print("Raw data head:")
        print(self.df_raw.head())
        print("\nRaw data info:")
        self.df_raw.info()

    def run_preprocessing_pipeline(self):
        """
        Executes the preprocessing pipeline from src/data_processing.py.

        The pipeline receives a copy of the raw frame, so ``df_raw`` is not
        modified and this method can be re-run safely. Results are stored on
        ``X_processed`` and ``y``.
        """
        print("\n--- Running Preprocessing Pipeline ---")
        self.X_processed, self.y = run_preprocessing(
            self.df_raw.copy(), target_column=self.target_column
        )
        print("\nPreprocessing complete.")

    def verify_transformed_data(self):
        """
        Verifies the transformed data, checking for new features, encoding, and scaling.

        Every check only prints its findings; nothing is raised or returned, so
        warnings in the output have to be read by the user.
        """
        if self.X_processed is None or self.y is None:
            print(
                "Preprocessing has not been run yet. Please run `run_preprocessing_pipeline()` first."
            )
            return

        X = self.X_processed

        print("\n--- Preprocessing Results Verification ---")
        print("Processed features (X) head:")
        print(X.head())

        print("\nProcessed features (X) info:")
        X.info()

        print("\nTarget variable (y) head:")
        print(self.y.head())

        self._verify_missing_values(X)

        print("\nCheck data types of processed features:")
        print(X.dtypes.value_counts())

        self._verify_time_features(X)
        self._verify_aggregate_features(X)
        self._verify_one_hot_encoding(X)
        self._verify_woe_encoding(X)
        self._verify_scaling(X)

    @staticmethod
    def _report_columns(X, columns, dtype_header, missing_message):
        """Prints head and dtypes of ``columns``; prints ``missing_message`` and returns False if empty."""
        if not columns:
            print(missing_message)
            return False
        print(X[columns].head())
        print(dtype_header)
        print(X[columns].dtypes)
        return True

    def _verify_missing_values(self, X):
        """Reports the total number of nulls left in the processed features."""
        print("\nCheck for missing values in processed X (should be 0):")
        total_missing = X.isnull().sum().sum()
        print(f"Total missing values: {total_missing}")
        if total_missing != 0:
            print("Warning: Missing values still found in processed data!")

    def _verify_time_features(self, X):
        """Shows the extracted date/time columns that are present in X."""
        print("\nVerifying new time-based features (should be numerical):")
        present = [col for col in self._TIME_FEATURE_COLS if col in X.columns]
        self._report_columns(
            X,
            present,
            "\nData types of time-based features:",
            "No time-based features found. Check `DateTimeFeatureExtractor` in `data_processing.py`.",
        )

    def _verify_aggregate_features(self, X):
        """Shows per-account aggregate columns, matched by the '_accountid' substring."""
        print("\nVerifying new aggregated features (should be numerical):")
        present = [col for col in X.columns if "_accountid" in col]
        self._report_columns(
            X,
            present,
            "\nData types of aggregated features:",
            "No aggregate features found in processed X. Check `create_aggregated_features` in `data_processing.py`.",
        )

    def _verify_one_hot_encoding(self, X):
        """Checks that every 'CurrencyCode_*' column contains only 0/1 values."""
        print(
            "\nVerifying encoding of 'CurrencyCode' (One-Hot Encoded - should be binary):"
        )
        ohe_cols = [col for col in X.columns if "CurrencyCode_" in col]
        if not ohe_cols:
            print(
                "One-Hot Encoded CurrencyCode columns not found. Check names or encoding setup."
            )
            return
        print(X[ohe_cols].head())
        # A simple check for binary values
        if all(X[col].isin([0, 1]).all() for col in ohe_cols):
            print("One-Hot Encoded columns appear to be binary.")
        else:
            print("Warning: One-Hot Encoded columns contain non-binary values.")

    def _verify_woe_encoding(self, X):
        """Checks that the WOE-encoded columns are present and numeric."""
        print(
            "\nVerifying encoding of 'ProductCategory' and 'ChannelId' (WOE Encoded - should be numerical):"
        )
        present = [col for col in self._WOE_COLS if col in X.columns]
        # This branch is reached only when the columns are absent, so the
        # message no longer claims they might be "not numerical".
        found = self._report_columns(
            X,
            present,
            "\nData types of WOE encoded columns:",
            "WOE encoded columns not found. Check names or encoding setup.",
        )
        if not found:
            return
        if all(pd.api.types.is_numeric_dtype(X[col]) for col in present):
            print("WOE encoded columns are numerical as expected.")
        else:
            print("Warning: WOE encoded columns are not all numerical.")

    def _verify_scaling(self, X):
        """Reports whether selected numeric columns have mean ~0 and std ~1 (within 0.1)."""
        print("\nVerifying scaling of numerical features (e.g., Amount, Value):")
        for col in self._SCALED_CHECK_COLS:
            if col not in X.columns:
                print(f"{col} not found in processed X for scaling check.")
                continue
            if not pd.api.types.is_numeric_dtype(X[col]):
                print(f"{col} is not numerical in processed X, skipping scaling check.")
                continue

            # Compute the statistics once; they are used for both the check
            # and the report line.
            mean = X[col].mean()
            std = X[col].std()
            # Check if mean is close to 0 and std dev close to 1
            if np.isclose(mean, 0, atol=0.1) and np.isclose(std, 1, atol=0.1):
                print(
                    f"{col}: Mean = {mean:.4f} (close to 0), Std Dev = {std:.4f} (close to 1) - Appears scaled."
                )
            else:
                print(
                    f"{col}: Mean = {mean:.4f}, Std Dev = {std:.4f} - May not be scaled correctly or has specific distribution."
                )

# Run Preprocessing Pipeline

### Define the path to raw data

In [4]:
# Relative path to the raw CSV; it is resolved against the kernel's current
# working directory. If the file is missing, PreprocessingRunner falls back to
# generated dummy data instead of failing.
data_path = "../../data/raw/data.csv"
# Name of the label column handed to the preprocessing pipeline.
target_column = "FraudResult"

#### Instantiate the PreprocessingRunner

In [5]:
# Constructing the runner reads the raw data immediately (see the message printed below).
runner = PreprocessingRunner(data_path=data_path, target_column=target_column)

Raw data loaded successfully.


## Display initial raw data info

In [6]:
# Prints the shape, the first rows, and DataFrame.info() of the raw frame.
runner.display_raw_data_info()


Initial raw data shape: (95662, 16)
Raw data head:
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCat

# Run the preprocessing pipeline

In [7]:
# Runs run_preprocessing on a copy of the raw frame; results land on
# runner.X_processed and runner.y.
runner.run_preprocessing_pipeline()


--- Running Preprocessing Pipeline ---
Starting preprocessing...
Preprocessing complete.

Preprocessing complete.


# Verify the transformed data

In [8]:
# Prints diagnostic checks only; nothing is asserted, so any "Warning:" or
# "not found" lines in the output below need to be reviewed manually.
runner.verify_transformed_data()


--- Preprocessing Results Verification ---
Processed features (X) head:
   feature_0  feature_1  feature_2  feature_3  feature_4  feature_5  \
0  -0.046371  -0.072291        0.0  -0.349252  -2.155530  -0.006389   
1  -0.054643  -0.080251        0.0  -0.349252  -2.155530  -0.006389   
2  -0.050426  -0.076352        0.0  -0.349252  -2.155530  -0.006389   
3   0.107717   0.096648        0.0  -0.349252  -1.949214  -0.006389   
4  -0.059704  -0.075183        0.0  -0.349252  -1.949214  -0.006389   

   feature_6  feature_7  feature_8  feature_9  feature_10  feature_11  \
0  -0.100739   0.848684  -0.994246  -0.718149         1.0    1.620379   
1  -0.100739   0.848684  -0.994246   1.444841         1.0   -0.565446   
2  -0.100739   0.848684  -0.994246  -0.722639         1.0    1.620379   
3  -0.100739   0.848684  -0.994246  -0.720955         1.0   -1.134963   
4  -0.100739   0.848684  -0.994246   1.444841         1.0   -0.565446   

   feature_12  feature_13   feature_14    feature_15  feature