In [90]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import scorecardpy as sc

In [91]:
# # 1Create Aggregate Features

In [92]:
# Load your dataset (replace 'your_file.csv' with your actual file path)
df = pd.read_csv('../data/proccessed/cleaned_data.csv')

In [93]:
# Total Transaction Amount per customer
df['TotalTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('sum')

# Average Transaction Amount per customer
df['AverageTransactionAmount'] = df.groupby('CustomerId')['Amount'].transform('mean')

# Transaction Count per customer
df['TransactionCount'] = df.groupby('CustomerId')['TransactionId'].transform('count')

# Standard Deviation of Transaction Amounts per customer
df['TransactionAmountStd'] = df.groupby('CustomerId')['Amount'].transform('std').fillna(0)

In [94]:
## Extract Time-Based Features

In [95]:
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'], errors='coerce')

df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year

In [96]:
## Encode Categorical Variables using WOE

In [97]:
#  Encode Categorical Variables using scorecardpy's WOE
# ---------------------
# Assuming 'FraudResult' is the target variable and 'ProductCategory', 'ProviderId', 'ChannelId' are the features
features = ['ProductCategory', 'ProviderId', 'ChannelId']

# Calculate the WOE and IV for each feature
bins = sc.woebin(df, y='FraudResult', x=features)

# Apply the WOE transformation to the dataset
df_woe = sc.woebin_ply(df, bins)

[INFO] creating woe binning ...


  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols = dat.apply(pd.to_numeric,errors='ignore').select_dtypes(object).apply(pd.to_datetime,errors='ignore').select_dtypes('datetime64').columns.tolist()
  datetime_cols 

[INFO] converting into woe values ...


In [98]:
##  Handle Missing Values

In [99]:
# Fill missing numerical columns with median
for col in ['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']:
    df_woe[col].fillna(df_woe[col].median(), inplace=True)

# Handle missing values for categorical WOE columns
for col in ['ProductCategory_woe', 'CurrencyCode', 'ProviderId_woe', 'ChannelId_woe']:
    df_woe[col].fillna(df_woe[col].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_woe[col].fillna(df_woe[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_woe[col].fillna(df_woe[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [100]:
## Normalize/Standardize Numerical Features

In [101]:
# Standardize (mean=0, std=1) the numerical features
scaler = StandardScaler()
df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']] = scaler.fit_transform(
    df_woe[['Amount', 'Value', 'TotalTransactionAmount', 'AverageTransactionAmount', 'TransactionAmountStd']]
)

In [102]:
# Final check of the dataset
print(df_woe.head())

       TransactionStartTime  TransactionMonth  PricingStrategy  CountryCode  \
0 2018-11-15 02:18:49+00:00                11                2          256   
1 2018-11-15 02:19:08+00:00                11                2          256   
2 2018-11-15 02:44:21+00:00                11                2          256   
3 2018-11-15 03:32:55+00:00                11                2          256   
4 2018-11-15 03:34:21+00:00                11                2          256   

   TransactionDay  TransactionHour  AverageTransactionAmount  \
0              15                2                 -0.067623   
1              15                2                 -0.067623   
2              15                2                 -0.072568   
3              15                3                 -0.008155   
4              15                3                 -0.008155   

   TransactionCount        TransactionId  TotalTransactionAmount  ...  \
0               119  TransactionId_76871                0.170118  .

In [103]:
# Save the processed dataset to CSV 
df_woe.to_csv('../data/proccessed/processed_data.csv', index=False)