In [1]:
# =============================================================================
# FEATURE ENGINEERING - STEP 1: SETUP AND MERGE DATA
# =============================================================================

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', 100)

print("=" * 70)
print("üîß FEATURE ENGINEERING")
print("=" * 70)

# -----------------------------------------------------------------------------
# LOAD DATA
# -----------------------------------------------------------------------------
print("\nüìÅ Loading data...")

DATA_PATH = Path(r"C:\Users\aadik\Desktop\FraudDetection\data")

df_transaction = pd.read_csv(DATA_PATH / 'train_transaction.csv')
df_identity = pd.read_csv(DATA_PATH / 'train_identity.csv')

print(f"   Transactions: {len(df_transaction):,} rows, {len(df_transaction.columns)} columns")
print(f"   Identity: {len(df_identity):,} rows, {len(df_identity.columns)} columns")

# -----------------------------------------------------------------------------
# MERGE TABLES
# -----------------------------------------------------------------------------
print("\nüîó Merging tables...")

# Left join: Keep all transactions, add identity info where available
df = df_transaction.merge(df_identity, on='TransactionID', how='left')

print(f"   Merged dataset: {len(df):,} rows, {len(df.columns)} columns")

# Verify merge worked correctly
transactions_with_identity = df['DeviceType'].notna().sum()
print(f"   Transactions with identity info: {transactions_with_identity:,} ({transactions_with_identity/len(df)*100:.1f}%)")
print(f"   Transactions without identity info: {len(df) - transactions_with_identity:,} ({(len(df)-transactions_with_identity)/len(df)*100:.1f}%)")

# Verify no rows were lost
assert len(df) == len(df_transaction), "ERROR: Row count changed after merge!"
print("\n   ‚úÖ Merge successful ‚Äî all transactions preserved")

# -----------------------------------------------------------------------------
# CHECK CURRENT STATE
# -----------------------------------------------------------------------------
print("\nüìä Current dataset info:")
print(f"   Shape: {df.shape}")
print(f"   Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")
print(f"   Target variable (isFraud): {df['isFraud'].sum():,} frauds ({df['isFraud'].mean()*100:.2f}%)")

üîß FEATURE ENGINEERING

üìÅ Loading data...
   Transactions: 590,540 rows, 394 columns
   Identity: 144,233 rows, 41 columns

üîó Merging tables...
   Merged dataset: 590,540 rows, 434 columns
   Transactions with identity info: 140,810 (23.8%)
   Transactions without identity info: 449,730 (76.2%)

   ‚úÖ Merge successful ‚Äî all transactions preserved

üìä Current dataset info:
   Shape: (590540, 434)
   Memory usage: 2567.1 MB
   Target variable (isFraud): 20,663 frauds (3.50%)


In [2]:
print(f"DataFrame shape: {df.shape}")
print(f"First 3 columns: {df.columns[:3].tolist()}")
print("Test passed!")

DataFrame shape: (590540, 434)
First 3 columns: ['TransactionID', 'isFraud', 'TransactionDT']
Test passed!


In [3]:

print("\n" + "=" * 70)
print("‚è∞ STEP 2: TIME-BASED FEATURES")
print("=" * 70)

# -----------------------------------------------------------------------------
# UNDERSTANDING TransactionDT
# -----------------------------------------------------------------------------
# TransactionDT is in seconds from a reference point
# We don't know the exact start date, but we can extract relative time features

print("\nüìä TransactionDT Analysis:")
print(f"   Minimum value: {df['TransactionDT'].min():,} seconds")
print(f"   Maximum value: {df['TransactionDT'].max():,} seconds")
print(f"   Range: {(df['TransactionDT'].max() - df['TransactionDT'].min()) / 86400:.1f} days")

# -----------------------------------------------------------------------------
# CREATE TIME FEATURES
# -----------------------------------------------------------------------------

# Hour of day (0-23)
# 3600 seconds = 1 hour
# We use modulo 24 to wrap around (hour 25 becomes hour 1)
df['hour'] = (df['TransactionDT'] // 3600) % 24

# Day of week (0-6)
# 86400 seconds = 1 day
# We use modulo 7 to get day of week
df['day_of_week'] = (df['TransactionDT'] // 86400) % 7

# Is it nighttime? (midnight to 6 AM)
# Based on EDA: late night has different patterns
df['is_night'] = (df['hour'] <= 6).astype(int)

# Is it a risky hour? (7 AM to 9 AM)
# Based on EDA: these hours have highest fraud rate (10%+)
df['is_risky_hour'] = ((df['hour'] >= 7) & (df['hour'] <= 9)).astype(int)

# Is it weekend? (assuming day 5 and 6 are weekend)
# Note: We don't know actual day names, but patterns should still help
df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

# -----------------------------------------------------------------------------
# VERIFY NEW FEATURES
# -----------------------------------------------------------------------------
print("\n‚úÖ Created time features:")
time_features = ['hour', 'day_of_week', 'is_night', 'is_risky_hour', 'is_weekend']

for feature in time_features:
    unique_count = df[feature].nunique()
    print(f"   {feature}: {unique_count} unique values, range [{df[feature].min()} - {df[feature].max()}]")

# -----------------------------------------------------------------------------
# VALIDATE: Check fraud rates match our EDA findings
# -----------------------------------------------------------------------------
print("\nüîç Validation ‚Äî Do fraud rates match our EDA?")

# Risky hour fraud rate
risky_hour_fraud = df[df['is_risky_hour'] == 1]['isFraud'].mean() * 100
normal_hour_fraud = df[df['is_risky_hour'] == 0]['isFraud'].mean() * 100
print(f"   Risky hours (7-9 AM): {risky_hour_fraud:.2f}% fraud")
print(f"   Normal hours: {normal_hour_fraud:.2f}% fraud")
print(f"   ‚Üí Risky hours have {risky_hour_fraud/normal_hour_fraud:.1f}x more fraud ‚úì")

# Night fraud rate
night_fraud = df[df['is_night'] == 1]['isFraud'].mean() * 100
day_fraud = df[df['is_night'] == 0]['isFraud'].mean() * 100
print(f"\n   Night (0-6 AM): {night_fraud:.2f}% fraud")
print(f"   Day (7 AM - 11 PM): {day_fraud:.2f}% fraud")

print(f"\n   ‚úÖ Time features created successfully!")
print(f"   New column count: {len(df.columns)}")


‚è∞ STEP 2: TIME-BASED FEATURES

üìä TransactionDT Analysis:
   Minimum value: 86,400 seconds
   Maximum value: 15,811,131 seconds
   Range: 182.0 days

‚úÖ Created time features:
   hour: 24 unique values, range [0 - 23]
   day_of_week: 7 unique values, range [0 - 6]
   is_night: 2 unique values, range [0 - 1]
   is_risky_hour: 2 unique values, range [0 - 1]
   is_weekend: 2 unique values, range [0 - 1]

üîç Validation ‚Äî Do fraud rates match our EDA?
   Risky hours (7-9 AM): 9.77% fraud
   Normal hours: 3.40% fraud
   ‚Üí Risky hours have 2.9x more fraud ‚úì

   Night (0-6 AM): 3.99% fraud
   Day (7 AM - 11 PM): 3.33% fraud

   ‚úÖ Time features created successfully!
   New column count: 439


In [4]:
# =============================================================================
# STEP 3: MISSING DATA FLAGS
# =============================================================================

print("\n" + "=" * 70)
print("üö© STEP 3: MISSING DATA FLAGS")
print("=" * 70)

# -----------------------------------------------------------------------------
# WHY MISSING DATA FLAGS?
# -----------------------------------------------------------------------------
# From EDA we learned:
#   - Missing addr1/addr2 = 4x more fraud
#   - Missing dist1/dist2 = 3x more fraud  
#   - Missing email = suspicious
# Instead of just filling missing values, we CREATE A FEATURE from it

# -----------------------------------------------------------------------------
# CREATE MISSING FLAGS
# -----------------------------------------------------------------------------

# Address missing flags
df['addr1_missing'] = df['addr1'].isna().astype(int)
df['addr2_missing'] = df['addr2'].isna().astype(int)

# Distance missing flags
df['dist1_missing'] = df['dist1'].isna().astype(int)
df['dist2_missing'] = df['dist2'].isna().astype(int)

# Email missing flags
df['P_emaildomain_missing'] = df['P_emaildomain'].isna().astype(int)
df['R_emaildomain_missing'] = df['R_emaildomain'].isna().astype(int)

# Identity info missing (from identity table)
df['has_identity'] = df['DeviceType'].notna().astype(int)

# -----------------------------------------------------------------------------
# VERIFY AND VALIDATE
# -----------------------------------------------------------------------------
print("\n‚úÖ Created missing data flags:")

missing_features = ['addr1_missing', 'addr2_missing', 'dist1_missing', 
                    'dist2_missing', 'P_emaildomain_missing', 
                    'R_emaildomain_missing', 'has_identity']

print(f"\n   {'Feature':<25} {'Missing %':>12} {'Fraud if 1':>12} {'Fraud if 0':>12}")
print(f"   {'-'*63}")

for feature in missing_features:
    pct_ones = df[feature].mean() * 100
    
    fraud_when_1 = df[df[feature] == 1]['isFraud'].mean() * 100
    fraud_when_0 = df[df[feature] == 0]['isFraud'].mean() * 100
    
    print(f"   {feature:<25} {pct_ones:>11.1f}% {fraud_when_1:>11.2f}% {fraud_when_0:>11.2f}%")

print(f"\n   New column count: {len(df.columns)}")
print("   ‚úÖ Missing data flags created successfully!")


üö© STEP 3: MISSING DATA FLAGS

‚úÖ Created missing data flags:

   Feature                      Missing %   Fraud if 1   Fraud if 0
   ---------------------------------------------------------------
   addr1_missing                    11.1%       11.78%        2.46%
   addr2_missing                    11.1%       11.78%        2.46%
   dist1_missing                    59.7%        4.52%        2.00%
   dist2_missing                    93.6%        3.06%        9.92%
   P_emaildomain_missing            16.0%        2.95%        3.60%
   R_emaildomain_missing            76.8%        2.08%        8.18%
   has_identity                     23.8%        7.96%        2.10%

   New column count: 446
   ‚úÖ Missing data flags created successfully!


In [5]:
# =============================================================================
# STEP 4: TRANSACTION AMOUNT FEATURES
# =============================================================================

print("\n" + "=" * 70)
print("üí∞ STEP 4: TRANSACTION AMOUNT FEATURES")
print("=" * 70)

# -----------------------------------------------------------------------------
# WHY TRANSFORM AMOUNT?
# -----------------------------------------------------------------------------
# Raw transaction amounts are heavily skewed:
#   - Most transactions are small ($10-$100)
#   - Few transactions are large ($1000+)
#   - This skew makes it hard for models to learn patterns
#
# We'll create:
#   1. Log-transformed amount (reduces skew)
#   2. Amount decimal part (fraudsters often use round numbers)
#   3. Amount bins (categories: small, medium, large)

# -----------------------------------------------------------------------------
# CREATE AMOUNT FEATURES
# -----------------------------------------------------------------------------

# 1. Log-transformed amount
#    log1p = log(1 + x), the +1 prevents log(0) error
df['amount_log'] = np.log1p(df['TransactionAmt'])

# 2. Decimal part of amount
#    Example: $123.45 ‚Üí 0.45
#    Fraudsters often use round numbers ($100, $200)
df['amount_decimal'] = df['TransactionAmt'] - df['TransactionAmt'].astype(int)

# 3. Is round amount? (ends in .00)
df['is_round_amount'] = (df['amount_decimal'] == 0).astype(int)

# 4. Amount bins (small, medium, large, very large)
df['amount_bin'] = pd.cut(
    df['TransactionAmt'],
    bins=[0, 50, 100, 200, 500, 1000, float('inf')],
    labels=['tiny', 'small', 'medium', 'large', 'very_large', 'huge']
)

# -----------------------------------------------------------------------------
# VERIFY AND VALIDATE
# -----------------------------------------------------------------------------
print("\n‚úÖ Created amount features:")
print(f"\n   amount_log range: [{df['amount_log'].min():.2f}, {df['amount_log'].max():.2f}]")
print(f"   amount_decimal range: [{df['amount_decimal'].min():.2f}, {df['amount_decimal'].max():.2f}]")

# Check if round amounts have different fraud rates
round_fraud = df[df['is_round_amount'] == 1]['isFraud'].mean() * 100
not_round_fraud = df[df['is_round_amount'] == 0]['isFraud'].mean() * 100
print(f"\nüîç Round amount analysis:")
print(f"   Round amounts ($X.00): {round_fraud:.2f}% fraud ({df['is_round_amount'].sum():,} transactions)")
print(f"   Non-round amounts: {not_round_fraud:.2f}% fraud")

# Check fraud by amount bin
print(f"\nüîç Fraud rate by amount bin:")
bin_fraud = df.groupby('amount_bin', observed=True)['isFraud'].agg(['mean', 'count'])
bin_fraud['mean'] = bin_fraud['mean'] * 100

for bin_name, row in bin_fraud.iterrows():
    print(f"   {bin_name:<12}: {row['mean']:>6.2f}% fraud ({int(row['count']):>10,} transactions)")

print(f"\n   New column count: {len(df.columns)}")
print("   ‚úÖ Amount features created successfully!")


üí∞ STEP 4: TRANSACTION AMOUNT FEATURES

‚úÖ Created amount features:

   amount_log range: [0.22, 10.37]
   amount_decimal range: [0.00, 1.00]

üîç Round amount analysis:
   Round amounts ($X.00): 3.57% fraud (305,013 transactions)
   Non-round amounts: 3.43% fraud

üîç Fraud rate by amount bin:
   tiny        :   3.83% fraud (   204,524 transactions)
   small       :   2.92% fraud (   164,095 transactions)
   medium      :   3.05% fraud (   128,041 transactions)
   large       :   4.42% fraud (    71,001 transactions)
   very_large  :   5.31% fraud (    15,612 transactions)
   huge        :   2.46% fraud (     7,267 transactions)

   New column count: 450
   ‚úÖ Amount features created successfully!


In [6]:
# =============================================================================
# STEP 5: CATEGORICAL ENCODING FEATURES
# =============================================================================

print("\n" + "=" * 70)
print("üè∑Ô∏è STEP 5: CATEGORICAL ENCODING FEATURES")
print("=" * 70)

# -----------------------------------------------------------------------------
# WHY ENCODE CATEGORICALS?
# -----------------------------------------------------------------------------
# Models need numbers, not text like "visa" or "mastercard"
# We'll create:
#   1. Binary flags for risky categories (from EDA findings)
#   2. Simple label encoding for other categoricals

# -----------------------------------------------------------------------------
# PRODUCT CATEGORY FLAGS (from EDA: Product C = 11.7% fraud)
# -----------------------------------------------------------------------------
df['is_product_C'] = (df['ProductCD'] == 'C').astype(int)
df['is_product_W'] = (df['ProductCD'] == 'W').astype(int)  # Safest product

# -----------------------------------------------------------------------------
# CARD NETWORK FLAGS (from EDA: Discover = 7.7% fraud)
# -----------------------------------------------------------------------------
df['is_discover'] = (df['card4'] == 'discover').astype(int)
df['is_visa'] = (df['card4'] == 'visa').astype(int)
df['is_mastercard'] = (df['card4'] == 'mastercard').astype(int)
df['is_amex'] = (df['card4'] == 'american express').astype(int)

# -----------------------------------------------------------------------------
# CARD TYPE FLAGS (from EDA: Credit = 6.7% fraud)
# -----------------------------------------------------------------------------
df['is_credit'] = (df['card6'] == 'credit').astype(int)
df['is_debit'] = (df['card6'] == 'debit').astype(int)

# -----------------------------------------------------------------------------
# DEVICE TYPE FLAGS (from Identity analysis)
# -----------------------------------------------------------------------------
df['is_mobile'] = (df['DeviceType'] == 'mobile').astype(int)
df['is_desktop'] = (df['DeviceType'] == 'desktop').astype(int)

# -----------------------------------------------------------------------------
# EMAIL DOMAIN FLAGS (risky domains from EDA)
# -----------------------------------------------------------------------------
risky_emails = ['gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com']
df['is_risky_email'] = df['P_emaildomain'].isin(risky_emails).astype(int)

# -----------------------------------------------------------------------------
# VERIFY AND VALIDATE
# -----------------------------------------------------------------------------
print("\n‚úÖ Created categorical features:")

cat_features = [
    ('is_product_C', 'Product C (risky)'),
    ('is_product_W', 'Product W (safe)'),
    ('is_discover', 'Discover card'),
    ('is_credit', 'Credit card'),
    ('is_debit', 'Debit card'),
    ('is_mobile', 'Mobile device'),
]

print(f"\n   {'Feature':<20} {'% of Data':>12} {'Fraud Rate':>12}")
print(f"   {'-'*46}")

for feature, description in cat_features:
    pct = df[feature].mean() * 100
    fraud_rate = df[df[feature] == 1]['isFraud'].mean() * 100
    print(f"   {feature:<20} {pct:>11.1f}% {fraud_rate:>11.2f}%")

print(f"\n   New column count: {len(df.columns)}")
print("   ‚úÖ Categorical features created successfully!")


üè∑Ô∏è STEP 5: CATEGORICAL ENCODING FEATURES

‚úÖ Created categorical features:

   Feature                 % of Data   Fraud Rate
   ----------------------------------------------
   is_product_C                11.6%       11.69%
   is_product_W                74.5%        2.04%
   is_discover                  1.1%        7.73%
   is_credit                   25.2%        6.68%
   is_debit                    74.5%        2.43%
   is_mobile                    9.4%       10.17%

   New column count: 461
   ‚úÖ Categorical features created successfully!


In [7]:
# =============================================================================
# STEP 6: FEATURE SUMMARY & FINAL CLEANUP
# =============================================================================

print("\n" + "=" * 70)
print("üìã STEP 6: FEATURE SUMMARY & CLEANUP")
print("=" * 70)

# -----------------------------------------------------------------------------
# LIST ALL NEW FEATURES WE CREATED
# -----------------------------------------------------------------------------

new_features = {
    'Time Features': ['hour', 'day_of_week', 'is_night', 'is_risky_hour', 'is_weekend'],
    'Missing Flags': ['addr1_missing', 'addr2_missing', 'dist1_missing', 'dist2_missing', 
                      'P_emaildomain_missing', 'R_emaildomain_missing', 'has_identity'],
    'Amount Features': ['amount_log', 'amount_decimal', 'is_round_amount', 'amount_bin'],
    'Product Features': ['is_product_C', 'is_product_W'],
    'Card Features': ['is_discover', 'is_visa', 'is_mastercard', 'is_amex', 'is_credit', 'is_debit'],
    'Device Features': ['is_mobile', 'is_desktop', 'is_risky_email']
}

print("\n‚úÖ NEW FEATURES CREATED:")
total_new = 0
for category, features in new_features.items():
    print(f"\n   {category}:")
    for f in features:
        if f in df.columns:
            print(f"      ‚Ä¢ {f}")
            total_new += 1
        else:
            print(f"      ‚Ä¢ {f} (NOT FOUND)")

print(f"\n   Total new features: {total_new}")

# -----------------------------------------------------------------------------
# CHECK FOR ANY ISSUES
# -----------------------------------------------------------------------------
print("\n" + "-" * 70)
print("üîç DATA QUALITY CHECK")
print("-" * 70)

# Check for infinite values
inf_counts = np.isinf(df.select_dtypes(include=[np.number])).sum().sum()
print(f"\n   Infinite values: {inf_counts}")

# Check target variable
print(f"   Target (isFraud) nulls: {df['isFraud'].isna().sum()}")
print(f"   Target distribution: {df['isFraud'].value_counts().to_dict()}")

# Memory usage
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"   Memory usage: {memory_mb:.1f} MB")

# -----------------------------------------------------------------------------
# TOP FEATURES BY FRAUD CORRELATION
# -----------------------------------------------------------------------------
print("\n" + "-" * 70)
print("üèÜ TOP 15 FEATURES BY FRAUD CORRELATION")
print("-" * 70)

# Get numerical columns only
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Calculate correlation with fraud
correlations = []
for col in numerical_cols:
    if col != 'isFraud' and df[col].notna().sum() > 0:
        corr = df[col].corr(df['isFraud'])
        if not np.isnan(corr):
            correlations.append({'feature': col, 'correlation': abs(corr)})

corr_df = pd.DataFrame(correlations).sort_values('correlation', ascending=False).head(15)

print(f"\n   {'Feature':<30} {'Correlation':>12}")
print(f"   {'-'*44}")
for _, row in corr_df.iterrows():
    print(f"   {row['feature']:<30} {row['correlation']:>12.4f}")

# -----------------------------------------------------------------------------
# SAVE CHECKPOINT
# -----------------------------------------------------------------------------
print("\n" + "-" * 70)
print("üíæ SAVING CHECKPOINT")
print("-" * 70)

# Save the engineered dataset
checkpoint_path = DATA_PATH / 'df_engineered.pkl'
df.to_pickle(checkpoint_path)
print(f"\n   Saved to: {checkpoint_path}")
print(f"   Shape: {df.shape}")
print("\n   ‚úÖ Feature engineering complete!")


üìã STEP 6: FEATURE SUMMARY & CLEANUP

‚úÖ NEW FEATURES CREATED:

   Time Features:
      ‚Ä¢ hour
      ‚Ä¢ day_of_week
      ‚Ä¢ is_night
      ‚Ä¢ is_risky_hour
      ‚Ä¢ is_weekend

   Missing Flags:
      ‚Ä¢ addr1_missing
      ‚Ä¢ addr2_missing
      ‚Ä¢ dist1_missing
      ‚Ä¢ dist2_missing
      ‚Ä¢ P_emaildomain_missing
      ‚Ä¢ R_emaildomain_missing
      ‚Ä¢ has_identity

   Amount Features:
      ‚Ä¢ amount_log
      ‚Ä¢ amount_decimal
      ‚Ä¢ is_round_amount
      ‚Ä¢ amount_bin

   Product Features:
      ‚Ä¢ is_product_C
      ‚Ä¢ is_product_W

   Card Features:
      ‚Ä¢ is_discover
      ‚Ä¢ is_visa
      ‚Ä¢ is_mastercard
      ‚Ä¢ is_amex
      ‚Ä¢ is_credit
      ‚Ä¢ is_debit

   Device Features:
      ‚Ä¢ is_mobile
      ‚Ä¢ is_desktop
      ‚Ä¢ is_risky_email

   Total new features: 27

----------------------------------------------------------------------
üîç DATA QUALITY CHECK
----------------------------------------------------------------------

   Infi

In [8]:
# =============================================================================
# CHECK OUR ENGINEERED FEATURES' CORRELATIONS
# =============================================================================

print("=" * 70)
print("üìä OUR ENGINEERED FEATURES ‚Äî CORRELATION WITH FRAUD")
print("=" * 70)

our_features = [
    # Time features
    'hour', 'day_of_week', 'is_night', 'is_risky_hour', 'is_weekend',
    # Missing flags
    'addr1_missing', 'addr2_missing', 'dist1_missing', 'dist2_missing',
    'P_emaildomain_missing', 'R_emaildomain_missing', 'has_identity',
    # Amount features
    'amount_log', 'amount_decimal', 'is_round_amount',
    # Product features
    'is_product_C', 'is_product_W',
    # Card features
    'is_discover', 'is_visa', 'is_mastercard', 'is_amex', 'is_credit', 'is_debit',
    # Device features
    'is_mobile', 'is_desktop', 'is_risky_email'
]

# Calculate correlations for our features
our_correlations = []
for col in our_features:
    if col in df.columns:
        corr = df[col].corr(df['isFraud'])
        if not np.isnan(corr):
            our_correlations.append({'feature': col, 'correlation': corr, 'abs_corr': abs(corr)})

our_corr_df = pd.DataFrame(our_correlations).sort_values('abs_corr', ascending=False)

print(f"\n   {'Feature':<30} {'Correlation':>12} {'Direction':>12}")
print(f"   {'-'*56}")
for _, row in our_corr_df.iterrows():
    direction = "üî¥ +fraud" if row['correlation'] > 0 else "üü¢ -fraud"
    print(f"   {row['feature']:<30} {row['correlation']:>+12.4f} {direction:>12}")

# Summary
print(f"\n" + "-" * 70)
print("üí° INTERPRETATION")
print("-" * 70)
print("""
   Positive correlation (+) = Higher value ‚Üí More fraud
   Negative correlation (-) = Higher value ‚Üí Less fraud

   üî¥ Strong fraud indicators (use these!):
      ‚Ä¢ addr_missing, has_identity, is_product_C, is_credit, is_mobile

   üü¢ Strong safety indicators:
      ‚Ä¢ is_product_W, is_debit
""")

üìä OUR ENGINEERED FEATURES ‚Äî CORRELATION WITH FRAUD

   Feature                         Correlation    Direction
   --------------------------------------------------------
   is_product_C                        +0.1614     üî¥ +fraud
   addr2_missing                       +0.1595     üî¥ +fraud
   addr1_missing                       +0.1595     üî¥ +fraud
   R_emaildomain_missing               -0.1401     üü¢ -fraud
   has_identity                        +0.1359     üî¥ +fraud
   is_product_W                        -0.1355     üü¢ -fraud
   is_mobile                           +0.1170     üî¥ +fraud
   is_credit                           +0.1005     üî¥ +fraud
   is_debit                            -0.0998     üü¢ -fraud
   dist2_missing                       -0.0911     üü¢ -fraud
   is_desktop                          +0.0675     üî¥ +fraud
   dist1_missing                       +0.0673     üî¥ +fraud
   amount_decimal                      -0.0488     üü¢ -fraud
   i