In [1]:
!pip install imbalanced-learn==0.12.0 scikit-learn==1.3.2




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
import warnings
import os

warnings.filterwarnings("ignore")
os.makedirs("outputs", exist_ok=True)

In [3]:
print("=" * 60)
print("LOADING AND MERGING DATA")
print("=" * 60)

DATA_PATH = "Data/"  # Update if needed

train_transaction = pd.read_csv(os.path.join(DATA_PATH, "train_transaction.csv"))
train_identity = pd.read_csv(os.path.join(DATA_PATH, "train_identity.csv"))

print(f"Transaction shape: {train_transaction.shape}")
print(f"Identity shape: {train_identity.shape}")

# Merge on TransactionID (left join - not all transactions have identity info)
df = train_transaction.merge(train_identity, on='TransactionID', how='left')
print(f" Merged dataset shape: {df.shape}")

# Store original columns for reference
original_cols = df.columns.tolist()


LOADING AND MERGING DATA
Transaction shape: (590540, 394)
Identity shape: (144233, 41)
 Merged dataset shape: (590540, 434)


In [4]:
print("\n" + "=" * 60)
print(" FEATURE ENGINEERING")
print("=" * 60)

# --- Time-based features from TransactionDT ---
# TransactionDT is seconds from a reference point
df['Transaction_hour'] = (df['TransactionDT'] // 3600) % 24
df['Transaction_day_of_week'] = (df['TransactionDT'] // 86400) % 7
df['Transaction_day'] = df['TransactionDT'] // 86400

# Is weekend?
df['is_weekend'] = (df['Transaction_day_of_week'] >= 5).astype(int)

# Is night transaction (midnight to 6am)?
df['is_night'] = ((df['Transaction_hour'] >= 0) & (df['Transaction_hour'] < 6)).astype(int)

print("Created time-based features:")
print("   - Transaction_hour (0-23)")
print("   - Transaction_day_of_week (0-6)")
print("   - Transaction_day")
print("   - is_weekend (0/1)")
print("   - is_night (0/1)")

# Transaction Amount features
df['TransactionAmt_log'] = np.log1p(df['TransactionAmt'])
df['TransactionAmt_decimal'] = (df['TransactionAmt'] - df['TransactionAmt'].astype(int))
df['TransactionAmt_is_round'] = (df['TransactionAmt_decimal'] == 0).astype(int)

print("Created amount-based features:")
print("   - TransactionAmt_log")
print("   - TransactionAmt_decimal")
print("   - TransactionAmt_is_round")

# Email domain features
def get_email_suffix(email):
    if pd.isna(email):
        return 'missing'
    if '.com' in str(email):
        return 'com'
    elif '.net' in str(email):
        return 'net'
    elif '.org' in str(email):
        return 'org'
    else:
        return 'other'

df['P_email_suffix'] = df['P_emaildomain'].apply(get_email_suffix)
df['R_email_suffix'] = df['R_emaildomain'].apply(get_email_suffix)
df['email_match'] = (df['P_emaildomain'] == df['R_emaildomain']).astype(int)

print("Created email features:")
print("   - P_email_suffix, R_email_suffix")
print("   - email_match")

# Card features
df['card1_card2'] = df['card1'].astype(str) + '_' + df['card2'].astype(str)
df['addr1_addr2'] = df['addr1'].astype(str) + '_' + df['addr2'].astype(str)

print(" Created interaction features:")
print("   - card1_card2, addr1_addr2")

print(f"\n Total features after engineering: {df.shape[1]}")


 FEATURE ENGINEERING
Created time-based features:
   - Transaction_hour (0-23)
   - Transaction_day_of_week (0-6)
   - Transaction_day
   - is_weekend (0/1)
   - is_night (0/1)
Created amount-based features:
   - TransactionAmt_log
   - TransactionAmt_decimal
   - TransactionAmt_is_round
Created email features:
   - P_email_suffix, R_email_suffix
   - email_match
 Created interaction features:
   - card1_card2, addr1_addr2

 Total features after engineering: 447


In [5]:
print("\n" + "=" * 60)
print("MISSING VALUE ANALYSIS")
print("=" * 60)

missing_pct = df.isnull().mean().sort_values(ascending=False)
missing_summary = pd.DataFrame({
    'missing_count': df.isnull().sum(),
    'missing_percent': missing_pct * 100
}).sort_values('missing_percent', ascending=False)

print(f"Features with >90% missing: {(missing_pct > 0.9).sum()}")
print(f"Features with >50% missing: {(missing_pct > 0.5).sum()}")
print(f"Features with >0% missing: {(missing_pct > 0).sum()}")
print(f"Features with 0% missing: {(missing_pct == 0).sum()}")

# Save missing summary
missing_summary.to_csv("outputs/missing_analysis.csv")
print(" Saved: outputs/missing_analysis.csv")



MISSING VALUE ANALYSIS
Features with >90% missing: 12
Features with >50% missing: 214
Features with >0% missing: 414
Features with 0% missing: 33
 Saved: outputs/missing_analysis.csv


In [6]:
print("\n" + "=" * 60)
print("REMOVING HIGH-MISSING COLUMNS (>80% missing)")
print("=" * 60)

MISSING_THRESHOLD = 0.80
cols_to_drop = missing_pct[missing_pct > MISSING_THRESHOLD].index.tolist()

# Keep TransactionID and isFraud regardless
cols_to_drop = [c for c in cols_to_drop if c not in ['TransactionID', 'isFraud']]

print(f"Dropping {len(cols_to_drop)} columns with >{MISSING_THRESHOLD*100}% missing values")
df = df.drop(columns=cols_to_drop)
print(f" Shape after dropping high-missing columns: {df.shape}")


REMOVING HIGH-MISSING COLUMNS (>80% missing)
Dropping 74 columns with >80.0% missing values
 Shape after dropping high-missing columns: (590540, 373)


In [7]:
print("\n" + "=" * 60)
print("5SEPARATING NUMERIC AND CATEGORICAL FEATURES")
print("=" * 60)

# Identify column types
exclude_cols = ['TransactionID', 'isFraud']
feature_cols = [c for c in df.columns if c not in exclude_cols]

numeric_cols = df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = df[feature_cols].select_dtypes(exclude=[np.number]).columns.tolist()

print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")


5SEPARATING NUMERIC AND CATEGORICAL FEATURES
Numeric features: 341
Categorical features: 30


In [8]:
print("\n" + "=" * 60)
print(" HANDLING MISSING VALUES")
print("=" * 60)

# --- Numeric: Impute with median ---
print("Imputing numeric columns with median...")
numeric_imputer = SimpleImputer(strategy='median')
df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
print(f"Imputed {len(numeric_cols)} numeric columns")

# --- Categorical: Impute with 'Unknown' ---
print("Imputing categorical columns with 'Unknown'...")
for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')
print(f"Imputed {len(categorical_cols)} categorical columns")

# Verify no missing values remain
remaining_missing = df.isnull().sum().sum()
print(f"\n Remaining missing values: {remaining_missing}")


 HANDLING MISSING VALUES
Imputing numeric columns with median...
Imputed 341 numeric columns
Imputing categorical columns with 'Unknown'...
Imputed 30 categorical columns

 Remaining missing values: 0


In [9]:
print("\n" + "=" * 60)
print("7ENCODING CATEGORICAL VARIABLES")
print("=" * 60)

# Use Label Encoding for high-cardinality columns
# Use One-Hot for low-cardinality columns
LOW_CARDINALITY_THRESHOLD = 10

low_card_cols = [c for c in categorical_cols if df[c].nunique() <= LOW_CARDINALITY_THRESHOLD]
high_card_cols = [c for c in categorical_cols if df[c].nunique() > LOW_CARDINALITY_THRESHOLD]

print(f"Low-cardinality columns (One-Hot): {len(low_card_cols)}")
print(f"High-cardinality columns (Label Encoding): {len(high_card_cols)}")

# Label encode high-cardinality columns
label_encoders = {}
for col in high_card_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

print(f"Label encoded {len(high_card_cols)} columns")

# One-hot encode low-cardinality columns
df = pd.get_dummies(df, columns=low_card_cols, drop_first=True)
print(f"One-hot encoded {len(low_card_cols)} columns")

print(f"\n Shape after encoding: {df.shape}")


7ENCODING CATEGORICAL VARIABLES
Low-cardinality columns (One-Hot): 24
High-cardinality columns (Label Encoding): 6
Label encoded 6 columns
One-hot encoded 24 columns

 Shape after encoding: (590540, 406)


In [10]:
print("\n" + "=" * 60)
print(" TRAIN/TEST SPLIT")
print("=" * 60)

# Separate features and target
X = df.drop(columns=['TransactionID', 'isFraud'])
y = df['isFraud']

# Stratified split to maintain fraud ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTraining fraud rate: {y_train.mean()*100:.2f}%")
print(f"Test fraud rate: {y_test.mean()*100:.2f}%")


 TRAIN/TEST SPLIT
Training set: 472432 samples
Test set: 118108 samples

Training fraud rate: 3.50%
Test fraud rate: 3.50%


In [11]:
print("\n" + "=" * 60)
print("HANDLING CLASS IMBALANCE WITH SMOTE")
print("=" * 60)

print(f"Before SMOTE:")
print(f"  Non-Fraud: {(y_train == 0).sum()}")
print(f"  Fraud: {(y_train == 1).sum()}")
print(f"  Ratio: {(y_train == 0).sum() / (y_train == 1).sum():.1f}:1")

# Apply SMOTE only to training data
smote = SMOTE(random_state=42, sampling_strategy=0.5)  # 1:2 ratio fraud:non-fraud
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"\nAfter SMOTE:")
print(f"  Non-Fraud: {(y_train_balanced == 0).sum()}")
print(f"  Fraud: {(y_train_balanced == 1).sum()}")
print(f"  Ratio: {(y_train_balanced == 0).sum() / (y_train_balanced == 1).sum():.1f}:1")


HANDLING CLASS IMBALANCE WITH SMOTE
Before SMOTE:
  Non-Fraud: 455902
  Fraud: 16530
  Ratio: 27.6:1

After SMOTE:
  Non-Fraud: 455902
  Fraud: 227951
  Ratio: 2.0:1


In [12]:
print("\n" + "=" * 60)
print("FEATURE SCALING")
print("=" * 60)

scaler = StandardScaler()

# Fit on training data, transform both
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_balanced),
    columns=X_train_balanced.columns
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns
)

print("Applied StandardScaler")
print(f"   Training mean (sample): {X_train_scaled.iloc[:, 0].mean():.4f}")
print(f"   Training std (sample): {X_train_scaled.iloc[:, 0].std():.4f}")


FEATURE SCALING
Applied StandardScaler
   Training mean (sample): 0.0000
   Training std (sample): 1.0000


In [13]:
print("\n" + "=" * 60)
print(" SAVING PROCESSED DATA")
print("=" * 60)

# Save scaled data (for models sensitive to scale like Logistic Regression, SVM)
X_train_scaled.to_csv("outputs/X_train_scaled.csv", index=False)
X_test_scaled.to_csv("outputs/X_test_scaled.csv", index=False)

# Save unscaled balanced data (for tree-based models)
X_train_balanced.to_csv("outputs/X_train_balanced.csv", index=False)
X_test.to_csv("outputs/X_test.csv", index=False)

# Save labels
y_train_balanced.to_csv("outputs/y_train_balanced.csv", index=False)
y_test.to_csv("outputs/y_test.csv", index=False)

# Save original (unbalanced) training data too
X_train.to_csv("outputs/X_train_original.csv", index=False)
y_train.to_csv("outputs/y_train_original.csv", index=False)

# Save feature names
feature_names = pd.DataFrame({'feature': X_train.columns.tolist()})
feature_names.to_csv("outputs/feature_names.csv", index=False)

print("Saved files:")
print("   - outputs/X_train_scaled.csv (SMOTE + scaled)")
print("   - outputs/X_test_scaled.csv (scaled)")
print("   - outputs/X_train_balanced.csv (SMOTE, unscaled)")
print("   - outputs/X_test.csv (unscaled)")
print("   - outputs/y_train_balanced.csv")
print("   - outputs/y_test.csv")
print("   - outputs/X_train_original.csv (no SMOTE)")
print("   - outputs/y_train_original.csv")
print("   - outputs/feature_names.csv")


 SAVING PROCESSED DATA
Saved files:
   - outputs/X_train_scaled.csv (SMOTE + scaled)
   - outputs/X_test_scaled.csv (scaled)
   - outputs/X_train_balanced.csv (SMOTE, unscaled)
   - outputs/X_test.csv (unscaled)
   - outputs/y_train_balanced.csv
   - outputs/y_test.csv
   - outputs/X_train_original.csv (no SMOTE)
   - outputs/y_train_original.csv
   - outputs/feature_names.csv


In [14]:
print("\n" + "=" * 60)
print("PREPROCESSING SUMMARY")
print("=" * 60)

summary = f"""
Dataset Overview:
-----------------
Original features: 434
Final features: {X_train.shape[1]}
Features dropped (high missing): {len(cols_to_drop)}

Samples:
--------
Total samples: {len(df)}
Training samples (original): {len(X_train)}
Training samples (after SMOTE): {len(X_train_balanced)}
Test samples: {len(X_test)}

Class Distribution:
-------------------
Original fraud rate: 3.50%
Training fraud rate (after SMOTE): {y_train_balanced.mean()*100:.2f}%
Test fraud rate: {y_test.mean()*100:.2f}%

Feature Engineering:
--------------------
- Time features: Transaction_hour, Transaction_day_of_week, is_weekend, is_night
- Amount features: TransactionAmt_log, TransactionAmt_decimal, TransactionAmt_is_round
- Email features: P_email_suffix, R_email_suffix, email_match
- Interaction features: card1_card2, addr1_addr2

Processing Steps:
-----------------
1.  Merged transaction + identity data
2.  Created {8} engineered features
3.  Removed {len(cols_to_drop)} high-missing columns (>80%)
4.  Imputed numeric (median) and categorical (Unknown)
5.  Label encoded high-cardinality categoricals
6.  One-hot encoded low-cardinality categoricals
7.  Train/test split (80/20, stratified)
8.  Applied SMOTE for class balancing
9.  Scaled features with StandardScaler
"""

print(summary)

# Save summary
with open("outputs/preprocessing_summary.txt", "w") as f:
    f.write(summary)
print("Saved: outputs/preprocessing_summary.txt")



PREPROCESSING SUMMARY

Dataset Overview:
-----------------
Original features: 434
Final features: 404
Features dropped (high missing): 74

Samples:
--------
Total samples: 590540
Training samples (original): 472432
Training samples (after SMOTE): 683853
Test samples: 118108

Class Distribution:
-------------------
Original fraud rate: 3.50%
Training fraud rate (after SMOTE): 33.33%
Test fraud rate: 3.50%

Feature Engineering:
--------------------
- Time features: Transaction_hour, Transaction_day_of_week, is_weekend, is_night
- Amount features: TransactionAmt_log, TransactionAmt_decimal, TransactionAmt_is_round
- Email features: P_email_suffix, R_email_suffix, email_match
- Interaction features: card1_card2, addr1_addr2

Processing Steps:
-----------------
1.  Merged transaction + identity data
2.  Created 8 engineered features
3.  Removed 74 high-missing columns (>80%)
4.  Imputed numeric (median) and categorical (Unknown)
5.  Label encoded high-cardinality categoricals
6.  One-hot e