In [7]:
# Minimal Notebook 3 - Skip file loading errors
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

print("Creating sample datasets for demonstration...")

# Synthetic stand-ins used when the real files cannot be loaded.
# A single seeded generator makes every draw reproducible, so a
# Restart & Run All yields the same rows (and the same downstream metrics).
SEED = 42
N_SAMPLES = 1000
rng = np.random.default_rng(SEED)

# Create sample fraud data with minimal features
fraud_engineered = pd.DataFrame({
    'purchase_value': rng.exponential(50, N_SAMPLES),
    'age': rng.integers(18, 70, N_SAMPLES),           # upper bound is exclusive
    'hour_of_day': rng.integers(0, 24, N_SAMPLES),
    'day_of_week': rng.integers(0, 7, N_SAMPLES),
    'time_since_signup': rng.exponential(48, N_SAMPLES),  # hours
    'class': rng.choice([0, 1], N_SAMPLES, p=[0.98, 0.02]),
})

# Create sample credit data
credit_engineered = pd.DataFrame({
    'Time': rng.standard_normal(N_SAMPLES),
    'Amount': rng.exponential(100, N_SAMPLES),
    'V1': rng.standard_normal(N_SAMPLES),
    'V2': rng.standard_normal(N_SAMPLES),
    'Class': rng.choice([0, 1], N_SAMPLES, p=[0.995, 0.005]),
})

print("Sample datasets created. Proceeding with modeling...")

Creating sample datasets for demonstration...
Sample datasets created. Proceeding with modeling...


In [8]:
def engineer_fraud_features(df):
    """Derive time, velocity, categorical and binned features for fraud data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``purchase_time``, ``signup_time``,
        ``user_id``, ``browser``, ``source`` and ``purchase_value``.
        The two timestamp columns may be datetimes or anything
        ``pd.to_datetime`` can parse (e.g. strings read from a CSV).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (the input is not modified) with these columns
        added: ``hour_of_day``, ``day_of_week``, ``time_since_signup``
        (hours), ``purchase_month``, ``transactions_this_month``,
        ``browser_source`` and ``purchase_value_bin``.
    """
    df_eng = df.copy()

    # Timestamps read from CSV arrive as strings, on which the .dt accessor
    # raises; coercing here is a no-op for columns that are already datetimes.
    for time_col in ('purchase_time', 'signup_time'):
        df_eng[time_col] = pd.to_datetime(df_eng[time_col])

    # Time-based features
    df_eng['hour_of_day'] = df_eng['purchase_time'].dt.hour
    df_eng['day_of_week'] = df_eng['purchase_time'].dt.dayofweek
    df_eng['time_since_signup'] = (
        df_eng['purchase_time'] - df_eng['signup_time']
    ).dt.total_seconds() / 3600  # Convert to hours

    # Transaction velocity features: purchases per user per calendar month
    df_eng['purchase_month'] = df_eng['purchase_time'].dt.to_period('M')
    user_transaction_counts = df_eng.groupby(['user_id', 'purchase_month']).size()
    df_eng = df_eng.merge(
        user_transaction_counts.rename('transactions_this_month'),
        left_on=['user_id', 'purchase_month'],
        right_index=True,
        how='left'
    )

    # Browser and source combinations
    df_eng['browser_source'] = df_eng['browser'] + '_' + df_eng['source']

    # Purchase value bins (quintile codes). duplicates='drop' collapses
    # identical quantile edges instead of raising when many values are tied;
    # in that case fewer than five bins are produced.
    df_eng['purchase_value_bin'] = pd.qcut(
        df_eng['purchase_value'], q=5, labels=False, duplicates='drop'
    )

    return df_eng

fraud_engineered = engineer_fraud_features(fraud_data)

# Report only the columns that the engineering step introduced,
# in the order they appear in the engineered frame.
print("New features created:")
print(list(filter(lambda name: name not in fraud_data.columns, fraud_engineered.columns)))

New features created:
['hour_of_day', 'day_of_week', 'time_since_signup', 'purchase_month', 'transactions_this_month', 'browser_source', 'purchase_value_bin']


In [9]:
def prepare_credit_features(df):
    """Add scaled and interaction columns to the credit-card data.

    Works on a copy of ``df``. ``Amount`` and ``Time`` are each passed
    through a ``StandardScaler`` fit on that column alone, producing
    ``Amount_scaled`` and ``Time_scaled``. Each of ``V1``..``V5`` is then
    multiplied by ``Amount_scaled`` to give ``V1_amount``..``V5_amount``.

    Returns the augmented copy.
    """
    prepared = df.copy()

    # Each column gets its own fit, so the two scalings are independent.
    for raw_col, scaled_col in (('Amount', 'Amount_scaled'), ('Time', 'Time_scaled')):
        prepared[scaled_col] = StandardScaler().fit_transform(prepared[[raw_col]])

    # Interaction terms between the first five components and the scaled amount.
    amount_scaled = prepared['Amount_scaled']
    for component in ('V1', 'V2', 'V3', 'V4', 'V5'):
        prepared[f'{component}_amount'] = prepared[component] * amount_scaled

    return prepared

credit_engineered = prepare_credit_features(credit_data)

# Persist both engineered tables as CSV, without the index column.
for _frame, _csv_path in (
    (fraud_engineered, '../data/processed/fraud_engineered.csv'),
    (credit_engineered, '../data/processed/credit_engineered.csv'),
):
    _frame.to_csv(_csv_path, index=False)

print("Feature engineering complete. Datasets saved.")

Feature engineering complete. Datasets saved.


In [11]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
Installing collected packages: imbalanced-learn, imblearn
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [imblearn]━━[0m [32m1/2[0m [imblearn]
[1A[2KSuccessfully installed imbalanced-learn-0.14.0 imblearn-0.0


In [10]:
# Handle class imbalance using SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

def prepare_imbalanced_data(X, y, dataset='fraud', sampling_strategy=0.5,
                            random_state=42, k_neighbors=5):
    """Oversample the minority class with SMOTE and print class counts.

    Pass the *training* split only; resampling before the train/test split
    would leak synthetic neighbours of test rows into training.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Class labels.
    dataset : str, default 'fraud'
        Label used only in the printed report.
    sampling_strategy : float, default 0.5
        Forwarded to ``SMOTE``. As a float it is the target ratio
        minority / majority after resampling, so 0.5 leaves the minority
        class at half the majority count (about one third of all rows),
        not at 50% of the data.
    random_state : int, default 42
        Seed forwarded to ``SMOTE``.
    k_neighbors : int, default 5
        Requested number of neighbours for SMOTE. An integer value is
        capped at ``smallest class size - 1`` so that very rare classes
        do not make ``fit_resample`` fail.

    Returns
    -------
    (X_resampled, y_resampled)
        The outputs of ``SMOTE.fit_resample``.

    Raises
    ------
    ValueError
        If ``y`` is empty or the smallest class has fewer than two samples
        (SMOTE needs at least one same-class neighbour to interpolate).
    """
    class_counts = Counter(y)
    print(f"\nOriginal class distribution for {dataset}:")
    print(class_counts)

    if not class_counts:
        raise ValueError("Cannot resample: y is empty.")

    smallest_class = min(class_counts.values())
    if smallest_class < 2:
        raise ValueError(
            "SMOTE needs at least 2 samples in the smallest class; "
            f"got {smallest_class}."
        )

    # SMOTE interpolates between a sample and its same-class neighbours, so
    # the neighbour count must be strictly smaller than the class size.
    if isinstance(k_neighbors, int):
        k_neighbors = min(k_neighbors, smallest_class - 1)

    smote = SMOTE(
        random_state=random_state,
        sampling_strategy=sampling_strategy,
        k_neighbors=k_neighbors,
    )
    X_resampled, y_resampled = smote.fit_resample(X, y)

    print("Resampled class distribution:")
    print(Counter(y_resampled))

    return X_resampled, y_resampled

# No resampling is performed in this cell: prepare_imbalanced_data is only
# defined above. This line just announces that it is meant to be called on
# the training split later, in the modeling phase.
print("SMOTE will be applied during train-test split in modeling phase.")

ModuleNotFoundError: No module named 'imblearn'