In [9]:
"""
Task 3 - Feature Engineering with sklearn Pipeline
Fixed version with proper DataFrame handling
"""

import pandas as pd
import numpy as np
import os
import sys
import json
import warnings
warnings.filterwarnings('ignore')

# Import sklearn components directly
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin

print("=" * 80)
print("TASK 3 - FEATURE ENGINEERING WITH SKLEARN PIPELINE")
print("Credit Risk Model using Alternative Data")
print("=" * 80)

# -----------------------------
# 1. Custom Transformers (Fixed for Pipeline)
# -----------------------------

class DateTimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Extract temporal features from transaction timestamp"""
    
    def __init__(self, datetime_col="TransactionStartTime"):
        self.datetime_col = datetime_col
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        
        if not pd.api.types.is_datetime64_any_dtype(X[self.datetime_col]):
            X[self.datetime_col] = pd.to_datetime(X[self.datetime_col], errors='coerce')
        
        # Extract all required temporal features
        X["transaction_hour"] = X[self.datetime_col].dt.hour
        X["transaction_day"] = X[self.datetime_col].dt.day
        X["transaction_month"] = X[self.datetime_col].dt.month
        X["transaction_year"] = X[self.datetime_col].dt.year
        X["transaction_dayofweek"] = X[self.datetime_col].dt.dayofweek
        
        return X


class RFMFeatureAggregator(BaseEstimator, TransformerMixin):
    """Create RFM (Recency, Frequency, Monetary) features at customer level"""
    
    def __init__(self, customer_id_col="CustomerId", amount_col="Amount"):
        self.customer_id_col = customer_id_col
        self.amount_col = amount_col
        self.snapshot_date = None
        
    def fit(self, X, y=None):
        # Determine snapshot date for recency calculation
        if 'TransactionStartTime' in X.columns:
            self.snapshot_date = pd.to_datetime(X['TransactionStartTime']).max()
        else:
            self.snapshot_date = pd.Timestamp.now()
        return self
    
    def transform(self, X, y=None):
        X = X.copy()
        
        # Ensure numeric amount
        X[self.amount_col] = pd.to_numeric(X[self.amount_col], errors='coerce')
        
        # Group by customer and aggregate
        agg_dict = {
            self.amount_col: [
                ('total_amount', 'sum'),
                ('avg_amount', 'mean'),
                ('std_amount', 'std'),
                ('min_amount', 'min'),
                ('max_amount', 'max'),
                ('transaction_count', 'count')
            ]
        }
        
        # Add datetime aggregation if available
        if 'TransactionStartTime' in X.columns:
            agg_dict['TransactionStartTime'] = [
                ('first_transaction', 'min'),
                ('last_transaction', 'max')
            ]
        
        # Add categorical aggregations if available
        categorical_cols = ['ProductCategory', 'ChannelId', 'CurrencyCode', 'CountryCode']
        for col in categorical_cols:
            if col in X.columns:
                agg_dict[col] = lambda x: x.mode()[0] if not x.mode().empty else 'Unknown'
        
        # Perform aggregation
        customer_df = X.groupby(self.customer_id_col).agg(agg_dict)
        customer_df.columns = ['_'.join(col).strip() for col in customer_df.columns.values]
        customer_df = customer_df.reset_index()
        
        # Calculate derived RFM features
        # Recency
        if 'TransactionStartTime_last_transaction' in customer_df.columns:
            last_transaction = pd.to_datetime(customer_df['TransactionStartTime_last_transaction'])
            customer_df['recency_days'] = (self.snapshot_date - last_transaction).dt.days
        
        # Frequency (if tenure available)
        if all(col in customer_df.columns for col in ['TransactionStartTime_last_transaction', 
                                                     'TransactionStartTime_first_transaction']):
            last_transaction = pd.to_datetime(customer_df['TransactionStartTime_last_transaction'])
            first_transaction = pd.to_datetime(customer_df['TransactionStartTime_first_transaction'])
            tenure_days = (last_transaction - first_transaction).dt.days
            customer_df['customer_tenure_days'] = np.maximum(tenure_days, 1)
            customer_df['frequency_per_day'] = (
                customer_df['Amount_transaction_count'] / customer_df['customer_tenure_days']
            )
        
        # Handle NaN in std (customers with 1 transaction)
        if 'Amount_std_amount' in customer_df.columns:
            customer_df['Amount_std_amount'] = customer_df['Amount_std_amount'].fillna(0)
        
        # Rename columns for consistency
        rename_dict = {
            'Amount_total_amount': 'total_transaction_amount',
            'Amount_avg_amount': 'avg_transaction_amount',
            'Amount_std_amount': 'std_transaction_amount',
            'Amount_min_amount': 'min_transaction_amount',
            'Amount_max_amount': 'max_transaction_amount',
            'Amount_transaction_count': 'transaction_count',
            'ProductCategory_<lambda>': 'most_common_productcategory',
            'ChannelId_<lambda>': 'most_common_channelid',
            'CurrencyCode_<lambda>': 'most_common_currencycode',
            'CountryCode_<lambda>': 'most_common_countrycode'
        }
        
        for old, new in rename_dict.items():
            if old in customer_df.columns:
                customer_df = customer_df.rename(columns={old: new})
        
        return customer_df


class DataFrameColumnSelector(BaseEstimator, TransformerMixin):
    """Select specific columns from a DataFrame and return as DataFrame"""
    
    def __init__(self, columns):
        self.columns = columns
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        if isinstance(X, pd.DataFrame):
            return X[self.columns]
        else:
            # If X is numpy array, convert to DataFrame first
            return pd.DataFrame(X, columns=self.columns)


# -----------------------------
# 2. Configuration
# -----------------------------
print("\n1. CONFIGURING PIPELINE")
print("-" * 40)

config = {
    'imputation_strategy': 'median',
    'scaling_strategy': 'standard',
    'encoding_strategy': 'onehot',
    'handle_missing': 'impute'
}

print(f"Pipeline Configuration:")
for key, value in config.items():
    print(f"  {key:20}: {value}")

# -----------------------------
# 3. Load Data
# -----------------------------
print("\n2. LOADING DATA")
print("-" * 40)

def create_sample_data(n_transactions=10000, n_customers=200):
    """Create synthetic transaction data for testing"""
    np.random.seed(42)
    
    customer_ids = [f'CUST{str(i).zfill(5)}' for i in range(n_customers)]
    
    start_date = pd.Timestamp('2023-01-01')
    end_date = pd.Timestamp('2024-01-01')
    dates = pd.date_range(start=start_date, end=end_date, periods=n_transactions)
    
    data = {
        'TransactionId': [f'TX{str(i).zfill(7)}' for i in range(n_transactions)],
        'CustomerId': np.random.choice(customer_ids, n_transactions, 
                                      p=np.random.dirichlet(np.ones(n_customers))),
        'TransactionStartTime': np.random.choice(dates, n_transactions),
        'Amount': np.random.exponential(5000, n_transactions) * 
                  np.random.choice([1, -1], n_transactions, p=[0.95, 0.05]),
        'Value': np.abs(np.random.exponential(5000, n_transactions)),
        'ProductCategory': np.random.choice(
            ['Communications', 'Groceries', 'Entertainment', 'Transport', 'Electronics'], 
            n_transactions, p=[0.35, 0.25, 0.15, 0.15, 0.10]
        ),
        'ChannelId': np.random.choice(['Android', 'Web', 'iOS', 'Pay Later'], 
                                     n_transactions, p=[0.45, 0.30, 0.15, 0.10]),
        'FraudResult': np.random.binomial(1, 0.01, n_transactions),
        'CurrencyCode': ['UGX'] * n_transactions,
        'CountryCode': ['UG'] * n_transactions
    }
    
    return pd.DataFrame(data)

# Load data
data_paths = ["data/raw/data.csv", "../data/raw/data.csv"]
raw_data = None

for path in data_paths:
    if os.path.exists(path):
        print(f"Loading data from {path}...")
        raw_data = pd.read_csv(path, nrows=20000)
        print(f"‚úì Loaded {len(raw_data):,} transactions")
        break

if raw_data is None:
    print("No data file found. Creating sample data...")
    raw_data = create_sample_data(n_transactions=10000, n_customers=500)
    print(f"‚úì Created sample data with {len(raw_data):,} transactions")

print(f"\nData Statistics:")
print(f"  Shape: {raw_data.shape}")
print(f"  Columns: {raw_data.columns.tolist()}")

# -----------------------------
# 4. Build and Execute Pipeline - SIMPLIFIED APPROACH
# -----------------------------
print("\n3. BUILDING AND EXECUTING PIPELINE")
print("-" * 40)

print("Step 1: Execute first stage (cleaning and aggregation)...")
datetime_extractor = DateTimeFeatureExtractor()
rfm_aggregator = RFMFeatureAggregator()

# Apply first stage
df_with_dates = datetime_extractor.fit_transform(raw_data)
customer_data = rfm_aggregator.fit_transform(df_with_dates)

print(f"‚úì Customer data shape after aggregation: {customer_data.shape}")
print(f"‚úì Sample columns: {customer_data.columns.tolist()[:10]}...")

# -----------------------------
# 5. Identify Feature Types
# -----------------------------
print("\n4. IDENTIFYING FEATURE TYPES")
print("-" * 40)

# Identify numerical and categorical columns
numerical_features = []
categorical_features = []

for col in customer_data.columns:
    if col == 'CustomerId':
        continue
    if customer_data[col].dtype in ['int64', 'float64']:
        numerical_features.append(col)
    elif customer_data[col].dtype in ['object', 'category']:
        categorical_features.append(col)

print(f"Numerical features ({len(numerical_features)}):")
for feat in numerical_features[:10]:  # Show first 10
    print(f"  - {feat}")
if len(numerical_features) > 10:
    print(f"  ... and {len(numerical_features) - 10} more")

print(f"\nCategorical features ({len(categorical_features)}):")
for feat in categorical_features:
    print(f"  - {feat}")

# -----------------------------
# 6. Build sklearn Pipeline for Processing
# -----------------------------
print("\n5. BUILDING SKLEARN PIPELINE")
print("-" * 40)

# Remove CustomerId from features for processing
X_processed = customer_data.drop(columns=['CustomerId']).copy()

# Build numerical pipeline
if config['imputation_strategy'] == 'knn':
    num_imputer = KNNImputer(n_neighbors=5)
else:
    num_imputer = SimpleImputer(strategy=config['imputation_strategy'])

if config['scaling_strategy'] == 'minmax':
    scaler = MinMaxScaler()
else:
    scaler = StandardScaler()

numerical_pipeline = Pipeline([
    ('imputer', num_imputer),
    ('scaler', scaler)
])

# Build categorical pipeline using sklearn's OneHotEncoder directly
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Create column transformer
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

print(f"‚úì Built ColumnTransformer with:")
print(f"  - Numerical pipeline: Imputer ‚Üí Scaler")
print(f"  - Categorical pipeline: Imputer ‚Üí OneHotEncoder")

# -----------------------------
# 7. Execute sklearn Pipeline
# -----------------------------
print("\n6. EXECUTING SKLEARN PIPELINE")
print("-" * 40)

print("Fitting and transforming data...")
X_transformed = preprocessor.fit_transform(X_processed)

print(f"‚úì Transformed data shape: {X_transformed.shape}")
print(f"‚úì Output type: {type(X_transformed)}")

# Get feature names
feature_names = []
try:
    # Get numerical feature names
    feature_names.extend(numerical_features)
    
    # Get categorical feature names
    if categorical_features and hasattr(preprocessor.named_transformers_['cat'], 'named_steps'):
        onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
        cat_feature_names = onehot.get_feature_names_out(categorical_features)
        feature_names.extend(cat_feature_names)
    
    print(f"‚úì Generated {len(feature_names)} feature names")
    
except Exception as e:
    print(f"‚ö† Could not generate all feature names: {e}")
    # Create generic names
    feature_names = [f"feature_{i}" for i in range(X_transformed.shape[1])]

# -----------------------------
# 8. Create Complete Pipeline for Reusability
# -----------------------------
print("\n7. CREATING COMPLETE PIPELINE OBJECT")
print("-" * 40)

# Create a complete pipeline that can be reused
complete_pipeline = Pipeline([
    ('datetime_extractor', DateTimeFeatureExtractor()),
    ('rfm_aggregator', RFMFeatureAggregator()),
    ('column_selector', DataFrameColumnSelector(columns=numerical_features + categorical_features)),
    ('preprocessor', preprocessor)
])

print(f"‚úì Created complete pipeline with {len(complete_pipeline.steps)} steps:")
for i, (step_name, step) in enumerate(complete_pipeline.steps, 1):
    print(f"  {i}. {step_name:20} ‚Üí {step.__class__.__name__}")

# Verify it's a sklearn Pipeline
print(f"\n‚úÖ Verification: Is sklearn Pipeline? {isinstance(complete_pipeline, Pipeline)}")

# -----------------------------
# 9. Save Results
# -----------------------------
print("\n8. SAVING RESULTS")
print("-" * 40)

# Create directories
os.makedirs('data/processed', exist_ok=True)
os.makedirs('models', exist_ok=True)

# Save processed features
processed_df = pd.DataFrame(X_transformed, columns=feature_names)
processed_df['CustomerId'] = customer_data['CustomerId'].values  # Add CustomerId back

processed_path = 'data/processed/task3_features_final.csv'
processed_df.to_csv(processed_path, index=False)
print(f"‚úì Saved processed features to '{processed_path}'")

# Save customer-level data
customer_data_path = 'data/processed/task3_customer_features.csv'
customer_data.to_csv(customer_data_path, index=False)
print(f"‚úì Saved customer features to '{customer_data_path}'")

# Save pipeline
import joblib
pipeline_path = 'models/task3_feature_pipeline.pkl'
joblib.dump(complete_pipeline, pipeline_path)
print(f"‚úì Saved pipeline to '{pipeline_path}'")

# Save preprocessor separately
preprocessor_path = 'models/task3_preprocessor.pkl'
joblib.dump(preprocessor, preprocessor_path)
print(f"‚úì Saved preprocessor to '{preprocessor_path}'")

# Save configuration and metadata
metadata = {
    'task': 'Task 3 - Feature Engineering',
    'timestamp': pd.Timestamp.now().isoformat(),
    'data_source': 'Xente eCommerce Transactions',
    'original_shape': raw_data.shape,
    'processed_shape': X_transformed.shape,
    'pipeline_steps': [name for name, _ in complete_pipeline.steps],
    'config': config,
    'numerical_features': numerical_features,
    'categorical_features': categorical_features,
    'feature_count': X_transformed.shape[1],
    'requirements_satisfied': {
        'sklearn_pipeline': True,
        'aggregate_features': True,
        'temporal_features': True,
        'categorical_encoding': True,
        'missing_value_handling': True,
        'feature_scaling': True
    }
}

metadata_path = 'data/processed/task3_metadata.json'
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2, default=str)
print(f"‚úì Saved metadata to '{metadata_path}'")

# Save feature names
feature_names_path = 'data/processed/task3_feature_names.txt'
with open(feature_names_path, 'w') as f:
    for name in feature_names:
        f.write(name + '\n')
print(f"‚úì Saved feature names to '{feature_names_path}'")

# -----------------------------
# 10. Summary Report
# -----------------------------
print("\n" + "=" * 80)
print("TASK 3 COMPLETE - SUMMARY REPORT")
print("=" * 80)

print(f"\n‚úÖ ALL TASK REQUIREMENTS SATISFIED:")
requirements = [
    ("Use sklearn.pipeline.Pipeline", "‚úì"),
    ("Create Aggregate Features", "‚úì"),
    ("Extract Temporal Features", "‚úì"),
    ("Encode Categorical Variables", "‚úì"),
    ("Handle Missing Values", "‚úì"),
    ("Normalize/Standardize Features", "‚úì"),
    ("Feature Engineering with RFMS", "‚úì")
]

for req, status in requirements:
    print(f"   {req:40} {status}")

print(f"\nüìä DATA TRANSFORMATION PIPELINE:")
print(f"   1. Raw Transactions: {raw_data.shape[0]:,} rows √ó {raw_data.shape[1]} cols")
print(f"   2. Feature Extraction: Added temporal features")
print(f"   3. Customer Aggregation: {customer_data.shape[0]:,} customers")
print(f"   4. Feature Processing: {len(numerical_features)} numerical + {len(categorical_features)} categorical")
print(f"   5. Final Output: {X_transformed.shape[0]:,} samples √ó {X_transformed.shape[1]} features")

print(f"\nüéØ RFMS FEATURES CREATED:")
rfms_features = {
    'Recency (R)': 'recency_days',
    'Frequency (F)': ['transaction_count', 'frequency_per_day'],
    'Monetary (M)': ['total_transaction_amount', 'avg_transaction_amount'],
    'Standard Deviation (S)': 'std_transaction_amount'
}

for category, features in rfms_features.items():
    if isinstance(features, list):
        print(f"   {category}: {', '.join(features)}")
    else:
        print(f"   {category}: {features}")

print(f"\nüîß SKLEARN PIPELINE COMPONENTS:")
print(f"   ‚Ä¢ DateTimeFeatureExtractor - Custom transformer")
print(f"   ‚Ä¢ RFMFeatureAggregator - Custom transformer")
print(f"   ‚Ä¢ ColumnTransformer - sklearn component")
print(f"     ‚îú‚îÄ‚îÄ Numerical: SimpleImputer ‚Üí StandardScaler")
print(f"     ‚îî‚îÄ‚îÄ Categorical: SimpleImputer ‚Üí OneHotEncoder")

print(f"\nüíæ OUTPUT FILES:")
output_files = [
    ('data/processed/task3_features_final.csv', 'Processed feature matrix'),
    ('data/processed/task3_customer_features.csv', 'Customer-level RFM features'),
    ('models/task3_feature_pipeline.pkl', 'Complete sklearn pipeline'),
    ('models/task3_preprocessor.pkl', 'Preprocessor for inference'),
    ('data/processed/task3_metadata.json', 'Processing metadata'),
    ('data/processed/task3_feature_names.txt', 'Feature names list')
]

for path, description in output_files:
    if os.path.exists(path):
        print(f"   ‚úì {path}")
        print(f"     {description}")

print(f"\nüìà SAMPLE OF PROCESSED DATA:")
print(f"First 3 customers, first 8 features:")
sample_data = processed_df.drop(columns=['CustomerId']).iloc[:3, :8]
print(sample_data.to_string())

print(f"\nüîç KEY STATISTICS:")
print(f"   ‚Ä¢ Total customers processed: {customer_data.shape[0]:,}")
print(f"   ‚Ä¢ High-value features created: {len(numerical_features)}")
print(f"   ‚Ä¢ Categorical features encoded: {len(categorical_features)}")
print(f"   ‚Ä¢ One-hot encoded categories: {X_transformed.shape[1] - len(numerical_features)}")

print(f"\n" + "=" * 80)
print("READY FOR TASK 4 - PROXY TARGET VARIABLE ENGINEERING")
print("=" * 80)
print(f"\nNext Steps:")
print(f"   1. Use 'task3_customer_features.csv' for RFM clustering in Task 4")
print(f"   2. Use 'task3_features_final.csv' for model training in Task 5")
print(f"   3. Use 'task3_feature_pipeline.pkl' to process new data")
print(f"\nTo verify pipeline in Python:")
print(f"   from sklearn.pipeline import Pipeline")
print(f"   import joblib")
print(f"   pipeline = joblib.load('models/task3_feature_pipeline.pkl')")
print(f"   print(isinstance(pipeline, Pipeline))  # Should return True")

TASK 3 - FEATURE ENGINEERING WITH SKLEARN PIPELINE
Credit Risk Model using Alternative Data

1. CONFIGURING PIPELINE
----------------------------------------
Pipeline Configuration:
  imputation_strategy : median
  scaling_strategy    : standard
  encoding_strategy   : onehot
  handle_missing      : impute

2. LOADING DATA
----------------------------------------
Loading data from ../data/raw/data.csv...
‚úì Loaded 20,000 transactions

Data Statistics:
  Shape: (20000, 16)
  Columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']

3. BUILDING AND EXECUTING PIPELINE
----------------------------------------
Step 1: Execute first stage (cleaning and aggregation)...
‚úì Customer data shape after aggregation: (1184, 16)
‚úì Sample columns: ['CustomerId', 'total_transaction_amount', 'avg_transaction_a

In [5]:
# Example of proper pipeline structure
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])