In [None]:
"""
Offer Click Prediction Pipeline
Competition: Unstop ML Challenge
Author: Your Name
Description: End-to-end deep learning pipeline for predicting customer click-through rates on promotional offers
"""

import os
from pathlib import Path
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
import warnings

warnings.filterwarnings('ignore')



In [None]:
# Configuration

class Config:
    """Centralized configuration for the pipeline.

    Every setting is a class attribute, so values can be read straight off the
    class (``Config.BATCH_SIZE``). Assigning to an attribute on an *instance*
    overrides it for that instance only and leaves the class default intact.
    """
    # Paths
    DATA_DIR = Path('data')        # input directory; NOT created by create_dirs()
    MODEL_DIR = Path('models')     # created by create_dirs()
    OUTPUT_DIR = Path('outputs')   # created by create_dirs()

    # Training parameters
    BATCH_SIZE = 1024
    EPOCHS = 5
    LEARNING_RATE = 0.001
    VALIDATION_SPLIT = 0.2         # fraction of the training data held out
    RANDOM_SEED = 42

    # Quick demo mode: when True, the pipeline is meant to run on a
    # DEMO_SAMPLE_SIZE fraction of the data for DEMO_EPOCHS epochs.
    DEMO_MODE = False
    DEMO_SAMPLE_SIZE = 0.1
    DEMO_EPOCHS = 2

    # Model parameters
    DROPOUT_RATE = 0.3
    EMBEDDING_DIM_MAX = 50         # upper bound on any single embedding width

    # Device: resolved once, when the class body is executed
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    @classmethod
    def create_dirs(cls):
        """Create the model and output directories if they don't exist.

        Missing parent directories are created too, so nested locations such
        as ``Path('artifacts/run1/models')`` work. Calling this repeatedly is
        safe.
        """
        for directory in (cls.MODEL_DIR, cls.OUTPUT_DIR):
            directory.mkdir(parents=True, exist_ok=True)


In [None]:
# Data Loading and Preprocessing

def load_data(data_dir: Path) -> tuple:
    """
    Read the five parquet inputs and attach parsed datetime columns

    Args:
        data_dir: Directory containing the parquet files

    Returns:
        Tuple of dataframes (train, test, offer_meta, add_event, add_trans)

    Raises:
        FileNotFoundError: If an expected file is absent. Only the first
            missing file (in the order listed in ``files``) is reported.
    """
    print("Loading datasets...")

    files = {
        'train': 'train.parquet',
        'test': 'test.parquet',
        'offer_meta': 'offer_metadata.parquet',
        'add_event': 'additional_event.parquet',
        'add_trans': 'additional_transaction.parquet'
    }

    # Verify everything is present before reading anything
    for filename in files.values():
        if not (data_dir / filename).exists():
            raise FileNotFoundError(f"{filename} not found in {data_dir}")

    # Read the files in declaration order
    frames = {
        key: pd.read_parquet(data_dir / filename)
        for key, filename in files.items()
    }
    train = frames['train']
    test = frames['test']
    offer_meta = frames['offer_meta']
    add_event = frames['add_event']
    add_trans = frames['add_trans']

    print(f"✓ Train shape: {train.shape}, Test shape: {test.shape}")

    # (dataframe, new datetime column, raw source column)
    date_columns = (
        (train, 'event_time', 'id4'),
        (test, 'event_time', 'id4'),
        (offer_meta, 'offer_start', 'id12'),
        (offer_meta, 'offer_end', 'id13'),
        (add_event, 'event_time', 'id4'),
        (add_trans, 'trans_time', 'f370'),
    )
    for frame, parsed_col, raw_col in date_columns:
        frame[parsed_col] = pd.to_datetime(frame[raw_col])

    return train, test, offer_meta, add_event, add_trans

def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce memory usage by downcasting 64-bit numeric columns.

    ``float64`` columns become ``float32`` (a deliberate precision trade-off).
    ``int64`` columns become ``int32`` only when every value fits in the int32
    range; otherwise the column is left untouched, because a blind cast would
    silently wrap large values (e.g. identifiers) into unrelated numbers.

    Args:
        df: Dataframe to shrink. It is modified in place.

    Returns:
        The same dataframe object, for call chaining.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    int32_range = np.iinfo(np.int32)

    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == 'float64':
            df[col] = df[col].astype('float32')
        elif col_dtype == 'int64':
            values = df[col]
            # An empty column has no min/max; downcasting it is always safe.
            fits_int32 = values.empty or (
                values.min() >= int32_range.min and values.max() <= int32_range.max
            )
            if fits_int32:
                df[col] = values.astype('int32')

    end_mem = df.memory_usage().sum() / 1024**2
    # Avoid a NaN percentage when the frame reports zero bytes.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print(f"✓ Memory reduced: {start_mem:.2f}MB → {end_mem:.2f}MB "
          f"({reduction:.1f}% reduction)")

    return df

In [None]:
# Feature Engineering


def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive offer-window and calendar features from the datetime columns.

    Reads ``offer_start``, ``offer_end`` and ``event_time``; adds seven new
    columns to ``df`` in place and returns the same dataframe.
    """
    starts = df['offer_start']
    ends = df['offer_end']
    events = df['event_time']

    # Offer-window features, in whole days; negative gaps are clipped to zero
    df['offer_active_days'] = (ends - starts).dt.days
    df['days_since_offer_start'] = (events - starts).dt.days.clip(lower=0)
    df['days_until_offer_end'] = (ends - events).dt.days.clip(lower=0)
    df['offer_expired'] = events.gt(ends).astype(int)

    # Calendar features of the event itself
    event_clock = events.dt
    df['event_hour'] = event_clock.hour
    df['event_dayofweek'] = event_clock.dayofweek
    # dayofweek is Monday=0 ... Sunday=6, so 5 and 6 are the weekend
    df['is_weekend'] = df['event_dayofweek'].ge(5).astype(int)

    return df

def create_aggregated_features(train: pd.DataFrame, test: pd.DataFrame,
                               add_event: pd.DataFrame, add_trans: pd.DataFrame) -> tuple:
    """Create aggregated customer-level features and attach them to train/test.

    Events and transactions are summarised per ``id2`` and left-joined onto
    both dataframes. Rows whose ``id2`` has no history get 0 for every
    aggregated column.

    Args:
        train: Training rows; must contain ``id2``.
        test: Test rows; must contain ``id2``.
        add_event: Event log with ``id2``, ``id3`` and a datetime ``event_time``.
        add_trans: Transaction log with ``id2``, ``f370`` and a datetime
            ``trans_time``.

    Returns:
        Tuple ``(train, test)`` of NEW dataframes carrying the six aggregated
        columns. The inputs are not modified.
    """
    print("Creating aggregated features...")

    # Event aggregations
    event_agg = add_event.groupby('id2').agg(
        event_count=('id3', 'count'),
        unique_event_hours=('event_time', lambda x: x.dt.hour.nunique()),
        unique_event_days=('event_time', lambda x: x.dt.dayofweek.nunique())
    ).reset_index()

    # Transaction aggregations
    trans_agg = add_trans.groupby('id2').agg(
        trans_count=('f370', 'count'),
        unique_trans_days=('trans_time', lambda x: x.dt.dayofweek.nunique()),
        avg_trans_hour=('trans_time', lambda x: x.dt.hour.mean())
    ).reset_index()

    agg_cols = ['event_count', 'unique_event_hours', 'unique_event_days',
                'trans_count', 'unique_trans_days', 'avg_trans_hour']

    def _attach(frame: pd.DataFrame) -> pd.DataFrame:
        # merge() returns a new frame, so the result must be returned to the
        # caller; rebinding a loop variable would discard it.
        merged = (
            frame
            .merge(event_agg, on='id2', how='left')
            .merge(trans_agg, on='id2', how='left')
        )
        # Customers without any history come back as NaN from the left join
        merged[agg_cols] = merged[agg_cols].fillna(0)
        return merged

    return _attach(train), _attach(test)

def feature_engineering(train: pd.DataFrame, test: pd.DataFrame,
                       offer_meta: pd.DataFrame, add_event: pd.DataFrame,
                       add_trans: pd.DataFrame) -> tuple:
    """Run every feature-building step on train and test and return both.

    Steps, in order: join offer metadata on ``id3``, add temporal features,
    add per-customer aggregates, then downcast numeric columns.
    """
    print("\n=== Feature Engineering ===")

    # Only these offer attributes are carried over to the modelling frames
    offer_subset = offer_meta[['id3', 'discountpercent', 'offer_start', 'offer_end']]
    train = train.merge(offer_subset, on='id3', how='left')
    test = test.merge(offer_subset, on='id3', how='left')

    # Temporal features (train first, then test)
    train, test = (create_temporal_features(frame) for frame in (train, test))

    # Customer-level aggregates from the event / transaction logs
    train, test = create_aggregated_features(train, test, add_event, add_trans)

    # Downcast numeric columns (train first, then test)
    train, test = (reduce_memory_usage(frame) for frame in (train, test))

    print("✓ Feature engineering complete\n")
    return train, test


In [None]:
# Encoding and Scaling

def encode_categorical_features(train: pd.DataFrame, test: pd.DataFrame,
                                categorical_cols: list) -> tuple:
    """Label-encode the given columns of train and test in place.

    Each column is cast to string and encoded with one encoder fitted on the
    train and test values together, so both frames share the same integer
    codes.

    Returns:
        Tuple ``(train, test, encoders)`` where ``encoders`` maps each column
        name to its fitted ``LabelEncoder``.
    """
    print(f"Encoding {len(categorical_cols)} categorical features...")

    encoders = {}
    for col in categorical_cols:
        train_labels = train[col].astype(str)
        test_labels = test[col].astype(str)

        # Fit on the union of both frames, then encode each one
        encoder = LabelEncoder().fit(pd.concat([train_labels, test_labels]))
        train[col] = encoder.transform(train_labels)
        test[col] = encoder.transform(test_labels)
        encoders[col] = encoder

    print("✓ Encoding complete\n")
    return train, test, encoders

def scale_numerical_features(train: pd.DataFrame, test: pd.DataFrame,
                             numerical_cols: list) -> tuple:
    """Standardize the numerical columns of train and test in place.

    The scaler is fitted on the training columns only and then applied to
    both frames.

    Returns:
        Tuple ``(train, test, scaler)`` with the fitted ``StandardScaler``.
    """
    print(f"Scaling {len(numerical_cols)} numerical features...")

    scaler = StandardScaler()
    scaler.fit(train[numerical_cols])

    # Apply the train-fitted statistics to train first, then test
    for frame in (train, test):
        frame[numerical_cols] = scaler.transform(frame[numerical_cols])

    print("✓ Scaling complete\n")
    return train, test, scaler


In [None]:
# PyTorch Dataset

class ClickDataset(Dataset):
    """In-memory dataset yielding (categorical, numerical[, label]) rows.

    Categorical columns are held as an int64 array and numerical columns as a
    float32 array. When no target is supplied, items are 2-tuples without a
    label.
    """

    def __init__(self, df: pd.DataFrame, categorical_cols: list,
                 numerical_cols: list, target: pd.Series = None):
        categorical_block = df[categorical_cols].to_numpy()
        numerical_block = df[numerical_cols].to_numpy()

        self.cats = categorical_block.astype(np.int64)
        self.nums = numerical_block.astype(np.float32)
        self.y = None if target is None else target.values.astype(np.float32)

    def __len__(self):
        # One item per row of the categorical array
        return self.cats.shape[0]

    def __getitem__(self, idx):
        features = (self.cats[idx], self.nums[idx])
        if self.y is None:
            return features
        return features + (self.y[idx],)


In [None]:
# Neural Network Model

class ClickPredictionNN(nn.Module):
    """Feed-forward network with one embedding table per categorical feature.

    The embedded categorical features and the continuous features are each
    batch-normalised, concatenated, and passed through two hidden layers
    (256 and 128 units, ReLU + dropout) to a single sigmoid output.

    Args:
        embedding_sizes: One ``(num_categories, embedding_dim)`` pair per
            categorical feature, in the column order of ``cat_data``.
        n_continuous: Number of continuous input features.
        dropout_rate: Dropout probability applied after each hidden layer.
    """

    def __init__(self, embedding_sizes: list, n_continuous: int,
                 dropout_rate: float = 0.3):
        super().__init__()

        # Embedding layers for categorical features
        self.embeddings = nn.ModuleList([
            nn.Embedding(categories, size)
            for categories, size in embedding_sizes
        ])

        emb_dim_sum = sum(size for _, size in embedding_sizes)

        # Batch normalization
        self.batchnorm_cat = nn.BatchNorm1d(emb_dim_sum)
        self.batchnorm_cont = nn.BatchNorm1d(n_continuous)

        # Fully connected layers
        self.fc1 = nn.Linear(emb_dim_sum + n_continuous, 256)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.fc3 = nn.Linear(128, 1)

    def forward(self, cat_data, cont_data):
        """Return one click probability per row.

        Args:
            cat_data: Integer tensor indexed as ``cat_data[:, i]`` for the
                i-th categorical feature.
            cont_data: Float tensor of continuous features, one row per sample.

        Returns:
            1-D tensor of probabilities with one entry per input row. The
            result stays 1-D even for a batch of a single row.
        """
        # Process embeddings
        embeddings = [emb(cat_data[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat(embeddings, 1)
        x = self.batchnorm_cat(x)

        # Normalize continuous features
        cont_data = self.batchnorm_cont(cont_data)

        # Concatenate all features
        x = torch.cat([x, cont_data], 1)

        # Forward pass
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        output = torch.sigmoid(self.fc3(x))

        # Drop only the trailing feature dim. A bare squeeze() would also
        # remove the batch dim for a single-row batch, producing a 0-d tensor
        # that breaks loss shape checks and iteration over predictions.
        return output.squeeze(-1)


In [None]:
# Training

def train_model(model: nn.Module, train_loader: DataLoader,
                valid_loader: DataLoader, config: Config) -> nn.Module:
    """Train the network and return it with the best validation weights.

    After every epoch the model is scored on ``valid_loader`` with average
    precision. Whenever the score improves, the weights are written to
    ``config.MODEL_DIR / 'best_model.pth'``. Once training finishes, the best
    checkpoint saved during this call is loaded back into the model, so the
    returned model matches the reported "Best AP" rather than the last epoch.

    Args:
        model: Network called as ``model(cat, cont)`` that returns
            probabilities in [0, 1] (required by ``BCELoss``).
        train_loader: Yields ``(cat, cont, target)`` batches for training.
        valid_loader: Yields ``(cat, cont, target)`` batches for validation.
        config: Provides ``DEVICE``, ``LEARNING_RATE``, ``EPOCHS`` and
            ``MODEL_DIR``.

    Returns:
        The same model object, on ``config.DEVICE``.
    """
    print(f"=== Training on {config.DEVICE} ===")

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
    model.to(config.DEVICE)

    # Make sure the checkpoint location exists before the first save.
    config.MODEL_DIR.mkdir(parents=True, exist_ok=True)
    model_path = config.MODEL_DIR / 'best_model.pth'

    best_ap = 0
    # Tracks whether THIS run wrote a checkpoint, so a stale file left by a
    # previous run is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(config.EPOCHS):
        # Training phase
        model.train()
        running_loss = 0

        for cat, cont, target in train_loader:
            cat = cat.to(config.DEVICE)
            cont = cont.to(config.DEVICE)
            target = target.to(config.DEVICE).reshape(-1)

            optimizer.zero_grad()
            # reshape(-1) keeps predictions 1-D even if the model collapses
            # a single-row batch to a scalar.
            preds = model(cat, cont).reshape(-1)
            loss = criterion(preds, target)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per sample
            running_loss += loss.item() * target.size(0)

        # Validation phase
        model.eval()
        true_chunks, pred_chunks = [], []

        with torch.no_grad():
            for cat, cont, target in valid_loader:
                cat = cat.to(config.DEVICE)
                cont = cont.to(config.DEVICE)

                preds = model(cat, cont)
                # Flatten first: iterating a 0-d array (single-row batch)
                # would raise TypeError.
                true_chunks.append(target.reshape(-1).cpu().numpy())
                pred_chunks.append(preds.reshape(-1).cpu().numpy())

        y_true = np.concatenate(true_chunks) if true_chunks else np.array([])
        y_pred = np.concatenate(pred_chunks) if pred_chunks else np.array([])

        # Calculate metrics
        train_loss = running_loss / len(train_loader.dataset)
        avg_precision = average_precision_score(y_true, y_pred)

        print(f"Epoch {epoch+1}/{config.EPOCHS} - Loss: {train_loss:.4f} - AP: {avg_precision:.4f}")

        # Save best model
        if avg_precision > best_ap:
            best_ap = avg_precision
            torch.save(model.state_dict(), model_path)
            checkpoint_saved = True
            print(f"  ✓ Saved best model (AP: {best_ap:.4f})")

    # Hand back the best-scoring weights, not whatever the last epoch produced.
    if checkpoint_saved:
        model.load_state_dict(torch.load(model_path, map_location=config.DEVICE))

    print(f"\n✓ Training complete! Best AP: {best_ap:.4f}\n")
    return model


In [None]:
# Prediction

def predict(model: nn.Module, test_loader: DataLoader, device: torch.device) -> np.ndarray:
    """Generate predictions for every row served by ``test_loader``.

    Args:
        model: Network called as ``model(cat, cont)``; it is moved to
            ``device`` and switched to eval mode.
        test_loader: Yields ``(cat, cont)`` or ``(cat, cont, target)``
            batches; a target, if present, is ignored.
        device: Device on which inference runs.

    Returns:
        1-D array with one prediction per input row, in loader order. An
        empty loader yields an empty array.
    """
    print("Generating predictions...")

    model.to(device)
    model.eval()
    chunks = []

    with torch.no_grad():
        for batch in test_loader:
            if len(batch) == 3:  # Has target
                cat, cont, _ = batch
            else:  # No target
                cat, cont = batch

            cat = cat.to(device)
            cont = cont.to(device)
            preds = model(cat, cont)
            # Flatten before collecting: a single-row batch may come back as
            # a 0-d tensor, which cannot be iterated or extended from.
            chunks.append(preds.reshape(-1).cpu().numpy())

    print("✓ Predictions complete\n")
    return np.concatenate(chunks) if chunks else np.array([])


In [None]:
# Main Pipeline

def main():
    """Run the pipeline end to end and write the submission file.

    Stages: load data, optionally subsample (demo mode), engineer features,
    encode and scale, split, train, predict, rank offers per customer and
    save the top 7 to ``Config.OUTPUT_DIR / 'submission.csv'``. Any exception
    is printed and then re-raised.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("  OFFER CLICK PREDICTION PIPELINE")
    print(banner + "\n")

    # Setup: output folders and reproducible RNG state
    Config.create_dirs()
    torch.manual_seed(Config.RANDOM_SEED)
    np.random.seed(Config.RANDOM_SEED)

    try:
        # Load data
        train, test, offer_meta, add_event, add_trans = load_data(Config.DATA_DIR)

        # Demo mode works on a random fraction of both frames
        if Config.DEMO_MODE:
            print(f"\n⚡ DEMO MODE: Using {Config.DEMO_SAMPLE_SIZE*100}% of data "
                  f"with {Config.DEMO_EPOCHS} epochs\n")
            sampling = dict(frac=Config.DEMO_SAMPLE_SIZE, random_state=Config.RANDOM_SEED)
            train = train.sample(**sampling)
            test = test.sample(**sampling)

        # Feature engineering
        train, test = feature_engineering(train, test, offer_meta, add_event, add_trans)

        # Define feature columns
        categorical_cols = ['id2', 'id3', 'discountpercent', 'event_hour',
                            'event_dayofweek', 'is_weekend', 'offer_expired']

        # Everything that is neither categorical, target, id, nor raw date,
        # and has a plain numeric dtype, is treated as a numerical feature.
        non_numeric_features = categorical_cols + ['y', 'id1', 'id4', 'event_time',
                                                   'offer_start', 'offer_end']
        numeric_dtypes = [np.float32, np.float64, np.int32, np.int64]
        numerical_cols = [
            col for col in train.columns
            if col not in non_numeric_features and train[col].dtype in numeric_dtypes
        ]

        print(f"Features: {len(categorical_cols)} categorical + {len(numerical_cols)} numerical\n")

        # Encode and scale
        train, test, encoders = encode_categorical_features(train, test, categorical_cols)
        train, test, scaler = scale_numerical_features(train, test, numerical_cols)

        # Stratified train/validation split on the target
        labels = train['y']
        X_train, X_val, y_train, y_val = train_test_split(
            train, labels,
            test_size=Config.VALIDATION_SPLIT,
            random_state=Config.RANDOM_SEED,
            stratify=labels
        )

        print(f"Data split: Train={len(X_train):,} | Validation={len(X_val):,}\n")

        # Create datasets
        train_ds = ClickDataset(X_train, categorical_cols, numerical_cols, y_train)
        val_ds = ClickDataset(X_val, categorical_cols, numerical_cols, y_val)
        test_ds = ClickDataset(test, categorical_cols, numerical_cols)

        # Create data loaders (only the training loader shuffles)
        batch_size = Config.BATCH_SIZE
        train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)
        test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

        # Embedding width: half the cardinality (rounded up), capped
        cardinalities = [len(encoders[col].classes_) for col in categorical_cols]
        embedding_sizes = [
            (n_unique, min(Config.EMBEDDING_DIM_MAX, (n_unique + 1) // 2))
            for n_unique in cardinalities
        ]

        # Initialize model
        model = ClickPredictionNN(embedding_sizes, len(numerical_cols), Config.DROPOUT_RATE)
        n_params = sum(p.numel() for p in model.parameters())
        print(f"Model: {n_params:,} parameters\n")

        # Instance-level override keeps the class default EPOCHS untouched
        train_config = Config()
        if Config.DEMO_MODE:
            train_config.EPOCHS = Config.DEMO_EPOCHS

        model = train_model(model, train_loader, val_loader, train_config)

        # Generate predictions
        test['pred'] = predict(model, test_loader, Config.DEVICE)

        # Rank offers per customer (top 7)
        print("Ranking top-7 offers per customer...")
        test = test.sort_values(['id2', 'pred'], ascending=[True, False])
        test['rank'] = test.groupby('id2')['pred'].rank(method='first', ascending=False)

        # Create submission
        submission_cols = ['id1', 'id2', 'id3', 'id5', 'pred', 'rank']
        submission = test.loc[test['rank'] <= 7, submission_cols]
        submission_path = Config.OUTPUT_DIR / 'submission.csv'
        submission.to_csv(submission_path, index=False)

        print(f"✓ Submission saved: {submission_path}")
        print(f"✓ Submission shape: {submission.shape}")
        print("\n" + banner)
        print(" PIPELINE COMPLETED SUCCESSFULLY!")
        print(banner + "\n")

    except Exception as exc:
        # Surface the message prominently, then let the error propagate
        print(f"\n ERROR: {exc!s}\n")
        raise

if __name__ == "__main__":
    main()