### Amex Challenge

In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score
from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import os

# Silence every warning category for the rest of the session. Note that this
# also hides deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')
# Last expression of the cell: shows the working directory, which is the base
# that the data directory is resolved against when the data files are loaded.
os.getcwd()

'/Users/krishuagarwal/Desktop/Programming/python/d2l-learnings'

In [None]:
# All inputs live under "<current working directory>/amex/round-2", so this
# cell only works when the kernel is started from the directory that contains
# the "amex" folder.
data_dir = os.path.join(os.getcwd(), "amex/round-2")
# Rows to score and rows to fit on; train_data is the frame handed to the model
# in the training cell further down.
test_data = pd.read_parquet(os.path.join(data_dir, "test_data.parquet"))
train_data = pd.read_parquet(os.path.join(data_dir, "train_data.parquet"))
# Auxiliary tables. They are loaded here but not referenced again in the
# visible part of the notebook.
offer_metadata = pd.read_parquet(os.path.join(data_dir, "offer_metadata.parquet"))
add_eventdata = pd.read_parquet(os.path.join(data_dir, "add_event.parquet"))
add_transdata = pd.read_parquet(os.path.join(data_dir, "add_trans.parquet"))


In [None]:

class ClickPredictionCatBoost:
    """Click-probability model for user/offer impressions, built on CatBoost.

    The model can be fed either an in-memory DataFrame (``train_data``) or the
    path of a CSV file (``data_path``). Identifier columns and the label are
    never used as features; a fixed list of columns is treated as categorical.
    """

    # Binary click label.
    LABEL_COLUMN = 'y'
    # Identifier columns, excluded from the feature set.
    ID_COLUMNS = ('id1', 'id2', 'id3', 'id4', 'id5')
    # Categorical features based on the data dictionary.
    CATEGORICAL_COLUMNS = (
        'f42',   # Membership level
        'f48',   # Total DL Flights
        'f50',   # Account Creation Indicator
        'f52',   # Active Part y1
        'f53',   # Value assigned to member
        'f54',   # Honors enrollees
        'f55',   # HG Vacation Club
        'f56',   # H Code Tier
        'f57',   # H Promus Indicator
        'f349',  # Day of week
        'f354',  # Days since launch
    )
    # Placeholder category used for missing categorical values.
    MISSING_CATEGORY = 'missing'

    def __init__(self, data_path=None, train_data=None):
        """
        Initialize the CatBoost Click Prediction model

        Args:
            data_path: Path to CSV file (for large datasets)
            train_data: DataFrame (for datasets that fit in memory)
        """
        self.data_path = data_path
        self.train_data = train_data
        self.model = None
        self.feature_columns = None
        self.categorical_features = None
        self.feature_importance = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _available_columns(self):
        """Return the column names of the configured data source.

        Only the CSV header is read when no DataFrame was supplied.
        """
        if self.train_data is not None:
            return list(self.train_data.columns)
        if self.data_path is not None:
            return list(pd.read_csv(self.data_path, nrows=0).columns)
        raise ValueError("Either train_data or data_path must be provided")

    def _load_frame(self):
        """Return the full training frame, reading the CSV if necessary."""
        if self.train_data is not None:
            return self.train_data
        if self.data_path is not None:
            return pd.read_csv(self.data_path)
        raise ValueError("Either train_data or data_path must be provided")

    def _configure_features(self, columns):
        """Derive feature names and categorical feature indices from ``columns``."""
        exclude_cols = set(self.ID_COLUMNS) | {self.LABEL_COLUMN}
        self.feature_columns = [col for col in columns if col not in exclude_cols]

        # One-hot encoded features (f226-f309) are already numerical.
        # Categorical features are stored as positions within feature_columns.
        self.categorical_features = [
            self.feature_columns.index(col) for col in self.CATEGORICAL_COLUMNS
            if col in self.feature_columns
        ]

        print(f"Total features: {len(self.feature_columns)}")
        print(f"Categorical features: {len(self.categorical_features)}")

    def _prepare_features(self, frame):
        """Select the feature columns and make categorical columns CatBoost-safe.

        CatBoost only accepts integer or string values in categorical columns,
        so those columns are cast to ``str`` and missing values are replaced by
        ``MISSING_CATEGORY``. ``frame`` itself is left untouched.
        """
        X = frame[self.feature_columns].copy()
        for position in self.categorical_features or []:
            name = self.feature_columns[position]
            values = X[name]
            X[name] = values.astype(str).where(values.notna(), self.MISSING_CATEGORY)
        return X

    def _split_xy(self, frame):
        """Return ``(features, label)`` for ``frame``.

        Raises:
            KeyError: if the label column is absent.
        """
        if self.LABEL_COLUMN not in frame.columns:
            raise KeyError(
                f"Label column '{self.LABEL_COLUMN}' not found in the data"
            )
        return self._prepare_features(frame), frame[self.LABEL_COLUMN]

    def _sample_rows_from_file(self, sample_frac, random_state):
        """Read a seeded random ``sample_frac`` of the rows of ``data_path``."""
        if self.data_path is None:
            raise ValueError("Either train_data or data_path must be provided")

        # Physical line count minus the header; assumes one record per line.
        with open(self.data_path) as handle:
            total_rows = max(sum(1 for _ in handle) - 1, 0)
        sample_size = int(total_rows * sample_frac)

        rng = np.random.RandomState(random_state)
        # +1 because file line 0 is the header, which is always kept.
        keep = set((rng.choice(total_rows, size=sample_size, replace=False) + 1).tolist())
        return pd.read_csv(
            self.data_path,
            skiprows=lambda line_no: line_no > 0 and line_no not in keep,
        )

    def _column_description(self, header):
        """Build the text of a CatBoost column-description file for ``header``.

        Columns that are not listed are treated by CatBoost as numerical.
        """
        feature_set = set(self.feature_columns)
        categorical = {self.feature_columns[i] for i in self.categorical_features}
        lines = []
        for position, name in enumerate(header):
            if name == self.LABEL_COLUMN:
                lines.append(f"{position}\tLabel")
            elif name in categorical:
                lines.append(f"{position}\tCateg")
            elif name not in feature_set:
                # Identifier columns: kept in the file but ignored for training.
                lines.append(f"{position}\tAuxiliary")
        return "\n".join(lines) + "\n"

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def prepare_data(self):
        """Prepare features and identify categorical columns.

        Works with either data source; in file mode only the CSV header is read.
        """
        self._configure_features(self._available_columns())

    def create_validation_split(self, method='random', validation_size=0.2,
                                random_state=42, sample_frac=0.1):
        """
        Create a train/validation split.

        Args:
            method: 'random', 'time_based', 'user_based', or 'sample'
            validation_size: Proportion for validation
            random_state: Random seed (used by every randomised method)
            sample_frac: Fraction of rows drawn when method is 'sample'

        Returns:
            Tuple ``(X_train, X_val, y_train, y_val)``.

        Raises:
            ValueError: for an unknown ``method`` or when no data source is set.
            KeyError: when the label column is missing.
        """
        if self.feature_columns is None:
            self.prepare_data()

        if method == 'random':
            # Stratified random split over the full data set.
            X, y = self._split_xy(self._load_frame())
            return train_test_split(
                X, y, test_size=validation_size,
                random_state=random_state, stratify=y
            )

        if method == 'sample':
            # Split a random subsample, for very large data sets.
            if self.train_data is not None:
                sample_data = self.train_data.sample(frac=sample_frac, random_state=random_state)
            else:
                sample_data = self._sample_rows_from_file(sample_frac, random_state)

            X, y = self._split_xy(sample_data)
            return train_test_split(
                X, y, test_size=validation_size,
                random_state=random_state, stratify=y
            )

        if method == 'time_based':
            # Oldest rows train, newest rows validate, ordered by id5 (Event Date).
            data_sorted = self._load_frame().sort_values('id5', kind='mergesort')
            split_idx = int(len(data_sorted) * (1 - validation_size))
            train_part = data_sorted.iloc[:split_idx]
            val_part = data_sorted.iloc[split_idx:]

        elif method == 'user_based':
            # Hold out whole customers (id2) so no user appears on both sides.
            frame = self._load_frame()
            unique_users = frame['id2'].unique()
            rng = np.random.RandomState(random_state)
            val_users = rng.choice(
                unique_users, size=int(len(unique_users) * validation_size),
                replace=False
            )
            in_validation = frame['id2'].isin(val_users)
            train_part = frame[~in_validation]
            val_part = frame[in_validation]

        else:
            raise ValueError(
                f"Unknown validation method: {method!r} "
                "(expected 'random', 'time_based', 'user_based' or 'sample')"
            )

        X_train, y_train = self._split_xy(train_part)
        X_val, y_val = self._split_xy(val_part)
        return X_train, X_val, y_train, y_val

    def train_model(self, validation_method='sample', hyperparameters=None):
        """
        Train CatBoost model with specified validation method

        Args:
            validation_method: 'random', 'time_based', 'user_based', or 'sample'
            hyperparameters: Dict of CatBoost hyperparameters. The dict is not
                modified; 'cat_features' is filled in when it is absent.

        Returns:
            Dict with 'train_auc', 'val_auc', 'train_ap' and 'val_ap'.
        """

        # Prepare data
        self.prepare_data()

        # Default hyperparameters
        if hyperparameters is None:
            hyperparameters = {
                'iterations': 1000,
                'learning_rate': 0.1,
                'depth': 6,
                'eval_metric': 'AUC',
                'random_seed': 42,
                'early_stopping_rounds': 50,
                'verbose': 100,
                'auto_class_weights': 'Balanced'  # Handle class imbalance
            }
        # Work on a copy so the caller's dict is untouched, and make sure
        # custom configurations still declare the categorical columns.
        hyperparameters = dict(hyperparameters)
        hyperparameters.setdefault('cat_features', self.categorical_features)

        # Create validation split
        print(f"Creating validation split using method: {validation_method}")
        X_train, X_val, y_train, y_val = self.create_validation_split(method=validation_method)

        print(f"Training set size: {len(X_train)}")
        print(f"Validation set size: {len(X_val)}")
        print(f"Positive class ratio - Train: {y_train.mean():.4f}, Val: {y_val.mean():.4f}")

        # Initialize CatBoost model
        self.model = CatBoostClassifier(**hyperparameters)

        # Train the model
        print("Training CatBoost model...")
        self.model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            use_best_model=True
        )

        # Evaluate model
        train_pred = self.model.predict_proba(X_train)[:, 1]
        val_pred = self.model.predict_proba(X_val)[:, 1]

        train_auc = roc_auc_score(y_train, train_pred)
        val_auc = roc_auc_score(y_val, val_pred)

        train_ap = average_precision_score(y_train, train_pred)
        val_ap = average_precision_score(y_val, val_pred)

        print(f"\nModel Performance:")
        print(f"Training AUC: {train_auc:.4f}")
        print(f"Validation AUC: {val_auc:.4f}")
        print(f"Training AP: {train_ap:.4f}")
        print(f"Validation AP: {val_ap:.4f}")

        return {
            'train_auc': train_auc,
            'val_auc': val_auc,
            'train_ap': train_ap,
            'val_ap': val_ap
        }

    def train_with_file_pools(self, train_file, val_file=None, hyperparameters=None):
        """
        Train using CatBoost Pool for very large datasets

        The label, categorical and identifier columns are declared to CatBoost
        through a temporary column-description file derived from the header of
        ``train_file``; ``val_file`` must have the same column layout.

        Args:
            train_file: Path to training CSV file
            val_file: Path to validation CSV file (optional)
            hyperparameters: Dict of CatBoost hyperparameters

        Raises:
            KeyError: if ``train_file`` has no label column.
        """
        import tempfile

        header = list(pd.read_csv(train_file, nrows=0).columns)
        if self.LABEL_COLUMN not in header:
            raise KeyError(
                f"Label column '{self.LABEL_COLUMN}' not found in {train_file}"
            )
        self._configure_features(header)

        # Default hyperparameters
        if hyperparameters is None:
            hyperparameters = {
                'iterations': 1000,
                'learning_rate': 0.1,
                'depth': 6,
                'eval_metric': 'AUC',
                'random_seed': 42,
                'early_stopping_rounds': 50,
                'verbose': 100,
                'auto_class_weights': 'Balanced'
            }

        cd_file = tempfile.NamedTemporaryFile(mode='w', suffix='.cd', delete=False)
        try:
            with cd_file:
                cd_file.write(self._column_description(header))

            # Create training pool
            train_pool = Pool(
                data=train_file,
                column_description=cd_file.name,
                delimiter=',',
                has_header=True
            )

            # Create validation pool if provided
            eval_set = None
            if val_file:
                eval_set = Pool(
                    data=val_file,
                    column_description=cd_file.name,
                    delimiter=',',
                    has_header=True
                )
        finally:
            # The description is only needed while the pools are being built.
            os.remove(cd_file.name)

        # Initialize and train model
        self.model = CatBoostClassifier(**hyperparameters)

        print("Training CatBoost model with file pools...")
        self.model.fit(train_pool, eval_set=eval_set)

        print("Model training completed!")

    def get_feature_importance(self, plot=True, top_n=20):
        """Get and plot feature importance.

        Args:
            plot: When True, draw a horizontal bar chart of the top features.
            top_n: Number of features shown in the chart.

        Returns:
            DataFrame with 'feature' and 'importance', most important first.

        Raises:
            ValueError: if no model has been trained or loaded.
        """

        if self.model is None:
            raise ValueError("Model must be trained first")

        # Get feature importance
        importance = self.model.get_feature_importance()

        # Create DataFrame for easier handling
        feature_importance_df = pd.DataFrame({
            'feature': self.feature_columns,
            'importance': importance
        }).sort_values('importance', ascending=False)

        self.feature_importance = feature_importance_df

        if plot:
            plt.figure(figsize=(10, 8))
            top_features = feature_importance_df.head(top_n)

            plt.barh(range(len(top_features)), top_features['importance'])
            plt.yticks(range(len(top_features)), top_features['feature'])
            plt.xlabel('Feature Importance')
            plt.title(f'Top {top_n} Feature Importances')
            plt.gca().invert_yaxis()  # most important feature at the top
            plt.tight_layout()
            plt.show()

        return feature_importance_df

    def predict_click_probability(self, data):
        """Predict click probabilities for new data.

        Args:
            data: DataFrame containing every feature column; it is not modified.

        Returns:
            Array with the predicted probability of the positive class per row.

        Raises:
            ValueError: if no model has been trained or loaded.
        """

        if self.model is None:
            raise ValueError("Model must be trained first")

        # Same column selection and categorical encoding as during training.
        X = self._prepare_features(data)

        # Predict probabilities
        probabilities = self.model.predict_proba(X)[:, 1]

        return probabilities

    def rank_offers_for_user_day(self, data, user_col='id2', day_col='id5', offer_col='id3'):
        """
        Rank offers for each user-day combination

        Args:
            data: DataFrame with user-day-offer combinations (not modified)
            user_col: Column name for user ID
            day_col: Column name for day
            offer_col: Column name for offer ID (currently unused)

        Returns:
            New DataFrame ordered by user, day and descending click
            probability, with added 'click_probability' and 'rank' columns
            (rank 1 is the most likely click within a user-day). Rows whose
            user or day is missing are dropped.

        Raises:
            ValueError: if no model has been trained or loaded.
        """

        if self.model is None:
            raise ValueError("Model must be trained first")

        # Score on a new frame so the caller's data is left untouched.
        scored = data.assign(click_probability=self.predict_click_probability(data))
        scored = scored.dropna(subset=[user_col, day_col])

        # Rank offers for each user-day
        ranked_offers = scored.sort_values(
            [user_col, day_col, 'click_probability'],
            ascending=[True, True, False]
        ).reset_index(drop=True)

        # Add rank within each user-day group
        ranked_offers['rank'] = ranked_offers.groupby([user_col, day_col]).cumcount() + 1

        return ranked_offers

    def save_model(self, filepath):
        """Save trained model to ``filepath``.

        Raises:
            ValueError: if no model has been trained or loaded.
        """
        if self.model is None:
            raise ValueError("Model must be trained first")

        self.model.save_model(filepath)
        print(f"Model saved to {filepath}")

    def load_model(self, filepath):
        """Load trained model from ``filepath``.

        The feature names and categorical feature indices stored in the model
        are restored as well, so prediction and feature importance work
        without calling ``prepare_data`` first.
        """
        self.model = CatBoostClassifier()
        self.model.load_model(filepath)

        names = self.model.feature_names_
        if names:
            self.feature_columns = list(names)
            self.categorical_features = list(self.model.get_cat_feature_indices())

        print(f"Model loaded from {filepath}")

# Example usage functions
def example_usage_with_dataframe(train_data):
    """Fit the click model on an in-memory DataFrame.

    Args:
        train_data: DataFrame handed to ClickPredictionCatBoost as its
            training data.

    Returns:
        Tuple ``(model, results, importance_df)``: the fitted wrapper, the
        value returned by ``train_model`` and the value returned by
        ``get_feature_importance``.
    """
    click_model = ClickPredictionCatBoost(train_data=train_data)

    # The 'sample' validation method is the one intended for large data sets.
    metrics = click_model.train_model(validation_method='sample')

    # Top-20 importances.
    top_features = click_model.get_feature_importance(top_n=20)

    # New rows can be scored afterwards with:
    #   click_model.predict_click_probability(new_data)
    return click_model, metrics, top_features

def example_usage_with_files(train_file, val_file=None):
    """Fit the click model directly from CSV files, for very large datasets.

    Args:
        train_file: Path of the training CSV; also stored as the wrapper's
            ``data_path``.
        val_file: Optional path of a validation CSV.

    Returns:
        Tuple ``(model, importance_df)``: the fitted wrapper and the value
        returned by ``get_feature_importance``.
    """
    click_model = ClickPredictionCatBoost(data_path=train_file)

    # Training reads the files through CatBoost pools instead of a DataFrame.
    click_model.train_with_file_pools(train_file, val_file)

    # Top-20 importances.
    top_features = click_model.get_feature_importance(top_n=20)

    return click_model, top_features

def hyperparameter_tuning_example():
    """Return three ready-made CatBoost hyperparameter dictionaries.

    Returns:
        Tuple ``(conservative_params, aggressive_params, ranking_params)``:
        a small/fast configuration, a larger/slower one, and one evaluated
        with a ranking metric.
    """

    def _preset(iterations, learning_rate, depth, eval_metric, patience, log_every, **extras):
        # Keys every preset shares, in a fixed order; preset-specific options
        # are appended after them.
        preset = {
            'iterations': iterations,
            'learning_rate': learning_rate,
            'depth': depth,
            'eval_metric': eval_metric,
            'random_seed': 42,
            'early_stopping_rounds': patience,
            'verbose': log_every,
            'auto_class_weights': 'Balanced',
        }
        preset.update(extras)
        return preset

    # Conservative (faster training): fewer, shallower trees and a larger step.
    conservative_params = _preset(500, 0.15, 4, 'AUC', 30, 50)

    # Aggressive (better performance): more and deeper trees, smaller step,
    # plus row and column subsampling.
    aggressive_params = _preset(
        2000, 0.05, 8, 'AUC', 100, 100,
        subsample=0.8, colsample_bylevel=0.8,
    )

    # Ranking-focused: Logloss objective monitored with NDCG.
    # NOTE(review): NDCG is a group-wise metric and the training code in this
    # notebook passes no group ids; confirm CatBoost accepts this combination.
    ranking_params = _preset(1500, 0.08, 6, 'NDCG', 75, 100, objective='Logloss')

    return conservative_params, aggressive_params, ranking_params

# Main execution example
# Main execution example.
# NOTE: inside a notebook kernel __name__ is "__main__", so this guard is true
# and the block below runs (and trains a model) whenever this cell is executed.
if __name__ == "__main__":
    # Example 1: in-memory training on the train_data DataFrame, which must
    # already exist in the notebook namespace.
    print("Training with DataFrame...")
    model, results, importance = example_usage_with_dataframe(train_data)
    
    # Example 2 (disabled): train straight from CSV files for very large datasets.
    # print("Training with files...")
    # model, importance = example_usage_with_files("train.csv", "val.csv")
    
    # Example 3 (disabled): retrain with one of the predefined hyperparameter sets.
    # conservative, aggressive, ranking = hyperparameter_tuning_example()
    # model.train_model(validation_method='sample', hyperparameters=aggressive)
    
    # Persist the fitted model; the relative path resolves against the
    # current working directory.
    model.save_model("click_prediction_model.cbm")
    
    print("Training completed successfully!")


In [27]:
# Training cell. The model is fitted twice here: example_usage_with_dataframe
# trains once with the default hyperparameters, then train_model refits the
# same wrapper with the `ranking` preset, and only that second fit is saved.
# The output recorded for this cell ends in KeyError: 'y'.
print("Training with DataFrame...")
model, results, importance = example_usage_with_dataframe(train_data)
conservative, aggressive, ranking = hyperparameter_tuning_example()
model.train_model(validation_method='sample', hyperparameters=ranking)
# Written relative to the current working directory.
model.save_model("click_prediction_model.cbm")
print("Training completed successfully!")

Training with DataFrame...
Total features: 366
Categorical features: 11


KeyError: 'y'

In [26]:
# Preview the first rows of the scoring data.
test_data.head()

Unnamed: 0,id1,id2,id3,id4,id5,f1,f2,f3,f4,f5,...,f357,f358,f359,f360,f361,f362,f363,f364,f365,f366
46756,1362907_91950_16-23_2023-11-04 18:56:26.000794,1362907,91950,2023-11-04 18:56:26.000794,2023-11-04,,,,,,...,0.0027807272790246,0.0465999838670646,0.0,,1.0,0.0,0.0,56.0,0.0,0.0
57819,1082599_88356_16-23_2023-11-04 06:08:53.373,1082599,88356,2023-11-04 06:08:53.373,2023-11-04,,9.0,,,,...,0.0014287946777398,0.0603093514970909,0.0,,195.0,13.0,0.0666666666666666,,,
15390,1888466_958700_16-23_2023-11-05 10:07:28.000725,1888466,958700,2023-11-05 10:07:28.000725,2023-11-05,,,,,22.0,...,-0.0174961119751166,0.0734836702954899,0.0,,155.0,67.0,0.432258064516129,1142.0,436.0,0.3817863397548161
145730,1888971_795739_16-23_2023-11-04 12:25:28.244,1888971,795739,2023-11-04 12:25:28.244,2023-11-04,,,,,,...,0.0013159476327096,0.040572039549215,0.0,,,,,,,
146085,1256369_82296_16-23_2023-11-05 06:45:26.657,1256369,82296,2023-11-05 06:45:26.657,2023-11-05,,,,,,...,0.0020541699799278,0.038243539079214,0.0,,29.0,2.0,0.0689655172413793,361.0,3.0,0.0083102493074792
