In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import joblib
import shap
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

class ClickPredictionModel:
    def __init__(self):
        self.model = None
        self.feature_columns = None
        self.label_encoders = {}
        self.scaler = StandardScaler()
        
    def load_data(self):
        """Load all datasets"""
        # Load main datasets
        self.df_events = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/add_event.parquet')
        self.df_trans = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/add_trans.parquet')
        self.df_offers = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/offer_metadata.parquet')
        self.df_train = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/train_data.parquet')
        self.df_test = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/test_data.parquet')
        self.df_submission = pd.read_csv('/Users/jatin/Desktop/AMEX Hackathon/submission_template.csv')
        
        # Ensure ID columns are consistent
        for df in [self.df_events, self.df_trans, self.df_offers, self.df_train, self.df_test]:
            if 'id2' in df.columns:
                df['id2'] = df['id2'].astype(str)
        
        print("Data loaded successfully!")
        print(f"Events shape: {self.df_events.shape}")
        print(f"Transactions shape: {self.df_trans.shape}")
        print(f"Offers shape: {self.df_offers.shape}")
        print(f"Train shape: {self.df_train.shape}")
        print(f"Test shape: {self.df_test.shape}")
        
    def create_event_features(self, df):
        """Create features from events data"""
        if hasattr(self, 'df_events') and not self.df_events.empty:
            # Convert timestamp columns to datetime safely
            try:
                self.df_events['id4'] = pd.to_datetime(self.df_events['id4'], errors='coerce')
                self.df_events['id7'] = pd.to_datetime(self.df_events['id7'], errors='coerce')
                self.df_events['timestamp_diff'] = (self.df_events['id7'] - self.df_events['id4']).dt.total_seconds()
            except Exception as e:
                print(f"Error processing timestamps: {e}")
                self.df_events['timestamp_diff'] = 0
            
            # Basic event aggregations
            agg_dict = {
                'id3': ['count', 'nunique'],
                'timestamp_diff': ['min', 'max', 'mean', 'std'],
                'id6': 'nunique'
            }
            
            valid_columns = {k: v for k, v in agg_dict.items() if k in self.df_events.columns}
            events_agg = self.df_events.groupby('id2').agg(valid_columns).reset_index()
            
            # Flatten column names
            events_agg.columns = ['id2'] + [f'event_{col[0]}_{col[1]}' for col in events_agg.columns[1:]]
            
            # Calculate event-based ratios safely
            if 'event_id3_count' in events_agg.columns:
                events_agg['event_click_rate'] = events_agg['event_id3_count'] / (events_agg['event_id3_count'] + 1e-6)
            
            # Merge with main dataframe
            if 'id2' in df.columns:
                df = df.merge(events_agg, on='id2', how='left')
            
            # Customer-offer specific features
            if all(col in self.df_events.columns for col in ['id2', 'id3', 'timestamp_diff', 'id6']):
                customer_offer_agg = {
                    'timestamp_diff': ['count', 'mean'],
                    'id6': 'nunique'
                }
                
                customer_offer_events = self.df_events.groupby(['id2', 'id3']).agg(customer_offer_agg).reset_index()
                customer_offer_events.columns = ['id2', 'id3'] + [f'co_event_{col[0]}_{col[1]}' for col in customer_offer_events.columns[2:]]
                
                if 'co_event_timestamp_diff_count' in customer_offer_events.columns:
                    customer_offer_events['co_event_specific_ctr'] = (
                        customer_offer_events['co_event_timestamp_diff_count'] / 
                        (customer_offer_events['co_event_timestamp_diff_count'] + 1e-6)
                    )
                
                if all(col in df.columns for col in ['id2', 'id3']):
                    df = df.merge(customer_offer_events, on=['id2', 'id3'], how='left')
                    
        return df
    
    def create_transaction_features(self, df):
        """Create features from transaction data"""
        if hasattr(self, 'df_trans') and not self.df_trans.empty:
            # Convert numeric columns to numeric type
            numeric_cols = ['f367', 'f368', 'f369', 'f370', 'f371', 'f372', 'f374']
            for col in numeric_cols:
                if col in self.df_trans.columns:
                    self.df_trans[col] = pd.to_numeric(self.df_trans[col], errors='coerce')
            
            # Handle non-numeric columns safely
            if 'id8' in self.df_trans.columns:
                self.df_trans['id8'] = self.df_trans['id8'].astype(str)
            
            # Aggregate transaction features by customer
            agg_dict = {
                'f367': ['sum', 'mean', 'count', 'std', 'min', 'max'],
                'f368': ['nunique', 'count'],
                'f369': ['sum', 'mean'],
                'f370': ['min', 'max', 'nunique'],
                'f371': ['mean', 'std'],
                'f372': ['nunique'],
                'f374': ['nunique'],
                'id8': ['nunique']
            }
            
            # Only include columns that exist in the dataframe
            valid_columns = {k: v for k, v in agg_dict.items() if k in self.df_trans.columns}
            trans_agg = self.df_trans.groupby('id2').agg(valid_columns).reset_index()
            
            # Flatten column names
            trans_agg.columns = ['id2'] + [f'trans_{col[0]}_{col[1]}' for col in trans_agg.columns[1:]]
            
            # Create advanced transaction features safely
            if 'trans_f367_sum' in trans_agg.columns and 'trans_f367_count' in trans_agg.columns:
                trans_agg['trans_avg_amount'] = trans_agg['trans_f367_sum'] / (trans_agg['trans_f367_count'] + 1e-6)
            
            if 'trans_f367_std' in trans_agg.columns and 'trans_f367_mean' in trans_agg.columns:
                trans_agg['trans_amount_cv'] = trans_agg['trans_f367_std'] / (trans_agg['trans_f367_mean'] + 1e-6)
            
            if 'trans_f367_max' in trans_agg.columns and 'trans_f367_mean' in trans_agg.columns:
                trans_agg['trans_high_value_ratio'] = (trans_agg['trans_f367_max'] > trans_agg['trans_f367_mean'] * 2).astype(int)
            
            if 'trans_f368_nunique' in trans_agg.columns and 'trans_f368_count' in trans_agg.columns:
                trans_agg['trans_product_diversity'] = trans_agg['trans_f368_nunique'] / (trans_agg['trans_f368_count'] + 1e-6)
            
            if 'trans_amount_cv' in trans_agg.columns:
                trans_agg['trans_spending_consistency'] = 1 / (trans_agg['trans_amount_cv'] + 1e-6)
            
            # Merge with main dataframe
            if 'id2' in df.columns:
                df = df.merge(trans_agg, on='id2', how='left')
                
        return df
    
    def create_offer_features(self, df):
        """Create features from offer metadata"""
        if hasattr(self, 'df_offers') and not self.df_offers.empty:
            # Ensure numeric columns are numeric
            numeric_cols = ['f375', 'f376', 'f377', 'f378', 'f374']
            for col in numeric_cols:
                if col in self.df_offers.columns:
                    self.df_offers[col] = pd.to_numeric(self.df_offers[col], errors='coerce')
            
            # Define aggregations
            agg_dict = {
                'f375': ['mean', 'std'],
                'f376': ['mean', 'std'],
                'f377': ['mean', 'std'],
                'id10': ['nunique'],
                'id11': ['nunique'],
                'f378': ['mean', 'std'],
                'f374': ['nunique'],
                'id12': ['min', 'max', 'mean'],
                'id13': ['min', 'max', 'mean']
            }
            
            # Only include columns that exist
            valid_columns = {k: v for k, v in agg_dict.items() if k in self.df_offers.columns}
            offer_agg = self.df_offers.groupby('id3').agg(valid_columns).reset_index()
            
            # Flatten column names
            offer_agg.columns = ['id3'] + [f'offer_{col[0]}_{col[1]}' for col in offer_agg.columns[1:]]
            
            # Create offer-specific features safely
            if 'offer_id13_mean' in offer_agg.columns and 'offer_id12_mean' in offer_agg.columns:
                offer_agg['offer_duration'] = offer_agg['offer_id13_mean'] - offer_agg['offer_id12_mean']
            
            if 'offer_f376_mean' in offer_agg.columns and 'offer_f375_mean' in offer_agg.columns:
                offer_agg['offer_attractiveness'] = offer_agg['offer_f376_mean'] * offer_agg['offer_f375_mean']
            
            if 'offer_f378_std' in offer_agg.columns and 'offer_f378_mean' in offer_agg.columns:
                offer_agg['offer_complexity'] = offer_agg['offer_f378_std'] / (offer_agg['offer_f378_mean'] + 1e-6)
            
            # Merge with main dataframe
            if 'id3' in df.columns:
                df = df.merge(offer_agg, on='id3', how='left')
                
        return df
    
    def create_advanced_features(self, df):
        """Create advanced engineered features"""
        # Make a copy to avoid SettingWithCopyWarning
        df = df.copy()
        
        # Time-based features
        if 'id5' in df.columns:
            df['day_of_week'] = df['id5'] % 7
            df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        
        if 'id4' in df.columns:
            df['hour_of_day'] = (df['id4'] % 86400) // 3600
            df['is_business_hours'] = ((df['hour_of_day'] >= 9) & (df['hour_of_day'] <= 17)).astype(int)
        
        # Customer engagement features
        engagement_features = [f for f in ['f28', 'f29', 'f30', 'f31', 'f147', 'f148', 'f149', 'f150'] 
                             if f in df.columns]
        if engagement_features:
            df['total_engagement'] = df[engagement_features].sum(axis=1)
            df['avg_engagement'] = df[engagement_features].mean(axis=1)
            df['engagement_consistency'] = df[engagement_features].std(axis=1)
        
        # CTR-based features
        ctr_features = [col for col in df.columns if 'ctr' in col.lower() or (col.startswith('f1') and col[1:].isdigit() and int(col[1:]) in range(4,10)])
        if ctr_features:
            df['avg_ctr'] = df[ctr_features].mean(axis=1)
            df['max_ctr'] = df[ctr_features].max(axis=1)
            df['ctr_consistency'] = df[ctr_features].std(axis=1)
        
        # Spending pattern features
        spending_features = [f for f in ['f39', 'f40', 'f41'] if f in df.columns]
        if spending_features:
            df['total_spending'] = df[spending_features].sum(axis=1)
            df['spending_diversity'] = (df[spending_features] > 0).sum(axis=1)
            if len(spending_features) > 0:
                df['dominant_spending_category'] = df[spending_features].idxmax(axis=1)
        
        # Interaction features with existence checks
        if all(f in df.columns for f in ['f363', 'f331']):
            df['f363_x_f331'] = df['f363'] * df['f331']
        
        if all(f in df.columns for f in ['f366', 'f329']):
            df['f366_x_f329'] = df['f366'] * df['f329']
        
        # Similar checks for all other interaction features...
        
        # PCA features
        low_imp_features = [col for col in df.columns 
                           if col.startswith('f') and col[1:].isdigit() and int(col[1:]) > 300]
        if len(low_imp_features) > 1:
            try:
                pca = PCA(n_components=min(5, len(low_imp_features)))
                df_pca = pca.fit_transform(df[low_imp_features].fillna(0))
                for i in range(df_pca.shape[1]):
                    df[f'pca_component_{i}'] = df_pca[:, i]
            except Exception as e:
                print(f"PCA failed: {e}")
        
        # Clustering features
        cluster_features = [f for f in ['f363', 'f366', 'f150', 'f138'] if f in df.columns]
        if len(cluster_features) >= 2:
            try:
                n_clusters = min(5, len(df)//10)
                if n_clusters > 1:
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                    df['customer_cluster'] = kmeans.fit_predict(df[cluster_features].fillna(0))
            except Exception as e:
                print(f"Clustering failed: {e}")
        
        return df
    
    def basic_preprocessing(self, df):
        """Basic preprocessing steps"""
        # Convert ID columns to consistent types
        id_cols = ['id1', 'id2', 'id3', 'id4', 'id5']
        for col in id_cols:
            if col in df.columns:
                df[col] = df[col].astype(str)  # Convert all IDs to string type
        
        # Convert other columns to numeric
        for col in df.columns:
            if col not in id_cols:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        
        # Handle missing values
        df.fillna(0, inplace=True)
        
        # Handle infinite values
        df.replace([np.inf, -np.inf], 0, inplace=True)
        
        return df
        
    def feature_engineering(self, df):
        """Complete feature engineering pipeline"""
        print("Starting feature engineering...")
        
        # Basic preprocessing
        df = self.basic_preprocessing(df)
        
        # Create features from additional datasets
        df = self.create_event_features(df)
        df = self.create_transaction_features(df)
        df = self.create_offer_features(df)
        
        # Create advanced features
        df = self.create_advanced_features(df)
        
        # Final preprocessing
        df = self.basic_preprocessing(df)
        
        print(f"Feature engineering complete. Shape: {df.shape}")
        return df
    
    def prepare_data(self):
        """Prepare training and test data"""
        print("Preparing data...")
        
        # Apply feature engineering
        self.df_train = self.feature_engineering(self.df_train.copy())
        self.df_test = self.feature_engineering(self.df_test.copy())
        
        # Extract target and IDs
        self.y_train = self.df_train['y'].astype(int)
        self.ids_train = self.df_train[['id1', 'id2', 'id3', 'id4', 'id5']].copy()
        self.ids_test = self.df_test[['id1', 'id2', 'id3', 'id4', 'id5']].copy()
        
        # Drop IDs and target
        id_cols = ['id1', 'id2', 'id3', 'id4', 'id5', 'y']
        self.df_train = self.df_train.drop(columns=[col for col in id_cols if col in self.df_train.columns])
        self.df_test = self.df_test.drop(columns=[col for col in id_cols if col in self.df_test.columns])
        
        # Ensure same columns in train and test
        common_cols = list(set(self.df_train.columns) & set(self.df_test.columns))
        self.df_train = self.df_train[common_cols]
        self.df_test = self.df_test[common_cols]
        
        # Remove low variance features
        numeric_cols = self.df_train.select_dtypes(include=[np.number]).columns
        low_var_cols = []
        for col in numeric_cols:
            if self.df_train[col].std() < 1e-6:
                low_var_cols.append(col)
        
        if low_var_cols:
            self.df_train = self.df_train.drop(columns=low_var_cols)
            self.df_test = self.df_test.drop(columns=low_var_cols)
            print(f"Removed {len(low_var_cols)} low variance features")
        
        self.feature_columns = self.df_train.columns.tolist()
        print(f"Final feature count: {len(self.feature_columns)}")
        
    def train_model(self):
        """Train the XGBoost model with cross-validation"""
        print("Training model...")
        
        # Split for validation
        X_train, X_val, y_train, y_val = train_test_split(
            self.df_train, self.y_train, test_size=0.2, random_state=42, stratify=self.y_train
        )
        
        # XGBoost parameters optimized for ranking
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'learning_rate': 0.05,
            'max_depth': 8,
            'min_child_weight': 1,
            'gamma': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Prepare DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        
        # Train model
        self.model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dtrain, 'train'), (dval, 'val')],
            early_stopping_rounds=100,
            verbose_eval=100
        )
        
        # Validate
        y_pred = self.model.predict(dval)
        auc_score = roc_auc_score(y_val, y_pred)
        print(f"Validation AUC: {auc_score:.4f}")
        
        return auc_score
    
    def feature_selection(self, top_k=500):
        """Feature selection using SHAP"""
        print("Performing feature selection...")
        
        # Sample for SHAP calculation
        sample_size = min(1000, len(self.df_train))
        sample_idx = np.random.choice(len(self.df_train), sample_size, replace=False)
        X_sample = self.df_train.iloc[sample_idx]
        
        # Calculate SHAP values
        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(X_sample)
        
        # Get feature importance
        feature_importance = np.abs(shap_values).mean(axis=0)
        feature_scores = pd.Series(feature_importance, index=X_sample.columns)
        
        # Select top features
        top_features = feature_scores.nlargest(top_k).index.tolist()
        
        print(f"Selected {len(top_features)} features out of {len(self.feature_columns)}")
        
        # Update datasets
        self.df_train = self.df_train[top_features]
        self.df_test = self.df_test[top_features]
        self.feature_columns = top_features
        
        return top_features
    
    def train_final_model(self):
        """Train final model on selected features"""
        print("Training final model...")
        
        # Parameters for final model
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'learning_rate': 0.03,
            'max_depth': 10,
            'min_child_weight': 1,
            'gamma': 0.1,
            'subsample': 0.9,
            'colsample_bytree': 0.9,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Cross-validation
        cv_scores = []
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        for fold, (train_idx, val_idx) in enumerate(kfold.split(self.df_train, self.y_train)):
            X_train_fold = self.df_train.iloc[train_idx]
            X_val_fold = self.df_train.iloc[val_idx]
            y_train_fold = self.y_train.iloc[train_idx]
            y_val_fold = self.y_train.iloc[val_idx]
            
            dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
            dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
            
            model_fold = xgb.train(
                params,
                dtrain,
                num_boost_round=3000,
                evals=[(dtrain, 'train'), (dval, 'val')],
                early_stopping_rounds=150,
                verbose_eval=False
            )
            
            y_pred = model_fold.predict(dval)
            auc_score = roc_auc_score(y_val_fold, y_pred)
            cv_scores.append(auc_score)
            print(f"Fold {fold+1} AUC: {auc_score:.4f}")
        
        print(f"CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores)*2:.4f})")
        
        # Train final model on full data
        dtrain_full = xgb.DMatrix(self.df_train, label=self.y_train)
        
        self.model = xgb.train(
            params,
            dtrain_full,
            num_boost_round=int(np.mean([model.best_iteration for model in [model_fold]]) * 1.1),
            verbose_eval=100
        )
        
        return np.mean(cv_scores)
    
    def predict(self):
        """Make predictions"""
        print("Making predictions...")
        
        dtest = xgb.DMatrix(self.df_test)
        predictions = self.model.predict(dtest)
        
        # Create submission
        submission = pd.DataFrame({
            'id1': self.ids_test['id1'],
            'id2': self.ids_test['id2'],
            'id3': self.ids_test['id3'],
            'id5': self.ids_test['id5'],
            'pred': predictions
        })
        
        return submission
    
    def create_submission(self, predictions_df):
        """Create final submission file"""
        print("Creating submission...")
        
        # Map predictions to submission template
        pred_map = dict(zip(predictions_df['id1'], predictions_df['pred']))
        
        # Load submission template
        submission = self.df_submission.copy()
        submission['y'] = submission['id1'].map(pred_map).fillna(0.5).clip(0, 1)
        
        # Save submission
        submission.to_csv('final_submission.csv', index=False)
        print(f"Submission saved with {len(submission)} rows")
        
        return submission
    
    def run_pipeline(self):
        """Run complete pipeline"""
        print("Starting complete pipeline...")
        
        # Load data
        self.load_data()
        
        # Prepare data
        self.prepare_data()
        
        # Train initial model
        initial_auc = self.train_model()
        
        # Feature selection
        selected_features = self.feature_selection(top_k=400)
        
        # Train final model
        final_auc = self.train_final_model()
        
        # Make predictions
        predictions = self.predict()
        
        # Create submission
        submission = self.create_submission(predictions)
        
        print(f"Pipeline complete!")
        print(f"Initial AUC: {initial_auc:.4f}")
        print(f"Final AUC: {final_auc:.4f}")
        print(f"Submission shape: {submission.shape}")
        
        return submission

# Run the pipeline
if __name__ == "__main__":
    model = ClickPredictionModel()
    submission = model.run_pipeline()
    
    # Display prediction distribution
    plt.figure(figsize=(10, 6))
    plt.hist(submission['y'], bins=50, alpha=0.7, edgecolor='black')
    plt.title('Distribution of Predicted Click Probabilities')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    print(f"Prediction statistics:")
    print(f"Mean: {submission['y'].mean():.4f}")
    print(f"Std: {submission['y'].std():.4f}")
    print(f"Min: {submission['y'].min():.4f}")
    print(f"Max: {submission['y'].max():.4f}")

SyntaxError: closing parenthesis ']' does not match opening parenthesis '(' (3972566633.py, line 220)

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import joblib
import shap
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

class ClickPredictionModel:
    def __init__(self):
        self.model = None
        self.feature_columns = None
        self.label_encoders = {}
        self.scaler = StandardScaler()
        
    def load_data(self):
        """Load all datasets"""
        # Load main datasets
        self.df_events = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/add_event.parquet')
        self.df_trans = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/add_trans.parquet')
        self.df_offers = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/offer_metadata.parquet')
        self.df_train = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/train_data.parquet')
        self.df_test = pd.read_parquet('/Users/jatin/Desktop/AMEX Hackathon/test_data.parquet')
        self.df_submission = pd.read_csv('/Users/jatin/Desktop/AMEX Hackathon/submission_template.csv')
        
        # Ensure ID columns are consistent
        for df in [self.df_events, self.df_trans, self.df_offers, self.df_train, self.df_test]:
            if 'id2' in df.columns:
                df['id2'] = df['id2'].astype(str)
        
        print("Data loaded successfully!")
        print(f"Events shape: {self.df_events.shape}")
        print(f"Transactions shape: {self.df_trans.shape}")
        print(f"Offers shape: {self.df_offers.shape}")
        print(f"Train shape: {self.df_train.shape}")
        print(f"Test shape: {self.df_test.shape}")
        
    def create_event_features(self, df):
        """Create features from events data"""
        if hasattr(self, 'df_events') and not self.df_events.empty:
            # Convert timestamp columns to datetime safely
            try:
                self.df_events['id4'] = pd.to_datetime(self.df_events['id4'], errors='coerce')
                self.df_events['id7'] = pd.to_datetime(self.df_events['id7'], errors='coerce')
                self.df_events['timestamp_diff'] = (self.df_events['id7'] - self.df_events['id4']).dt.total_seconds()
            except Exception as e:
                print(f"Error processing timestamps: {e}")
                self.df_events['timestamp_diff'] = 0
            
            # Basic event aggregations
            agg_dict = {
                'id3': ['count', 'nunique'],
                'timestamp_diff': ['min', 'max', 'mean', 'std'],
                'id6': 'nunique'
            }
            
            valid_columns = {k: v for k, v in agg_dict.items() if k in self.df_events.columns}
            events_agg = self.df_events.groupby('id2').agg(valid_columns).reset_index()
            
            # Flatten column names
            events_agg.columns = ['id2'] + [f'event_{col[0]}_{col[1]}' for col in events_agg.columns[1:]]
            
            # Calculate event-based ratios safely
            if 'event_id3_count' in events_agg.columns:
                events_agg['event_click_rate'] = events_agg['event_id3_count'] / (events_agg['event_id3_count'] + 1e-6)
            
            # Merge with main dataframe
            if 'id2' in df.columns:
                df = df.merge(events_agg, on='id2', how='left')
            
            # Customer-offer specific features
            if all(col in self.df_events.columns for col in ['id2', 'id3', 'timestamp_diff', 'id6']):
                customer_offer_agg = {
                    'timestamp_diff': ['count', 'mean'],
                    'id6': 'nunique'
                }
                
                customer_offer_events = self.df_events.groupby(['id2', 'id3']).agg(customer_offer_agg).reset_index()
                customer_offer_events.columns = ['id2', 'id3'] + [f'co_event_{col[0]}_{col[1]}' for col in customer_offer_events.columns[2:]]
                
                if 'co_event_timestamp_diff_count' in customer_offer_events.columns:
                    customer_offer_events['co_event_specific_ctr'] = (
                        customer_offer_events['co_event_timestamp_diff_count'] / 
                        (customer_offer_events['co_event_timestamp_diff_count'] + 1e-6)
                    )
                
                if all(col in df.columns for col in ['id2', 'id3']):
                    df = df.merge(customer_offer_events, on=['id2', 'id3'], how='left')
                    
        return df
    
    def create_transaction_features(self, df):
        """Create features from transaction data"""
        if hasattr(self, 'df_trans') and not self.df_trans.empty:
            # Convert numeric columns to numeric type
            numeric_cols = ['f367', 'f368', 'f369', 'f370', 'f371', 'f372', 'f374']
            for col in numeric_cols:
                if col in self.df_trans.columns:
                    self.df_trans[col] = pd.to_numeric(self.df_trans[col], errors='coerce')
            
            # Handle non-numeric columns safely
            if 'id8' in self.df_trans.columns:
                self.df_trans['id8'] = self.df_trans['id8'].astype(str)
            
            # Aggregate transaction features by customer
            agg_dict = {
                'f367': ['sum', 'mean', 'count', 'std', 'min', 'max'],
                'f368': ['nunique', 'count'],
                'f369': ['sum', 'mean'],
                'f370': ['min', 'max', 'nunique'],
                'f371': ['mean', 'std'],
                'f372': ['nunique'],
                'f374': ['nunique'],
                'id8': ['nunique']
            }
            
            # Only include columns that exist in the dataframe
            valid_columns = {k: v for k, v in agg_dict.items() if k in self.df_trans.columns}
            trans_agg = self.df_trans.groupby('id2').agg(valid_columns).reset_index()
            
            # Flatten column names
            trans_agg.columns = ['id2'] + [f'trans_{col[0]}_{col[1]}' for col in trans_agg.columns[1:]]
            
            # Create advanced transaction features safely
            if 'trans_f367_sum' in trans_agg.columns and 'trans_f367_count' in trans_agg.columns:
                trans_agg['trans_avg_amount'] = trans_agg['trans_f367_sum'] / (trans_agg['trans_f367_count'] + 1e-6)
            
            if 'trans_f367_std' in trans_agg.columns and 'trans_f367_mean' in trans_agg.columns:
                trans_agg['trans_amount_cv'] = trans_agg['trans_f367_std'] / (trans_agg['trans_f367_mean'] + 1e-6)
            
            if 'trans_f367_max' in trans_agg.columns and 'trans_f367_mean' in trans_agg.columns:
                trans_agg['trans_high_value_ratio'] = (trans_agg['trans_f367_max'] > trans_agg['trans_f367_mean'] * 2).astype(int)
            
            if 'trans_f368_nunique' in trans_agg.columns and 'trans_f368_count' in trans_agg.columns:
                trans_agg['trans_product_diversity'] = trans_agg['trans_f368_nunique'] / (trans_agg['trans_f368_count'] + 1e-6)
            
            if 'trans_amount_cv' in trans_agg.columns:
                trans_agg['trans_spending_consistency'] = 1 / (trans_agg['trans_amount_cv'] + 1e-6)
            
            # Merge with main dataframe
            if 'id2' in df.columns:
                df = df.merge(trans_agg, on='id2', how='left')
                
        return df
    
    def create_offer_features(self, df):
        """Create features from offer metadata"""
        if hasattr(self, 'df_offers') and not self.df_offers.empty:
            # Ensure numeric columns are numeric
            numeric_cols = ['f375', 'f376', 'f377', 'f378', 'f374']
            for col in numeric_cols:
                if col in self.df_offers.columns:
                    self.df_offers[col] = pd.to_numeric(self.df_offers[col], errors='coerce')
            
            # Convert datetime columns to datetime objects first
            datetime_cols = ['id12', 'id13']  # Add any other datetime columns here
            for col in datetime_cols:
                if col in self.df_offers.columns:
                    self.df_offers[col] = pd.to_datetime(self.df_offers[col], errors='coerce')
            
            # Define aggregations - separate numeric and datetime aggregations
            numeric_agg_dict = {
                'f375': ['mean', 'std'],
                'f376': ['mean', 'std'],
                'f377': ['mean', 'std'],
                'id10': ['nunique'],
                'id11': ['nunique'],
                'f378': ['mean', 'std'],
                'f374': ['nunique']
            }
            
            datetime_agg_dict = {
                'id12': ['min', 'max'],
                'id13': ['min', 'max']
            }
            
            # Only include columns that exist
            valid_numeric_cols = {k: v for k, v in numeric_agg_dict.items() 
                                if k in self.df_offers.columns}
            valid_datetime_cols = {k: v for k, v in datetime_agg_dict.items() 
                                 if k in self.df_offers.columns}
            
            # Perform aggregations separately
            numeric_agg = self.df_offers.groupby('id3').agg(valid_numeric_cols).reset_index()
            datetime_agg = self.df_offers.groupby('id3').agg(valid_datetime_cols).reset_index()
            
            # Calculate datetime differences if we have both min and max
            if all(f'id13_{agg}' in datetime_agg.columns for agg in ['min', 'max']):
                datetime_agg['offer_duration'] = (
                    datetime_agg['id13_max'] - datetime_agg['id12_min']
                ).dt.total_seconds()
            
            # Merge the aggregations
            offer_agg = numeric_agg.merge(datetime_agg, on='id3', how='left')
            
            # Flatten column names
            offer_agg.columns = ['id3'] + [f'offer_{col[0]}_{col[1]}' 
                                         for col in offer_agg.columns[1:]]
            
            # Create offer-specific features safely
            if 'offer_f376_mean' in offer_agg.columns and 'offer_f375_mean' in offer_agg.columns:
                offer_agg['offer_attractiveness'] = (
                    offer_agg['offer_f376_mean'] * offer_agg['offer_f375_mean']
                )
            
            if 'offer_f378_std' in offer_agg.columns and 'offer_f378_mean' in offer_agg.columns:
                offer_agg['offer_complexity'] = (
                    offer_agg['offer_f378_std'] / (offer_agg['offer_f378_mean'] + 1e-6))
            
            # Merge with main dataframe
            if 'id3' in df.columns:
                df = df.merge(offer_agg, on='id3', how='left')
                
        return df
    
    def create_advanced_features(self, df):
        """Create advanced engineered features"""
        # Make a copy to avoid SettingWithCopyWarning
        df = df.copy()
        
        # Time-based features
        if 'id5' in df.columns:
            df['day_of_week'] = df['id5'] % 7
            df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        
        if 'id4' in df.columns:
            df['hour_of_day'] = (df['id4'] % 86400) // 3600
            df['is_business_hours'] = ((df['hour_of_day'] >= 9) & (df['hour_of_day'] <= 17)).astype(int)
        
        # Customer engagement features
        engagement_features = [f for f in ['f28', 'f29', 'f30', 'f31', 'f147', 'f148', 'f149', 'f150'] 
                             if f in df.columns]
        if engagement_features:
            df['total_engagement'] = df[engagement_features].sum(axis=1)
            df['avg_engagement'] = df[engagement_features].mean(axis=1)
            df['engagement_consistency'] = df[engagement_features].std(axis=1)
        
        # CTR-based features - Fixed syntax error here
        ctr_features = [col for col in df.columns 
                       if 'ctr' in col.lower() or 
                       (col.startswith('f1') and col[1:].isdigit() and int(col[1:]) in range(4, 10))]
        if ctr_features:
            df['avg_ctr'] = df[ctr_features].mean(axis=1)
            df['max_ctr'] = df[ctr_features].max(axis=1)
            df['ctr_consistency'] = df[ctr_features].std(axis=1)
        
        # Spending pattern features
        spending_features = [f for f in ['f39', 'f40', 'f41'] if f in df.columns]
        if spending_features:
            df['total_spending'] = df[spending_features].sum(axis=1)
            df['spending_diversity'] = (df[spending_features] > 0).sum(axis=1)
            if len(spending_features) > 0:
                df['dominant_spending_category'] = df[spending_features].idxmax(axis=1)
        
        # Interaction features with existence checks
        if all(f in df.columns for f in ['f363', 'f331']):
            df['f363_x_f331'] = df['f363'] * df['f331']
        
        if all(f in df.columns for f in ['f366', 'f329']):
            df['f366_x_f329'] = df['f366'] * df['f329']
        
        if all(f in df.columns for f in ['f150', 'f329']):
            df['f150_x_f329'] = df['f150'] * df['f329']
        
        if all(f in df.columns for f in ['f138', 'f22']):
            df['f138_x_f22'] = df['f138'] * df['f22']
        
        if all(f in df.columns for f in ['f132', 'f68']):
            df['f132_x_f68'] = df['f132'] * df['f68']
        
        if all(f in df.columns for f in ['f363', 'f366']):
            df['f363_x_f366'] = df['f363'] * df['f366']
        
        # Ratio features
        if all(f in df.columns for f in ['f363', 'f329']):
            df['f363_div_f329'] = df['f363'] / (df['f329'] + 1e-6)
        
        if all(f in df.columns for f in ['f366', 'f329']):
            df['f366_div_f329'] = df['f366'] / (df['f329'] + 1e-6)
        
        if all(f in df.columns for f in ['f214', 'f22']):
            df['f214_to_f22'] = df['f214'] / (df['f22'] + 1e-6)
        
        # Higher order features
        if 'f132' in df.columns:
            df['f132_sq'] = df['f132'] ** 2
        
        if 'f363' in df.columns:
            df['f363_sq'] = df['f363'] ** 2
        
        if 'f366' in df.columns:
            df['f366_sq'] = df['f366'] ** 2
        
        if 'f363' in df.columns:
            df['f363_log'] = np.log1p(df['f363'].clip(lower=0))
        
        if 'f366' in df.columns:
            df['f366_inv'] = 1.0 / (df['f366'] + 1e-6)
        
        if 'f351' in df.columns:
            df['f351_log'] = np.log1p(df['f351'])
        
        # Binning important features
        if 'f363' in df.columns:
            df['f363_bin'] = pd.cut(df['f363'], [-np.inf, 0.1, 0.25, np.inf], labels=[0, 1, 2]).astype(int)
        
        if 'f366' in df.columns:
            df['f366_bin'] = pd.cut(df['f366'], [-np.inf, 0.1, 0.3, np.inf], labels=[0, 1, 2]).astype(int)
        
        # Customer value segmentation
        value_features = [f for f in ['f43', 'f44', 'f47', 'f49'] if f in df.columns]
        if value_features:
            df['customer_value_score'] = df[value_features].sum(axis=1)
            if len(df) > 0:  # Check we have data to calculate quantile
                df['is_high_value_customer'] = (df['customer_value_score'] > df['customer_value_score'].quantile(0.8)).astype(int)
        
        # Offer-specific customer features
        if all(f in df.columns for f in ['f363', 'f366', 'f150']):
            df['customer_offer_match_score'] = df['f363'] * df['f366'] * df['f150']
        
        if 'f223' in df.columns:
            df['recency_score'] = np.exp(-df['f223'] / 30)
        
        # PCA features
        low_imp_features = [col for col in df.columns 
                          if col.startswith('f') and col[1:].isdigit() and int(col[1:]) > 300]
        if len(low_imp_features) > 1:
            try:
                pca = PCA(n_components=min(5, len(low_imp_features)))
                df_pca = pca.fit_transform(df[low_imp_features].fillna(0))
                for i in range(df_pca.shape[1]):
                    df[f'pca_component_{i}'] = df_pca[:, i]
            except Exception as e:
                print(f"PCA failed: {e}")
        
        # Clustering features
        cluster_features = [f for f in ['f363', 'f366', 'f150', 'f138'] if f in df.columns]
        if len(cluster_features) >= 2:
            try:
                n_clusters = min(5, len(df)//10)
                if n_clusters > 1:
                    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
                    df['customer_cluster'] = kmeans.fit_predict(df[cluster_features].fillna(0))
            except Exception as e:
                print(f"Clustering failed: {e}")
        
        return df
    
    def basic_preprocessing(self, df):
        """Basic preprocessing steps"""
        # Convert ID columns to consistent types
        id_cols = ['id1', 'id2', 'id3', 'id4', 'id5']
        for col in id_cols:
            if col in df.columns:
                df[col] = df[col].astype(str)  # Convert all IDs to string type
        
        # Convert other columns to numeric
        for col in df.columns:
            if col not in id_cols:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        
        # Handle missing values
        df.fillna(0, inplace=True)
        
        # Handle infinite values
        df.replace([np.inf, -np.inf], 0, inplace=True)
        
        return df
        
    def feature_engineering(self, df):
        """Complete feature engineering pipeline"""
        print("Starting feature engineering...")
        
        # Basic preprocessing
        df = self.basic_preprocessing(df)
        
        # Create features from additional datasets
        df = self.create_event_features(df)
        df = self.create_transaction_features(df)
        df = self.create_offer_features(df)
        
        # Create advanced features
        df = self.create_advanced_features(df)
        
        # Final preprocessing
        df = self.basic_preprocessing(df)
        
        print(f"Feature engineering complete. Shape: {df.shape}")
        return df
    
    def prepare_data(self):
        """Prepare training and test data"""
        print("Preparing data...")
        
        # Apply feature engineering
        self.df_train = self.feature_engineering(self.df_train.copy())
        self.df_test = self.feature_engineering(self.df_test.copy())
        
        # Extract target and IDs
        self.y_train = self.df_train['y'].astype(int)
        self.ids_train = self.df_train[['id1', 'id2', 'id3', 'id4', 'id5']].copy()
        self.ids_test = self.df_test[['id1', 'id2', 'id3', 'id4', 'id5']].copy()
        
        # Drop IDs and target
        id_cols = ['id1', 'id2', 'id3', 'id4', 'id5', 'y']
        self.df_train = self.df_train.drop(columns=[col for col in id_cols if col in self.df_train.columns])
        self.df_test = self.df_test.drop(columns=[col for col in id_cols if col in self.df_test.columns])
        
        # Ensure same columns in train and test
        common_cols = list(set(self.df_train.columns) & set(self.df_test.columns))
        self.df_train = self.df_train[common_cols]
        self.df_test = self.df_test[common_cols]
        
        # Remove low variance features
        numeric_cols = self.df_train.select_dtypes(include=[np.number]).columns
        low_var_cols = []
        for col in numeric_cols:
            if self.df_train[col].std() < 1e-6:
                low_var_cols.append(col)
        
        if low_var_cols:
            self.df_train = self.df_train.drop(columns=low_var_cols)
            self.df_test = self.df_test.drop(columns=low_var_cols)
            print(f"Removed {len(low_var_cols)} low variance features")
        
        self.feature_columns = self.df_train.columns.tolist()
        print(f"Final feature count: {len(self.feature_columns)}")
        
    def train_model(self):
        """Train the XGBoost model with cross-validation"""
        print("Training model...")
        
        # Split for validation
        X_train, X_val, y_train, y_val = train_test_split(
            self.df_train, self.y_train, test_size=0.2, random_state=42, stratify=self.y_train
        )
        
        # XGBoost parameters optimized for ranking
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'learning_rate': 0.05,
            'max_depth': 8,
            'min_child_weight': 1,
            'gamma': 0.1,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Prepare DMatrix
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)
        
        # Train model
        self.model = xgb.train(
            params,
            dtrain,
            num_boost_round=2000,
            evals=[(dtrain, 'train'), (dval, 'val')],
            early_stopping_rounds=100,
            verbose_eval=100
        )
        
        # Validate
        y_pred = self.model.predict(dval)
        auc_score = roc_auc_score(y_val, y_pred)
        print(f"Validation AUC: {auc_score:.4f}")
        
        return auc_score
    
    def feature_selection(self, top_k=500):
        """Feature selection using SHAP"""
        print("Performing feature selection...")
        
        # Sample for SHAP calculation
        sample_size = min(1000, len(self.df_train))
        if sample_size == 0:
            return self.feature_columns[:top_k]
            
        sample_idx = np.random.choice(len(self.df_train), sample_size, replace=False)
        X_sample = self.df_train.iloc[sample_idx]
        
        try:
            # Calculate SHAP values
            explainer = shap.TreeExplainer(self.model)
            shap_values = explainer.shap_values(X_sample)
            
            # Get feature importance
            feature_importance = np.abs(shap_values).mean(axis=0)
            feature_scores = pd.Series(feature_importance, index=X_sample.columns)
            
            # Select top features
            top_features = feature_scores.nlargest(top_k).index.tolist()
            
            print(f"Selected {len(top_features)} features out of {len(self.feature_columns)}")
            
            # Update datasets
            self.df_train = self.df_train[top_features]
            self.df_test = self.df_test[top_features]
            self.feature_columns = top_features
            
            return top_features
        except Exception as e:
            print(f"SHAP failed, using all features: {e}")
            return self.feature_columns[:top_k]
    
    def train_final_model(self):
        """Train final model on selected features"""
        print("Training final model...")
        
        # Parameters for final model
        params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'learning_rate': 0.03,
            'max_depth': 10,
            'min_child_weight': 1,
            'gamma': 0.1,
            'subsample': 0.9,
            'colsample_bytree': 0.9,
            'reg_alpha': 0.1,
            'reg_lambda': 1.0,
            'random_state': 42,
            'n_jobs': -1
        }
        
        # Cross-validation
        cv_scores = []
        kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        for fold, (train_idx, val_idx) in enumerate(kfold.split(self.df_train, self.y_train)):
            X_train_fold = self.df_train.iloc[train_idx]
            X_val_fold = self.df_train.iloc[val_idx]
            y_train_fold = self.y_train.iloc[train_idx]
            y_val_fold = self.y_train.iloc[val_idx]
            
            dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
            dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
            
            model_fold = xgb.train(
                params,
                dtrain,
                num_boost_round=3000,
                evals=[(dtrain, 'train'), (dval, 'val')],
                early_stopping_rounds=150,
                verbose_eval=False
            )
            
            y_pred = model_fold.predict(dval)
            auc_score = roc_auc_score(y_val_fold, y_pred)
            cv_scores.append(auc_score)
            print(f"Fold {fold+1} AUC: {auc_score:.4f}")
        
        print(f"CV AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores)*2:.4f})")
        
        # Train final model on full data
        dtrain_full = xgb.DMatrix(self.df_train, label=self.y_train)
        
        self.model = xgb.train(
            params,
            dtrain_full,
            num_boost_round=int(np.mean([model.best_iteration for model in [model_fold]]) * 1.1),
            verbose_eval=100
        )
        
        return np.mean(cv_scores)
    
    def predict(self):
        """Make predictions"""
        print("Making predictions...")
        
        dtest = xgb.DMatrix(self.df_test)
        predictions = self.model.predict(dtest)
        
        # Create submission
        submission = pd.DataFrame({
            'id1': self.ids_test['id1'],
            'id2': self.ids_test['id2'],
            'id3': self.ids_test['id3'],
            'id5': self.ids_test['id5'],
            'pred': predictions
        })
        
        return submission
    
    def create_submission(self, predictions_df):
        """Create final submission file"""
        print("Creating submission...")
        
        # Map predictions to submission template
        pred_map = dict(zip(predictions_df['id1'], predictions_df['pred']))
        
        # Load submission template
        submission = self.df_submission.copy()
        submission['y'] = submission['id1'].map(pred_map).fillna(0.5).clip(0, 1)
        
        # Save submission
        submission.to_csv('final_submission.csv', index=False)
        print(f"Submission saved with {len(submission)} rows")
        
        return submission
    
    def run_pipeline(self):
        """Run complete pipeline"""
        print("Starting complete pipeline...")
        
        # Load data
        self.load_data()
        
        # Prepare data
        self.prepare_data()
        
        # Train initial model
        initial_auc = self.train_model()
        
        # Feature selection
        selected_features = self.feature_selection(top_k=400)
        
        # Train final model
        final_auc = self.train_final_model()
        
        # Make predictions
        predictions = self.predict()
        
        # Create submission
        submission = self.create_submission(predictions)
        
        print(f"Pipeline complete!")
        print(f"Initial AUC: {initial_auc:.4f}")
        print(f"Final AUC: {final_auc:.4f}")
        print(f"Submission shape: {submission.shape}")
        
        return submission

# Run the pipeline
if __name__ == "__main__":
    model = ClickPredictionModel()
    submission = model.run_pipeline()
    
    # Display prediction distribution
    plt.figure(figsize=(10, 6))
    plt.hist(submission['y'], bins=50, alpha=0.7, edgecolor='black')
    plt.title('Distribution of Predicted Click Probabilities')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Frequency')
    plt.grid(True, alpha=0.3)
    plt.show()
    
    print(f"Prediction statistics:")
    print(f"Mean: {submission['y'].mean():.4f}")
    print(f"Std: {submission['y'].std():.4f}")
    print(f"Min: {submission['y'].min():.4f}")
    print(f"Max: {submission['y'].max():.4f}")

Starting complete pipeline...
Data loaded successfully!
Events shape: (21457473, 5)
Transactions shape: (6339465, 9)
Offers shape: (4164, 12)
Train shape: (770164, 372)
Test shape: (369301, 371)
Preparing data...
Starting feature engineering...


ValueError: You are trying to merge on object and int32 columns for key 'id3'. If you wish to proceed you should use pd.concat