# Fourth Submission: Rework all the features + ensemble models

## Import the packages

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

print(f" Train shape: {train.shape}")
print(f" Test shape: {test.shape}")

 Train shape: (17499636, 19)
 Test shape: (4393179, 19)


## New features : more trends, etc

In [2]:
def build_churn_features(df, is_train=True, reference_date='2018-11-20'):

    reference_ts = pd.Timestamp(reference_date)

    if is_train:
        df_features = df[df['page'] != 'Cancellation Confirmation'].copy()
        print(f"  Removed {len(df) - len(df_features)} cancellation events (leakage prevention)")
    else:
        df_features = df.copy()

    df_features['ts'] = pd.to_datetime(df_features['ts'])
    df_features['date'] = df_features['ts'].dt.date
    df_features['hour'] = df_features['ts'].dt.hour
    df_features['dayofweek'] = df_features['ts'].dt.dayofweek
    df_features['is_weekend'] = df_features['dayofweek'].isin([5, 6]).astype(int)
    df_features['days_from_start'] = (df_features['ts'] - df_features['ts'].min()).dt.days

    df_features['quartile'] = pd.cut(df_features['days_from_start'], bins=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

    user_ids = df_features['userId'].unique()
    features = pd.DataFrame({'userId': user_ids})
    print(f"Processing {len(user_ids):,} users...")

    print(" Step 1/10: Basic counting features...")
    features = features.merge(df_features.groupby('userId').size().rename('total_events'), on='userId', how='left')
    total_days = (df_features['ts'].max() - df_features['ts'].min()).days + 1
    features['events_per_day'] = features['total_events'] / total_days
    features = features.merge(df_features.groupby('userId')['date'].nunique().rename('days_active'), on='userId', how='left')
    features['activity_rate'] = features['days_active'] / total_days

    print(" Step 2/10: Recency features...")
    last_activity = df_features.groupby('userId')['ts'].max()
    features = features.merge(last_activity.rename('last_ts'), on='userId', how='left')
    features['days_since_last_activity'] = (reference_ts - features['last_ts']).dt.days
    features['recency_score'] = 1 / (features['days_since_last_activity'] + 1)

    last_song = df_features[df_features['page'] == 'NextSong'].groupby('userId')['ts'].max()
    features = features.merge(last_song.rename('last_song_ts'), on='userId', how='left')
    features['days_since_last_song'] = (reference_ts - features['last_song_ts']).dt.days
    features['days_since_last_song'] = features['days_since_last_song'].fillna(999)

    print(" Step 3/10: Temporal trend features...")
    quartile_counts = df_features.groupby(['userId', 'quartile']).size().unstack(fill_value=0)
    quartile_counts.columns = [f'activity_{col}' for col in quartile_counts.columns]
    features = features.merge(quartile_counts, on='userId', how='left')

    for q in ['Q1', 'Q2', 'Q3', 'Q4']:
        if f'activity_{q}' not in features.columns:
            features[f'activity_{q}'] = 0

    features['decline_ratio'] = (features['activity_Q1'] - features['activity_Q4']) / (features['total_events'] + 1)
    features['early_late_ratio'] = features['activity_Q1'] / (features['activity_Q4'] + 1)
    features['q4_ratio'] = features['activity_Q4'] / (features['total_events'] + 1)

    print(" Step 4/10: Page-level features...")
    page_counts = df_features.groupby(['userId', 'page']).size().unstack(fill_value=0)
    page_totals = page_counts.sum(axis=1)
    page_ratios = page_counts.div(page_totals, axis=0)
    page_ratios.columns = [f'{col.lower().replace(" ", "_")}_ratio' for col in page_ratios.columns]
    features = features.merge(page_ratios, on='userId', how='left')
    features = features.fillna(0)

    print(" Step 5/10: Downgrade detection...")
    df_sorted = df_features.sort_values(['userId', 'ts'])
    df_sorted['prev_level'] = df_sorted.groupby('userId')['level'].shift(1)
    df_sorted['downgrade'] = ((df_sorted['prev_level'] == 'paid') & (df_sorted['level'] == 'free')).astype(int)
    has_downgraded = df_sorted.groupby('userId')['downgrade'].sum() > 0
    features = features.merge(has_downgraded.astype(int).rename('has_downgraded'), on='userId', how='left')
    features['has_downgraded'] = features['has_downgraded'].fillna(0)

    paid_events = df_features[df_features['level'] == 'paid'].groupby('userId').size()
    features = features.merge(paid_events.rename('paid_events'), on='userId', how='left')
    features['paid_events'] = features['paid_events'].fillna(0)
    features['paid_ratio'] = features['paid_events'] / (features['total_events'] + 1)

    print(" Step 6/10: Session features...")
    df_sorted = df_features.sort_values(['userId', 'ts'])
    df_sorted['time_diff_minutes'] = df_sorted.groupby('userId')['ts'].diff().dt.total_seconds() / 60
    df_sorted['new_session'] = (df_sorted['time_diff_minutes'] > 30) | df_sorted['time_diff_minutes'].isna()
    df_sorted['session_id'] = df_sorted.groupby('userId')['new_session'].cumsum()

    session_lengths = df_sorted.groupby(['userId', 'session_id']).size()
    session_stats = session_lengths.groupby('userId').agg(['count', 'mean', 'std', 'max'])
    session_stats.columns = ['num_sessions', 'avg_session_length', 'session_length_std', 'max_session_length']
    features = features.merge(session_stats, on='userId', how='left')
    features['sessions_per_day'] = features['num_sessions'] / total_days

    print(" Step 7/10: Engagement features...")
    positive_cols = ['thumbs_up_ratio', 'add_to_playlist_ratio', 'add_friend_ratio']
    existing_positive = [col for col in positive_cols if col in features.columns]
    features['positive_engagement'] = features[existing_positive].sum(axis=1) if existing_positive else 0

    negative_cols = ['thumbs_down_ratio', 'error_ratio']
    existing_negative = [col for col in negative_cols if col in features.columns]
    features['negative_engagement'] = features[existing_negative].sum(axis=1) if existing_negative else 0

    features['net_engagement'] = features['positive_engagement'] - features['negative_engagement']

    print(" Step 8/10: Time-of-day features...")
    weekend_events = df_features[df_features['is_weekend'] == 1].groupby('userId').size()
    features = features.merge(weekend_events.rename('weekend_events'), on='userId', how='left')
    features['weekend_events'] = features['weekend_events'].fillna(0)
    features['weekend_ratio'] = features['weekend_events'] / (features['total_events'] + 1)

    print(" Step 9/10: Rolling window features...")
    last_7_days = df_features[df_features['ts'] >= (reference_ts - pd.Timedelta(days=7))]
    last_7_counts = last_7_days.groupby('userId').size().rename('last_7_days_events')
    features = features.merge(last_7_counts, on='userId', how='left')
    features['last_7_days_events'] = features['last_7_days_events'].fillna(0)
    features['last_7_days_ratio'] = features['last_7_days_events'] / (features['total_events'] + 1)

    features['engagement_volatility'] = df_features.groupby('userId')['hour'].std()

    unique_pages = df_features.groupby('userId')['page'].nunique()
    features['page_diversity'] = unique_pages / (features['total_events'] + 1)

    first_activity = df_features.groupby('userId')['ts'].min()
    features['days_since_registration'] = (reference_ts - first_activity).dt.total_seconds() / 86400

    last_3_days = df_features[df_features['ts'] >= (reference_ts - pd.Timedelta(days=3))]
    prev_7_days = df_features[(df_features['ts'] >= (reference_ts - pd.Timedelta(days=10))) &
                            (df_features['ts'] < (reference_ts - pd.Timedelta(days=3)))]

    last_3_counts = last_3_days.groupby('userId').size() / 3
    prev_7_counts = prev_7_days.groupby('userId').size() / 7
    features['activity_acceleration'] = (last_3_counts - prev_7_counts) / (prev_7_counts + 1)

    recent_errors = df_features[(df_features['ts'] >= (reference_ts - pd.Timedelta(days=7))) &
                                (df_features['page'] == 'Error')]
    features['recent_error_ratio'] = recent_errors.groupby('userId').size() / (features['last_7_days_events'] + 1)



    print(" Step 10/10: RFM score...")
    features['recency_percentile'] = features['recency_score'].rank(pct=True)
    features['frequency_percentile'] = features['events_per_day'].rank(pct=True)
    features['monetary_percentile'] = features['paid_ratio'].rank(pct=True)
    features['rfm_score'] = (features['recency_percentile'] + features['frequency_percentile'] + features['monetary_percentile']) / 3

    features = features.drop(columns=['last_ts', 'last_song_ts'], errors='ignore')
    features = features.fillna(0)
    features = features.replace([np.inf, -np.inf], 0)

    print(f" Created {len(features.columns)-1} features for {len(features):,} users")

    return features

X_train_full = build_churn_features(train, is_train=True)
X_test_full = build_churn_features(test, is_train=False)


  Removed 4271 cancellation events (leakage prevention)
Processing 19,140 users...
 Step 1/10: Basic counting features...
 Step 2/10: Recency features...
 Step 3/10: Temporal trend features...
 Step 4/10: Page-level features...
 Step 5/10: Downgrade detection...
 Step 6/10: Session features...
 Step 7/10: Engagement features...
 Step 8/10: Time-of-day features...
 Step 9/10: Rolling window features...
 Step 10/10: RFM score...
 Created 56 features for 19,140 users
Processing 2,904 users...
 Step 1/10: Basic counting features...
 Step 2/10: Recency features...
 Step 3/10: Temporal trend features...
 Step 4/10: Page-level features...
 Step 5/10: Downgrade detection...
 Step 6/10: Session features...
 Step 7/10: Engagement features...
 Step 8/10: Time-of-day features...
 Step 9/10: Rolling window features...
 Step 10/10: RFM score...
 Created 58 features for 2,904 users


## Model stacking

In [3]:
churners = train[train['page'] == 'Cancellation Confirmation']['userId'].unique()
y_train = X_train_full['userId'].isin(churners).astype(int)

print(f" Churn rate: {y_train.mean():.2%}")
print(f"   Churners: {y_train.sum():,}")
print(f"   Non-churners: {(len(y_train) - y_train.sum()):,}")

user_ids_train = X_train_full['userId']
user_ids_test = X_test_full['userId']

X_train = X_train_full.drop('userId', axis=1)
X_test = X_test_full.drop('userId', axis=1)

common_cols = X_train.columns.intersection(X_test.columns)
X_train = X_train[common_cols]
X_test = X_test[common_cols]

print(f"\n Final shapes:")
print(f"   X_train: {X_train.shape}")
print(f"   X_test: {X_test.shape}")
print(f"   Features: {X_train.shape[1]}")


 Churn rate: 22.31%
   Churners: 4,271
   Non-churners: 14,869

 Final shapes:
   X_train: (19140, 55)
   X_test: (2904, 55)
   Features: 55


In [4]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import lightgbm as lgb


 #   X_train = X_train.drop('location', axis=1)
 #   X_test = X_test.drop('location', axis=1)

n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

print("\n Training XGBoost...")
xgb_params = {
    'max_depth': 6,
    'learning_rate': 0.01,
    'n_estimators': 500,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'scale_pos_weight': (len(y_train) - y_train.sum()) / y_train.sum(),
    'random_state': 42,
    'tree_method': 'hist'
}

xgb_oof = np.zeros(len(X_train))
xgb_test = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"   Fold {fold + 1}/{n_splits}...", end=' ')

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = xgb.XGBClassifier(**xgb_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    xgb_oof[val_idx] = model.predict_proba(X_val)[:, 1]
    xgb_test += model.predict_proba(X_test)[:, 1] / n_splits

    fold_auc = roc_auc_score(y_val, xgb_oof[val_idx])
    print(f"AUC: {fold_auc:.4f}")

xgb_auc = roc_auc_score(y_train, xgb_oof)
print(f" XGBoost OOF AUC: {xgb_auc:.4f}")

print("\n Training LightGBM...")
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.01,
    'num_leaves': 31,
    'max_depth': 6,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'scale_pos_weight': (len(y_train) - y_train.sum()) / y_train.sum(),
    'n_estimators': 500,
    'random_state': 42,
    'verbose': -1
}

lgb_oof = np.zeros(len(X_train))
lgb_test = np.zeros(len(X_test))

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
    print(f"   Fold {fold + 1}/{n_splits}...", end=' ')

    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    model = lgb.LGBMClassifier(**lgb_params)
    model.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])

    lgb_oof[val_idx] = model.predict_proba(X_val)[:, 1]
    lgb_test += model.predict_proba(X_test)[:, 1] / n_splits

    fold_auc = roc_auc_score(y_val, lgb_oof[val_idx])
    print(f"AUC: {fold_auc:.4f}")

lgb_auc = roc_auc_score(y_train, lgb_oof)
print(f" LightGBM OOF AUC: {lgb_auc:.4f}")


ensemble_oof = (xgb_oof + lgb_oof) / 2
ensemble_test = (xgb_test + lgb_test) / 2

ensemble_auc = roc_auc_score(y_train, ensemble_oof)
print(f" Ensemble OOF AUC: {ensemble_auc:.4f}")



 Training XGBoost...
   Fold 1/5... AUC: 0.7387
   Fold 2/5... AUC: 0.7458
   Fold 3/5... AUC: 0.7532
   Fold 4/5... AUC: 0.7573
   Fold 5/5... AUC: 0.7657
 XGBoost OOF AUC: 0.7519

 Training LightGBM...
   Fold 1/5... Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[454]	valid_0's auc: 0.738398
AUC: 0.7384
   Fold 2/5... Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[500]	valid_0's auc: 0.748194
AUC: 0.7482
   Fold 3/5... Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[493]	valid_0's auc: 0.754066
AUC: 0.7541
   Fold 4/5... Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[491]	valid_0's auc: 0.760721
AUC: 0.7607
   Fold 5/5... Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[463]	v

In [5]:
threshold = 0.5
binary_predictions = (ensemble_test >= threshold).astype(int)

submission = pd.DataFrame({
    'id': user_ids_test,
    'prediction': binary_predictions
})

submission.to_csv('churn_submission_4.csv', index=False)

predicted_churn_pct = binary_predictions.mean() * 100

print(f"   Shape: {submission.shape}")
print(f"   Predicted churn rate: {predicted_churn_pct:.2f}%")
print(f"   Predicted churners: {binary_predictions.sum():,}")
print(f"   Predicted non-churners: {(len(binary_predictions) - binary_predictions.sum()):,}")

print(f"   XGBoost AUC: {xgb_auc:.4f}")
print(f"   LightGBM AUC: {lgb_auc:.4f}")
print(f"   Ensemble AUC: {ensemble_auc:.4f}")
print(f"   Training churn rate: {y_train.mean()*100:.2f}%")

print(submission.head(10))


   Shape: (2904, 2)
   Predicted churn rate: 34.33%
   Predicted churners: 997
   Predicted non-churners: 1,907
   XGBoost AUC: 0.7519
   LightGBM AUC: 0.7532
   Ensemble AUC: 0.7532
   Training churn rate: 22.31%
        id  prediction
0  1465194           1
1  1261737           0
2  1527155           0
3  1507202           1
4  1429412           0
5  1778785           1
6  1776591           1
7  1937373           1
8  1959334           1
9  1138878           1


In [6]:

threshold = 0.3
binary_predictions = (ensemble_test >= threshold).astype(int)

submission = pd.DataFrame({
    'id': user_ids_test,
    'prediction': binary_predictions
})

submission.to_csv('churn_submission_4_0.3.csv', index=False)

predicted_churn_pct = binary_predictions.mean() * 100

print(f"   Shape: {submission.shape}")
print(f"   Predicted churn rate: {predicted_churn_pct:.2f}%")
print(f"   Predicted churners: {binary_predictions.sum():,}")
print(f"   Predicted non-churners: {(len(binary_predictions) - binary_predictions.sum()):,}")


print(f"   XGBoost AUC: {xgb_auc:.4f}")
print(f"   LightGBM AUC: {lgb_auc:.4f}")
print(f"   Ensemble AUC: {ensemble_auc:.4f}")
print(f"   Training churn rate: {y_train.mean()*100:.2f}%")

print(submission.head(10))


   Shape: (2904, 2)
   Predicted churn rate: 71.45%
   Predicted churners: 2,075
   Predicted non-churners: 829
   XGBoost AUC: 0.7519
   LightGBM AUC: 0.7532
   Ensemble AUC: 0.7532
   Training churn rate: 22.31%
        id  prediction
0  1465194           1
1  1261737           1
2  1527155           1
3  1507202           1
4  1429412           1
5  1778785           1
6  1776591           1
7  1937373           1
8  1959334           1
9  1138878           1


# Score Kaggle : 0.628