# Third Submission: Stacking Ensemble

## Import the packages

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from xgboost import XGBClassifier
import lightgbm as lgb


plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

In [41]:
train = pd.read_parquet('train.parquet', engine="fastparquet")
test = pd.read_parquet('test.parquet', engine="fastparquet")

print(f"\n Train shape: {train.shape}")
print(f" Test shape: {test.shape}")


 Train shape: (17499636, 19)
 Test shape: (4393179, 19)


In [42]:
user_churn = train.groupby('userId')['page'].apply(
    lambda x: 1 if 'Cancellation Confirmation' in x.values else 0
).reset_index()
user_churn.columns = ['userId', 'is_churner']

print(user_churn['is_churner'].value_counts())
print(f"Taux de churn: {user_churn['is_churner'].mean():.2%}")

is_churner
0    14869
1     4271
Name: count, dtype: int64
Taux de churn: 22.31%


## Features engineering

In [43]:
train['time'] = pd.to_datetime(train['ts'], unit='ms')
test['time'] = pd.to_datetime(test['ts'], unit='ms')

columns_to_drop = [
    'firstName', 'lastName', 'status',
    'ts', 
    'auth',
    'method',
    'song',
    'location',
    'userAgent' 
]

train_clean = train.drop(columns=columns_to_drop)
test_clean = test.drop(columns=columns_to_drop)

print(f"Train shape après nettoyage: {train_clean.shape}")
print(f"Test shape après nettoyage: {test_clean.shape}")

def create_unbiased_features(df, user_churn_df=None, is_train=True):
    features = []
    
    for user_id, user_df in df.groupby('userId'):
        user_features = {'userId': user_id}
        
        user_features['gender'] = user_df['gender'].iloc[0]
        total_events = len(user_df)
        user_features['total_events'] = total_events
        user_features['num_sessions'] = user_df['sessionId'].nunique()
        
        time_min = user_df['time'].min()
        registration_time = user_df['registration'].iloc[0]
        user_features['days_since_registration'] = (time_min - registration_time).days
        
        active_days = user_df['time'].dt.date.nunique()
        user_features['active_days'] = active_days
        
        user_features['paid_ratio'] = (user_df['level'] == 'paid').mean()
        
        page_counts = user_df['page'].value_counts()
        
        thumbs_up = page_counts.get('Thumbs Up', 0)
        thumbs_down = page_counts.get('Thumbs Down', 0)
        user_features['thumbs_up_ratio'] = thumbs_up / total_events
        user_features['thumbs_down_ratio'] = thumbs_down / total_events
        user_features['add_playlist_ratio'] = page_counts.get('Add to Playlist', 0) / total_events
        user_features['add_friend_ratio'] = page_counts.get('Add Friend', 0) / total_events
        
        user_features['engagement_score'] = (thumbs_up - thumbs_down) / (thumbs_up + thumbs_down + 1)
        
        user_features['help_ratio'] = page_counts.get('Help', 0) / total_events
        user_features['error_ratio'] = page_counts.get('Error', 0) / total_events
        user_features['about_ratio'] = page_counts.get('About', 0) / total_events
        user_features['settings_ratio'] = page_counts.get('Settings', 0) / total_events
        
        session_lengths = user_df.groupby('sessionId').size()
        user_features['avg_session_length'] = session_lengths.mean()
        user_features['max_session_length'] = session_lengths.max()
        user_features['std_session_length'] = session_lengths.std() if len(session_lengths) > 1 else 0
        
        if 'artist' in user_df.columns:
            user_features['unique_artists'] = user_df[user_df['artist'].notna()]['artist'].nunique()
        else:
            user_features['unique_artists'] = 0
        
        nextsong_count = page_counts.get('NextSong', 0)
        user_features['nextsong_ratio'] = nextsong_count / total_events
        
        if 'length' in user_df.columns:
            avg_length = user_df[user_df['length'].notna()]['length'].mean()
            user_features['avg_song_length'] = avg_length if not pd.isna(avg_length) else 0
        else:
            user_features['avg_song_length'] = 0
        
        daily_events = user_df.groupby(user_df['time'].dt.date).size()
        user_features['events_per_active_day'] = total_events / active_days if active_days > 0 else 0
        user_features['activity_variance'] = daily_events.var() if len(daily_events) > 1 else 0
        
        user_features['events_per_session'] = total_events / user_features['num_sessions']
        user_features['sessions_per_active_day'] = user_features['num_sessions'] / active_days if active_days > 0 else 0
        
        if is_train and user_churn_df is not None:
            churn_value = user_churn_df[user_churn_df['userId'] == user_id]['is_churner'].values
            user_features['churn'] = churn_value[0] if len(churn_value) > 0 else 0
        
        features.append(user_features)
    
    return pd.DataFrame(features)


Train shape après nettoyage: (17499636, 10)
Test shape après nettoyage: (4393179, 10)


## Model with stacking : Ensemble -> XGBoost + LightGBM + RF

In [44]:
train_features = create_unbiased_features(train_clean, user_churn_df=user_churn, is_train=True)
test_features = create_unbiased_features(test_clean, user_churn_df=None, is_train=False)

train_features_encoded = train_features.copy()
train_features_encoded['gender'] = train_features_encoded['gender'].map({'M': 1, 'F': 0})

test_features_encoded = test_features.copy()
test_features_encoded['gender'] = test_features_encoded['gender'].map({'M': 1, 'F': 0})

X_full_v2 = train_features_encoded.drop(['userId', 'churn'], axis=1)
y_full_v2 = train_features_encoded['churn']
X_test_v2 = test_features_encoded.drop(['userId'], axis=1)

for col in X_full_v2.columns:
    if col not in X_test_v2.columns:
        X_test_v2[col] = 0
X_test_v2 = X_test_v2[X_full_v2.columns]

X_train_v2, X_val_v2, y_train_v2, y_val_v2 = train_test_split(
    X_full_v2, y_full_v2, test_size=0.2, random_state=42, stratify=y_full_v2
)

scale_pos_weight = (y_full_v2 == 0).sum() / (y_full_v2 == 1).sum()

print(f"Train shape: {X_train_v2.shape}")
print(f"Val shape: {X_val_v2.shape}")
print(f"Test shape: {X_test_v2.shape}")
print(f"Scale pos weight: {scale_pos_weight:.2f}")


Train shape: (15312, 25)
Val shape: (3828, 25)
Test shape: (2904, 25)
Scale pos weight: 3.48


In [45]:
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV

best_model = XGBClassifier(
    subsample=0.9,
    reg_lambda=1.5,
    reg_alpha=0,
    n_estimators=500,
    min_child_weight=3,
    max_depth=3,
    learning_rate=0.05,
    gamma=0.3,
    colsample_bytree=0.7,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    eval_metric='auc'
)

best_model.fit(X_train_v2, y_train_v2, verbose=False)

y_pred_xgb = best_model.predict_proba(X_val_v2)[:, 1]
auc_xgb = roc_auc_score(y_val_v2, y_pred_xgb)

print(f"XGBoost ROC-AUC: {auc_xgb:.4f}")


XGBoost ROC-AUC: 0.7791


In [46]:
param_distributions_lgb = {
    'num_leaves': [15, 31, 63, 127],
    'max_depth': [3, 5, 7, 10, -1],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'n_estimators': [100, 200, 300, 500],
    'min_child_samples': [5, 10, 20, 30],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.5, 1, 2]
}

base_lgb = lgb.LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

random_search_lgb = RandomizedSearchCV(
    estimator=base_lgb,
    param_distributions=param_distributions_lgb,
    n_iter=50,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)


random_search_lgb.fit(X_full_v2, y_full_v2)

print(f"\n Meilleur score CV: {random_search_lgb.best_score_:.4f}")
for param, value in random_search_lgb.best_params_.items():
    print(f"   {param:20s}: {value}")


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_samples=30, n_estimators=200, num_leaves=63, reg_alpha=1, reg_lambda=0, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_samples=30, n_estimators=200, num_leaves=63, reg_alpha=1, reg_lambda=0, subsample=1.0; total time=   2.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_samples=30, n_estimators=200, num_leaves=63, reg_alpha=1, reg_lambda=0, subsample=1.0; total time=   2.1s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_samples=30, n_estimators=200, num_leaves=63, reg_alpha=1, reg_lambda=0, subsample=1.0; total time=   2.2s
[CV] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, min_child_samples=30, n_estimators=200, num_leaves=63, reg_alpha=1, reg_lambda=0, subsample=1.0; total time=   2.4s
[CV] END colsample_bytree=0.8, learning_rat

In [47]:
print(f"\n Meilleur score CV LightGBM: {random_search_lgb.best_score_:.4f}")
print(f"   XGBoost optimisé:            0.7951")
print(f"   Différence:                  {(random_search_lgb.best_score_ - 0.7951):.4f}")


for param, value in random_search_lgb.best_params_.items():
    print(f"   {param:20s}: {value}")

best_lgb = random_search_lgb.best_estimator_



 Meilleur score CV LightGBM: 0.7942
   XGBoost optimisé:            0.7951
   Différence:                  -0.0009
   subsample           : 1.0
   reg_lambda          : 2
   reg_alpha           : 1
   num_leaves          : 15
   n_estimators        : 500
   min_child_samples   : 20
   max_depth           : 7
   learning_rate       : 0.03
   colsample_bytree    : 0.7


In [48]:
from sklearn.ensemble import RandomForestClassifier


rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    verbose=0
)

rf_model.fit(X_train_v2, y_train_v2)

y_pred_rf = rf_model.predict(X_val_v2)
y_pred_proba_rf = rf_model.predict_proba(X_val_v2)[:, 1]

auc_rf = roc_auc_score(y_val_v2, y_pred_proba_rf)

print(f"\n ROC-AUC Random Forest: {auc_rf:.4f}")
print(classification_report(y_val_v2, y_pred_rf, target_names=['Non-Churn', 'Churn']))



 ROC-AUC Random Forest: 0.7615
              precision    recall  f1-score   support

   Non-Churn       0.86      0.82      0.84      2974
       Churn       0.46      0.53      0.49       854

    accuracy                           0.76      3828
   macro avg       0.66      0.68      0.67      3828
weighted avg       0.77      0.76      0.76      3828



In [49]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression


base_learners = [
    ('xgb', best_model),          
    ('lgb', best_lgb), 
    ('rf', rf_model) 
]

meta_learner = LogisticRegression(
    max_iter=1000,
    random_state=42,
    class_weight='balanced'
)

stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba',
    n_jobs=-1,
    verbose=0
)

stacking_model.fit(X_train_v2, y_train_v2)

y_pred_stack = stacking_model.predict(X_val_v2)
y_pred_proba_stack = stacking_model.predict_proba(X_val_v2)[:, 1]

auc_stack = roc_auc_score(y_val_v2, y_pred_proba_stack)

print(f"   STACKING:         {auc_stack:.4f}")
print(f"\n   Amélioration:     {(auc_stack - 0.7951):.4f}")

print(classification_report(y_val_v2, y_pred_stack, target_names=['Non-Churn', 'Churn']))


   STACKING:         0.7794

   Amélioration:     -0.0157
              precision    recall  f1-score   support

   Non-Churn       0.88      0.74      0.80      2974
       Churn       0.42      0.66      0.51       854

    accuracy                           0.72      3828
   macro avg       0.65      0.70      0.66      3828
weighted avg       0.78      0.72      0.74      3828



In [50]:

y_pred_proba_xgb_val = best_model.predict_proba(X_val_v2)[:, 1]
y_pred_proba_lgb_val = best_lgb.predict_proba(X_val_v2)[:, 1]
y_pred_proba_rf_val = rf_model.predict_proba(X_val_v2)[:, 1]

best_auc_blend = 0
best_weights = None

print(f"\n{'XGB':>5} {'LGB':>5} {'RF':>5} {'AUC':>8}")
print("-" * 30)

for w_xgb in np.arange(0.3, 0.8, 0.05):
    for w_lgb in np.arange(0.1, 0.6, 0.05):
        w_rf = 1.0 - w_xgb - w_lgb
        if w_rf < 0 or w_rf > 0.5:
            continue

        y_blend = (w_xgb * y_pred_proba_xgb_val +
                   w_lgb * y_pred_proba_lgb_val +
                   w_rf * y_pred_proba_rf_val)

        auc_blend = roc_auc_score(y_val_v2, y_blend)

        if auc_blend > best_auc_blend:
            best_auc_blend = auc_blend
            best_weights = (w_xgb, w_lgb, w_rf)

print(f"   XGBoost:   {best_weights[0]:.2f}")
print(f"   LightGBM:  {best_weights[1]:.2f}")
print(f"   RF:        {best_weights[2]:.2f}")
print(f"\n   ROC-AUC:   {best_auc_blend:.4f}")
print(f"   vs XGB seul: {(best_auc_blend - 0.7951):.4f}")



  XGB   LGB    RF      AUC
------------------------------
   XGBoost:   0.30
   LightGBM:  0.55
   RF:        0.15

   ROC-AUC:   0.8348
   vs XGB seul: 0.0397


In [52]:
final_xgb = XGBClassifier(
    subsample=0.9, reg_lambda=1.5, reg_alpha=0, n_estimators=500,
    min_child_weight=3, max_depth=3, learning_rate=0.05, gamma=0.3,
    colsample_bytree=0.7, scale_pos_weight=scale_pos_weight,
    random_state=42, eval_metric='auc'
)
final_xgb.fit(X_full_v2, y_full_v2, verbose=False)

final_lgb = lgb.LGBMClassifier(
    **random_search_lgb.best_params_,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)
final_lgb.fit(X_full_v2, y_full_v2)

final_rf = RandomForestClassifier(
    n_estimators=300, max_depth=10, min_samples_split=20,
    min_samples_leaf=10, max_features='sqrt', class_weight='balanced',
    random_state=42, n_jobs=-1, verbose=0
)
final_rf.fit(X_full_v2, y_full_v2)


pred_xgb_test = final_xgb.predict_proba(X_test_v2)[:, 1]
pred_lgb_test = final_lgb.predict_proba(X_test_v2)[:, 1]
pred_rf_test = final_rf.predict_proba(X_test_v2)[:, 1]

w_xgb, w_lgb, w_rf = best_weights
pred_blend = (w_xgb * pred_xgb_test +
              w_lgb * pred_lgb_test +
              w_rf * pred_rf_test)

pred_blend_binary = (pred_blend > 0.5).astype(int)

submission_blend = pd.DataFrame({
    'id': test_features_encoded['userId'],
    'prediction': pred_blend_binary
})

submission_blend.to_csv('submission_3.csv', index=False)


print(submission_blend['prediction'].value_counts())
print(f"\n   Taux de churn prédit: {submission_blend['prediction'].mean() * 100:.2f}%")
print(f"   (Train: {y_full_v2.mean() * 100:.2f}%)")
print(f"   Poids: XGB={w_xgb:.2f}, LGB={w_lgb:.2f}, RF={w_rf:.2f}")
print(f"   ROC-AUC CV: {best_auc_blend:.4f}")


prediction
0    2189
1     715
Name: count, dtype: int64

   Taux de churn prédit: 24.62%
   (Train: 22.31%)
   Poids: XGB=0.30, LGB=0.55, RF=0.15
   ROC-AUC CV: 0.8348


## Score Kaggle : 0.58