# üéØ Simple Ensemble Methods 


In [2]:
# Imports
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

SEED = 42
np.random.seed(SEED)

print("‚úÖ Imports successful")

‚úÖ Imports successful


In [3]:
# Load data
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

print(f"Train: {len(train_df):,}")
print(f"Test: {len(test_df):,}")
print(f"\nClass distribution:")
print(train_df['rule_violation'].value_counts())

Train: 2,029
Test: 54,059

Class distribution:
rule_violation
1    1031
0     998
Name: count, dtype: int64


In [4]:
# Create embeddings
print("Loading encoder...")
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

print("Encoding train...")
X_train = encoder.encode(train_df['body'].tolist(), show_progress_bar=True)
y_train = train_df['rule_violation'].values

print("Encoding test...")
X_test = encoder.encode(test_df['body'].tolist(), show_progress_bar=True)

print(f"\nFeature shape: {X_train.shape}")

Loading encoder...


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  _torch_pytree._register_pytree_node(


Encoding train...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 64/64 [00:29<00:00,  2.20it/s]


Encoding test...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1690/1690 [14:59<00:00,  1.88it/s]



Feature shape: (2029, 384)


In [5]:
# Define models
models = {
    'Logistic': LogisticRegression(max_iter=1000, C=1.0, random_state=SEED),
    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=SEED, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=SEED, n_jobs=-1),
    'LightGBM': lgb.LGBMClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=SEED, n_jobs=-1)
}

print(f"Models to train: {list(models.keys())}")

Models to train: ['Logistic', 'RandomForest', 'XGBoost', 'LightGBM']


In [6]:
# Train with CV
results = {}
n_folds = 3

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}")
    print('='*60)
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X_train))
    test_preds = np.zeros(len(X_test))
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        X_tr, X_val = X_train[train_idx], X_train[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        # Train
        model.fit(X_tr, y_tr)
        
        # Predict
        val_pred = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_pred
        
        test_pred = model.predict_proba(X_test)[:, 1]
        test_preds += test_pred / n_folds
        
        # Score
        fold_auc = roc_auc_score(y_val, val_pred)
        fold_scores.append(fold_auc)
        print(f"  Fold {fold+1}: AUC = {fold_auc:.4f}")
    
    # Overall score
    cv_auc = roc_auc_score(y_train, oof_preds)
    
    results[name] = {
        'oof': oof_preds,
        'test': test_preds,
        'cv_auc': cv_auc,
        'fold_scores': fold_scores
    }
    
    print(f"  CV AUC: {cv_auc:.4f}")

print("\n‚úÖ All models trained!")


Training Logistic
  Fold 1: AUC = 0.8214
  Fold 2: AUC = 0.8067
  Fold 3: AUC = 0.8043
  CV AUC: 0.8107

Training RandomForest
  Fold 1: AUC = 0.8011
  Fold 2: AUC = 0.7959
  Fold 3: AUC = 0.8006
  CV AUC: 0.7997

Training XGBoost
  Fold 1: AUC = 0.8105
  Fold 2: AUC = 0.8226
  Fold 3: AUC = 0.8177
  CV AUC: 0.8169

Training LightGBM
[LightGBM] [Info] Number of positive: 687, number of negative: 665
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015296 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97919
[LightGBM] [Info] Number of data points in the train set: 1352, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508136 -> initscore=0.032547
[LightGBM] [Info] Start training from score 0.032547
  Fold 1: AUC = 0.8138
[LightGBM] [Info] Number of positive: 688, number of negative: 665
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing wa

In [7]:
# Compare models
print("\n" + "="*60)
print("Model Comparison")
print("="*60)

for name, res in results.items():
    mean_auc = np.mean(res['fold_scores'])
    std_auc = np.std(res['fold_scores'])
    print(f"{name:15s} CV AUC: {res['cv_auc']:.4f} | Mean: {mean_auc:.4f} ¬± {std_auc:.4f}")

print("="*60)


Model Comparison
Logistic        CV AUC: 0.8107 | Mean: 0.8108 ¬± 0.0075
RandomForest    CV AUC: 0.7997 | Mean: 0.7992 ¬± 0.0024
XGBoost         CV AUC: 0.8169 | Mean: 0.8169 ¬± 0.0050
LightGBM        CV AUC: 0.8111 | Mean: 0.8109 ¬± 0.0067


In [8]:
# Simple averaging ensemble
print("\nCreating ensemble...")

oof_ensemble = np.mean([res['oof'] for res in results.values()], axis=0)
test_ensemble = np.mean([res['test'] for res in results.values()], axis=0)

ensemble_auc = roc_auc_score(y_train, oof_ensemble)

print(f"\nüèÜ Ensemble CV AUC: {ensemble_auc:.4f}")

# Compare
print("\nComparison:")
for name, res in results.items():
    print(f"  {name:15s}: {res['cv_auc']:.4f}")
print(f"  {'Ensemble':15s}: {ensemble_auc:.4f}")


Creating ensemble...

üèÜ Ensemble CV AUC: 0.8250

Comparison:
  Logistic       : 0.8107
  RandomForest   : 0.7997
  XGBoost        : 0.8169
  LightGBM       : 0.8111
  Ensemble       : 0.8250


In [9]:
# Create submission
Path('outputs').mkdir(exist_ok=True)

submission = pd.DataFrame({
    'row_id': test_df['row_id'],
    'rule_violation': test_ensemble
})

submission.to_csv('outputs/submission_nb3.csv', index=False)

print("\n‚úÖ Submission saved!")
print(f"\nüìä Statistics:")
print(f"   Min:  {test_ensemble.min():.4f}")
print(f"   Max:  {test_ensemble.max():.4f}")
print(f"   Mean: {test_ensemble.mean():.4f}")
print(f"\nüìÅ File: outputs/submission_nb3.csv")
print(submission.head())


‚úÖ Submission saved!

üìä Statistics:
   Min:  0.0178
   Max:  0.9601
   Mean: 0.4519

üìÅ File: outputs/submission_nb3.csv
   row_id  rule_violation
0    2029        0.470274
1    2030        0.433193
2    2031        0.247011
3    2032        0.621863
4    2033        0.740371
