This notebook builds a feature-engineered machine-learning pipeline for the Jigsaw rules-violation task. The pipeline works by first useing a selection of SentenceTransformer encoder to create embeddings for every text field, then we build text / similarity / and categorical features. WE then train several classification models and evaluates them with cross-validation.

# Imports

In [36]:
import numpy as np
import os
import pickle
import pandas as pd
import torch
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from sklearn.preprocessing import LabelEncoder

In [33]:
from func import embed_batch, load_or_create_embeddings, extract_text_features, combine_features

In [34]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
SEED = 42
np.random.seed(SEED)

### Load the data

In [11]:
rule_col = "rule"
body_col = "body"  
label_col = "rule_violation"

In [12]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")  
solution_df = pd.read_csv("data/solution.csv")

### Device / Encoder Setup

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [40]:
# encoder_name = 'e5-large-v2-triplet' # custum fine tuned triplet model
encoder_name = 'all-mpnet-base-v2' # 768-dim, better quality
# encoder_name = 'all-MiniLM-L12-v2' # 384-dim, middle ground
# encoder_name = 'paraphrase-multilingual-mpnet-base-v2' # Specialized for semantic similarity
# encoder_name = 'sentence-transformers/all-roberta-large-v1' # Very large (if you have memory)

encoder = SentenceTransformer(encoder_name) 

In [41]:
le = LabelEncoder()
all_subreddits = pd.concat([train_df['subreddit'], test_df['subreddit']])
le.fit(all_subreddits)

train_subreddit_encoded = le.transform(train_df['subreddit'])
test_subreddit_encoded = le.transform(test_df['subreddit'])

print(f"Number of unique subreddits: {len(le.classes_)}")

Number of unique subreddits: 100


### Creating Embeddings

In [43]:
print("Embedding unique rules...")
all_unique_rules = pd.concat([train_df[rule_col], test_df[rule_col]]).unique()

unique_rule_emb = embed_batch(
    [f"passage: {rule}" for rule in all_unique_rules.tolist()],
    encoder
)

rule_to_emb = dict(zip(all_unique_rules, unique_rule_emb))
rule_emb = np.array([rule_to_emb[rule] for rule in train_df[rule_col]])

print("\nEmbedding train set")
train_emb = load_or_create_embeddings(
    train_df,
    prefix='train',
    encoder_name=encoder_name,
    encoder=encoder
)
body_emb = train_emb['body_emb']
pos1_emb = train_emb['pos1_emb']
pos2_emb = train_emb['pos2_emb']
neg1_emb = train_emb['neg1_emb']
neg2_emb = train_emb['neg2_emb']

rule_emb_test = np.array([rule_to_emb[rule] for rule in test_df[rule_col]])

print("\nEmbedding test set")
test_emb = load_or_create_embeddings(
    test_df,
    prefix='test',
    encoder_name=encoder_name,
    encoder=encoder
)
body_emb_test = test_emb['body_emb']
pos1_emb_test = test_emb['pos1_emb']
pos2_emb_test = test_emb['pos2_emb']
neg1_emb_test = test_emb['neg1_emb']
neg2_emb_test = test_emb['neg2_emb']


Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding train set
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding test set
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...


### Building Features

In [45]:
print("Combining train features...")
train_features = []
for i in tqdm(range(len(train_df))):
    feat = combine_features(
        body_emb[i],
        rule_emb[i],
        pos1_emb[i],
        pos2_emb[i],
        neg1_emb[i],
        neg2_emb[i],
        train_df.iloc[i][body_col],
        train_subreddit_encoded[i]
    )
    train_features.append(feat)

train_embeddings = np.array(train_features)
print(f"Train feature shape: {train_embeddings.shape}")

print("Combining test features...")
test_features = []
for i in tqdm(range(len(test_df))):
    feat = combine_features(
        body_emb_test[i],
        rule_emb_test[i],
        pos1_emb_test[i],
        pos2_emb_test[i],
        neg1_emb_test[i],
        neg2_emb_test[i],
        test_df.iloc[i][body_col],
        test_subreddit_encoded[i]
    )
    test_features.append(feat)

test_embeddings = np.array(test_features)
print(f"Test feature shape: {test_embeddings.shape}")

Combining train features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Train feature shape: (2029, 17)
Combining test features...


  0%|          | 0/54059 [00:00<?, ?it/s]

Test feature shape: (54059, 17)


### Loading Models

In [46]:
models = {
    'Logistic': LogisticRegression(
    max_iter=5000,
    solver="lbfgs",
    n_jobs=-1,
),

    'RandomForest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=SEED, n_jobs=-1),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=SEED, n_jobs=-1),
    'LightGBM': lgb.LGBMClassifier(n_estimators=100, max_depth=4, learning_rate=0.1, random_state=SEED, n_jobs=-1)
}

print(f"Models to train: {list(models.keys())}")

Models to train: ['Logistic', 'RandomForest', 'XGBoost', 'LightGBM']


In [47]:
y_test = solution_df.set_index("row_id").loc[test_df["row_id"]]["rule_violation"].values

### Main Training Loop

In [48]:
results = {}
n_folds = 3

X = train_embeddings
T = test_embeddings
y = train_df["rule_violation"].values

for name, model in models.items():
    print(f"\n{'='*60}")
    print(f"Training {name}")
    print('='*60)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(T))
    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X[train_idx], X[val_idx]
        y_tr, y_val = y[train_idx], y[val_idx]

        # Train
        model.fit(X_tr, y_tr)

        # Predict
        val_pred = model.predict_proba(X_val)[:, 1]
        oof_preds[val_idx] = val_pred

        test_pred = model.predict_proba(T)[:, 1]
        test_preds += test_pred / n_folds

        # Fold metric
        fold_auc = roc_auc_score(y_val, val_pred)
        fold_scores.append(fold_auc)
        print(f"  Fold {fold+1}: Val AUC = {fold_auc:.4f}")

    # Full CV AUC
    cv_auc = roc_auc_score(y, oof_preds)

    # Test AUC (using solution_df labels)
    test_auc = roc_auc_score(y_test, test_preds)

    results[name] = {
        "oof": oof_preds,
        "test": test_preds,
        "cv_auc": cv_auc,
        "test_auc": test_auc,
        "fold_scores": fold_scores
    }

    print(f"  CV AUC:   {cv_auc:.4f}")
    print(f"  Test AUC: {test_auc:.4f}")

print("\n✅ All models trained!")



Training Logistic
  Fold 1: Val AUC = 0.7257
  Fold 2: Val AUC = 0.7022
  Fold 3: Val AUC = 0.7121
  CV AUC:   0.7134
  Test AUC: 0.7218

Training RandomForest
  Fold 1: Val AUC = 0.7435
  Fold 2: Val AUC = 0.7307
  Fold 3: Val AUC = 0.7557
  CV AUC:   0.7426
  Test AUC: 0.7207

Training XGBoost
  Fold 1: Val AUC = 0.7581
  Fold 2: Val AUC = 0.7265
  Fold 3: Val AUC = 0.7455
  CV AUC:   0.7433
  Test AUC: 0.7046

Training LightGBM
[LightGBM] [Info] Number of positive: 687, number of negative: 665
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000210 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2796
[LightGBM] [Info] Number of data points in the train set: 1352, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.508136 -> initscore=0.032547
[LightGBM] [Info] Start training from score 0.032547
  Fold 1: Val AUC = 0.7567
[LightGBM] [Info] Number of positive: 688, number 



### Results

In [49]:
print("\n" + "="*70)
print("Model Comparison")
print("="*70)

for name, res in results.items():
    mean_val = np.mean(res["fold_scores"])
    std_val  = np.std(res["fold_scores"])
    cv_auc   = res["cv_auc"]
    test_auc = res["test_auc"]

    print(
        f"{name:15s} "
        f"CV: {cv_auc:.4f} | "
        f"Val Mean: {mean_val:.4f} ± {std_val:.4f} | "
        f"Test: {test_auc:.4f}"
    )

print("="*70)


Model Comparison
Logistic        CV: 0.7134 | Val Mean: 0.7133 ± 0.0096 | Test: 0.7218
RandomForest    CV: 0.7426 | Val Mean: 0.7433 ± 0.0102 | Test: 0.7207
XGBoost         CV: 0.7433 | Val Mean: 0.7434 ± 0.0130 | Test: 0.7046
LightGBM        CV: 0.7463 | Val Mean: 0.7467 ± 0.0073 | Test: 0.7028


In [51]:
print("\n" + "="*70)
print("Creating Ensemble")
print("="*70)

oof_ensemble = np.mean([res["oof"] for res in results.values()], axis=0)
test_ensemble = np.mean([res["test"] for res in results.values()], axis=0)

ensemble_cv_auc = roc_auc_score(y, oof_ensemble)
ensemble_test_auc = roc_auc_score(y_test, test_ensemble)

print(f"\nEnsemble CV AUC:   {ensemble_cv_auc:.4f}")
print(f"Ensemble Test AUC: {ensemble_test_auc:.4f}")

print("\nModel Comparison:")
for name, res in results.items():
    print(
        f"  {name:15s} "
        f"CV: {res['cv_auc']:.4f} | "
        f"Test: {res['test_auc']:.4f}"
    )

print(f"  {'Ensemble':15s} "
      f"CV: {ensemble_cv_auc:.4f} | "
      f"Test: {ensemble_test_auc:.4f}")

print("="*70)


Creating Ensemble

Ensemble CV AUC:   0.7513
Ensemble Test AUC: 0.7227

Model Comparison:
  Logistic        CV: 0.7134 | Test: 0.7218
  RandomForest    CV: 0.7426 | Test: 0.7207
  XGBoost         CV: 0.7433 | Test: 0.7046
  LightGBM        CV: 0.7463 | Test: 0.7028
  Ensemble        CV: 0.7513 | Test: 0.7227


### Saving Sumbissions

In [54]:
os.makedirs("submissions", exist_ok=True)

print("\nSaving submissions...\n")

for name, res in results.items():
    df_sub = test_df[["row_id"]].copy()
    df_sub["prediction"] = res["test"]

    out_path = f"submissions/submission_{name}.csv"
    df_sub.to_csv(out_path, index=False)

    print(f"Saved: {out_path}")

df_ens = test_df[["row_id"]].copy()
df_ens["prediction"] = test_ensemble

ens_path = "submissions/submission_ensemble.csv"
df_ens.to_csv(ens_path, index=False)

print(f"Saved: {ens_path}")

print("\nAll submissions saved.")



Saving submissions...

Saved: submissions/submission_Logistic.csv
Saved: submissions/submission_RandomForest.csv
Saved: submissions/submission_XGBoost.csv
Saved: submissions/submission_LightGBM.csv
Saved: submissions/submission_ensemble.csv

All submissions saved.
