This notebook builds a multi layer perceptron network using our engineered text-rule similarity features.

### Imports

In [14]:
import os
import pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

In [15]:
from func import embed_batch, load_or_create_embeddings, extract_text_features, combine_features

In [16]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
rule_col = "rule"
body_col = "body"  
label_col = "rule_violation"

In [7]:
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f24d8f46c10>

### Device / Encoder Setup

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

Device: cuda


In [9]:
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
solution_df = pd.read_csv('data/solution.csv')

print(f"Train: {len(train_df):,}")
print(f"Test: {len(test_df):,}")

Train: 2,029
Test: 54,059


In [10]:
# encoder_name = './e5-large-v2-triplet' # custum fine tuned triplet model
encoder_name = 'all-mpnet-base-v2' # 768-dim, better quality
# encoder_name = 'all-MiniLM-L12-v2' # 384-dim, middle ground
# encoder_name = 'paraphrase-multilingual-mpnet-base-v2' # Specialized for semantic similarity
# encoder_name = 'sentence-transformers/all-roberta-large-v1' # Very large (if you have memory)

encoder = SentenceTransformer(encoder_name) 

In [17]:
le = LabelEncoder()
all_subreddits = pd.concat([train_df['subreddit'], test_df['subreddit']])
le.fit(all_subreddits)

train_subreddit_encoded = le.transform(train_df['subreddit'])
test_subreddit_encoded = le.transform(test_df['subreddit'])

print(f"Number of unique subreddits: {len(le.classes_)}")

Number of unique subreddits: 100


In [18]:
print("Embedding unique rules...")
all_unique_rules = pd.concat([train_df[rule_col], test_df[rule_col]]).unique()

unique_rule_emb = embed_batch(
    [f"passage: {rule}" for rule in all_unique_rules.tolist()],
    encoder
)

rule_to_emb = dict(zip(all_unique_rules, unique_rule_emb))

rule_emb = np.array([rule_to_emb[rule] for rule in train_df[rule_col]])

print("\nEmbedding train set")
train_emb = load_or_create_embeddings(
    train_df,
    prefix='train',
    encoder_name=encoder_name,
    encoder=encoder
)

body_emb     = train_emb['body_emb']
pos1_emb     = train_emb['pos1_emb']
pos2_emb     = train_emb['pos2_emb']
neg1_emb     = train_emb['neg1_emb']
neg2_emb     = train_emb['neg2_emb']

rule_emb_test = np.array([rule_to_emb[rule] for rule in test_df[rule_col]])

print("\nEmbedding test set")
test_emb = load_or_create_embeddings(
    test_df,
    prefix='test',
    encoder_name=encoder_name,
    encoder=encoder
)

body_emb_test = test_emb['body_emb']
pos1_emb_test = test_emb['pos1_emb']
pos2_emb_test = test_emb['pos2_emb']
neg1_emb_test = test_emb['neg1_emb']
neg2_emb_test = test_emb['neg2_emb']


Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding train set
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding test set
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...


In [20]:
print("Combining train features...")
train_features = []
for i in tqdm(range(len(train_df))):
    feat = combine_features(
        body_emb[i],
        rule_emb[i],
        pos1_emb[i],
        pos2_emb[i],
        neg1_emb[i],
        neg2_emb[i],
        train_df.iloc[i][body_col],
        train_subreddit_encoded[i]
    )
    train_features.append(feat)

train_embeddings = np.array(train_features)
print(f"Train feature shape: {train_embeddings.shape}")

print("Combining test features...")
test_features = []
for i in tqdm(range(len(test_df))):
    feat = combine_features(
        body_emb_test[i],
        rule_emb_test[i],
        pos1_emb_test[i],
        pos2_emb_test[i],
        neg1_emb_test[i],
        neg2_emb_test[i],
        test_df.iloc[i][body_col],
        test_subreddit_encoded[i]
    )
    test_features.append(feat)

test_embeddings = np.array(test_features)
print(f"Test feature shape: {test_embeddings.shape}")

Combining train features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Train feature shape: (2029, 17)
Combining test features...


  0%|          | 0/54059 [00:00<?, ?it/s]

Test feature shape: (54059, 17)


### MLP Architecture

In [22]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim=384):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LayerNorm(256),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.2),

            nn.Linear(128, 1)
        )
    
    def forward(self, x):
        return self.network(x).squeeze()

In [24]:
input_dim = train_embeddings.shape[1]
model = MLPClassifier(input_dim=input_dim).to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {total_params:,}")

Trainable parameters: 38,401


### Validation Split

In [26]:
X_train, X_val, y_train, y_val = train_test_split(
    train_embeddings,
    train_df['rule_violation'].values,
    test_size=0.2,
    random_state=SEED,
    stratify=train_df['rule_violation']
)

print(f"Train: {len(X_train):,}")
print(f"Val: {len(X_val):,}")

Train: 1,623
Val: 406


### Main Trianing Loop

In [31]:
optimizer = Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

epochs = 6#10
batch_size = 64

model.train()
for epoch in range(epochs):
    total_loss = 0
    for i in range(0, len(X_train), batch_size):
        batch_X = torch.FloatTensor(X_train[i:i+batch_size]).to(device)
        batch_y = torch.FloatTensor(y_train[i:i+batch_size]).to(device)
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    model.eval()
    with torch.no_grad():
        val_preds = []
        for i in range(0, len(X_val), batch_size):
            batch_X = torch.FloatTensor(X_val[i:i+batch_size]).to(device)
            outputs = model(batch_X)
            val_preds.extend(torch.sigmoid(outputs).cpu().numpy())
    
    val_auc = roc_auc_score(y_val, val_preds)
    model.train()
    
    print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss/len(X_train)*batch_size:.4f} - Val AUC: {val_auc:.4f}")

Epoch 1/6 - Loss: 0.6743 - Val AUC: 0.6678
Epoch 2/6 - Loss: 0.6677 - Val AUC: 0.6649
Epoch 3/6 - Loss: 0.6686 - Val AUC: 0.6590
Epoch 4/6 - Loss: 0.6604 - Val AUC: 0.6602
Epoch 5/6 - Loss: 0.6601 - Val AUC: 0.6534
Epoch 6/6 - Loss: 0.6608 - Val AUC: 0.6508


### Evaluation

In [28]:
print("Predicting on test set...")

model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(test_embeddings, dtype=torch.float32, device=device)
    logits = model(X_test_tensor)
    test_probs = torch.sigmoid(logits).cpu().numpy().reshape(-1)

test_results = test_df[["row_id"]].copy()
test_results["prediction"] = test_probs

test_results = test_results.merge(
    solution_df[["row_id", "rule_violation"]],
    on="row_id",
    how="left"
)

y_test = test_results["rule_violation"].values

test_auc = roc_auc_score(y_test, test_probs)
print(f"Test AUC: {test_auc:.5f}")

Predicting on test set...
Test AUC: 0.60164


### Saving Submission

In [29]:
os.makedirs("submissions", exist_ok=True)

submission_path = "submissions/submission_mlp.csv"
test_results[["row_id", "prediction"]].to_csv(submission_path, index=False)

print(f"MLP submission saved to: {submission_path}")

MLP submission saved to: submissions/submission_mlp.csv


### Training Loop per Encoder

In [32]:
encoder_list = [
   './e5-large-v2-triplet',
    'all-MiniLM-L12-v2',
    'all-mpnet-base-v2',
    'paraphrase-multilingual-mpnet-base-v2',
    'sentence-transformers/all-roberta-large-v1',
]

results_mlp = {}
os.makedirs("submissions", exist_ok=True)

In [33]:
for encoder_name in encoder_list:

    print("\n" + "="*90)
    print(f"Running encoder: {encoder_name}")
    print("="*90)

    encoder = SentenceTransformer(encoder_name)

    print("Embedding unique rules...")
    unique_rules = pd.concat([train_df[rule_col], test_df[rule_col]]).unique()
    unique_rule_emb = embed_batch([f"passage: {r}" for r in unique_rules], encoder)
    rule_to_emb = dict(zip(unique_rules, unique_rule_emb))

    rule_emb_train = np.array([rule_to_emb[r] for r in train_df[rule_col]])
    rule_emb_test  = np.array([rule_to_emb[r] for r in test_df[rule_col]])

    print("\nEmbedding TRAIN...")
    train_emb = load_or_create_embeddings(train_df, "train", encoder_name, encoder)

    print("\nEmbedding TEST...")
    test_emb  = load_or_create_embeddings(test_df,  "test",  encoder_name, encoder)

    body_emb = train_emb["body_emb"]
    pos1_emb = train_emb["pos1_emb"]
    pos2_emb = train_emb["pos2_emb"]
    neg1_emb = train_emb["neg1_emb"]
    neg2_emb = train_emb["neg2_emb"]

    body_emb_test = test_emb["body_emb"]
    pos1_emb_test = test_emb["pos1_emb"]
    pos2_emb_test = test_emb["pos2_emb"]
    neg1_emb_test = test_emb["neg1_emb"]
    neg2_emb_test = test_emb["neg2_emb"]

    print("\nCombining TRAIN features...")
    train_features = []
    for i in tqdm(range(len(train_df))):
        train_features.append(
            combine_features(
                body_emb[i],
                rule_emb_train[i],
                pos1_emb[i],
                pos2_emb[i],
                neg1_emb[i],
                neg2_emb[i],
                train_df.iloc[i][body_col],
                train_subreddit_encoded[i]
            )
        )
    train_embeddings = np.array(train_features)

    print("Combining TEST features...")
    test_features = []
    for i in tqdm(range(len(test_df))):
        test_features.append(
            combine_features(
                body_emb_test[i],
                rule_emb_test[i],
                pos1_emb_test[i],
                pos2_emb_test[i],
                neg1_emb_test[i],
                neg2_emb_test[i],
                test_df.iloc[i][body_col],
                test_subreddit_encoded[i]
            )
        )
    test_embeddings = np.array(test_features)

    X_train, X_val, y_train_split, y_val_split = train_test_split(
        train_embeddings,
        train_df[label_col].values,
        test_size=0.2,
        random_state=SEED,
        stratify=train_df[label_col]
    )

    print("\nTraining MLP...")
    input_dim = X_train.shape[1]
    model = MLPClassifier(input_dim=input_dim).to(device)

    optimizer = Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()

    batch_size = 64
    epochs = 6

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for i in range(0, len(X_train), batch_size):
            bx = torch.FloatTensor(X_train[i:i+batch_size]).to(device)
            by = torch.FloatTensor(y_train_split[i:i+batch_size]).to(device)

            optimizer.zero_grad()
            out = model(bx)
            loss = criterion(out, by)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        model.eval()
        val_preds = []
        with torch.no_grad():
            for i in range(0, len(X_val), batch_size):
                bx = torch.FloatTensor(X_val[i:i+batch_size]).to(device)
                out = model(bx)
                val_preds.extend(torch.sigmoid(out).cpu().numpy())

        val_auc = roc_auc_score(y_val_split, val_preds)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {total_loss:.4f} | Val AUC: {val_auc:.4f}")

    print(f"Validation AUC: {val_auc:.5f}")

    print("\nPredicting on TEST...")
    model.eval()
    with torch.no_grad():
        logits = model(torch.FloatTensor(test_embeddings).to(device))
        test_probs = torch.sigmoid(logits).cpu().numpy().reshape(-1)

    test_results = pd.DataFrame({
        "row_id": test_df["row_id"],
        "prediction": test_probs
    })

    test_results = test_results.merge(
        solution_df[["row_id", "rule_violation"]],
        on="row_id",
        how="left"
    )

    test_auc = roc_auc_score(test_results["rule_violation"], test_results["prediction"])
    print(f"TEST AUC: {test_auc:.5f}")

    safe_name = encoder_name.replace("/", "_")
    out_path = f"submissions/submission_mlp_{safe_name}.csv"
    os.makedirs("submissions", exist_ok=True)

    test_results[["row_id", "prediction"]].to_csv(out_path, index=False)
    print(f"Saved → {out_path}")

    results_mlp[encoder_name] = {
        "val_auc": val_auc,
        "test_auc": test_auc
    }


Running encoder: ./e5-large-v2-triplet
Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding TRAIN...
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding TEST...
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...

Combining TRAIN features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Combining TEST features...


  0%|          | 0/54059 [00:00<?, ?it/s]


Training MLP...
Epoch 1/6 | Loss: 18.0473 | Val AUC: 0.7601
Epoch 2/6 | Loss: 16.8343 | Val AUC: 0.8532
Epoch 3/6 | Loss: 15.1072 | Val AUC: 0.9008
Epoch 4/6 | Loss: 13.0482 | Val AUC: 0.9222
Epoch 5/6 | Loss: 11.8672 | Val AUC: 0.9358
Epoch 6/6 | Loss: 11.1940 | Val AUC: 0.9372
Validation AUC: 0.93716

Predicting on TEST...
TEST AUC: 0.77084
Saved → submissions/submission_mlp_._e5-large-v2-triplet.csv

Running encoder: all-MiniLM-L12-v2
Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding TRAIN...
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding TEST...
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...

Combining TRAIN features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Combining TEST features...


  0%|          | 0/54059 [00:00<?, ?it/s]


Training MLP...
Epoch 1/6 | Loss: 18.1818 | Val AUC: 0.6481
Epoch 2/6 | Loss: 17.8405 | Val AUC: 0.6575
Epoch 3/6 | Loss: 17.6166 | Val AUC: 0.6672
Epoch 4/6 | Loss: 17.4945 | Val AUC: 0.6718
Epoch 5/6 | Loss: 17.3749 | Val AUC: 0.6750
Epoch 6/6 | Loss: 17.3584 | Val AUC: 0.6770
Validation AUC: 0.67704

Predicting on TEST...
TEST AUC: 0.60485
Saved → submissions/submission_mlp_all-MiniLM-L12-v2.csv

Running encoder: all-mpnet-base-v2
Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding TRAIN...
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding TEST...
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...

Combining TRAIN features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Combining TEST features...


  0%|          | 0/54059 [00:00<?, ?it/s]


Training MLP...
Epoch 1/6 | Loss: 18.1221 | Val AUC: 0.6263
Epoch 2/6 | Loss: 17.7965 | Val AUC: 0.6338
Epoch 3/6 | Loss: 17.7827 | Val AUC: 0.6448
Epoch 4/6 | Loss: 17.5377 | Val AUC: 0.6519
Epoch 5/6 | Loss: 17.4445 | Val AUC: 0.6585
Epoch 6/6 | Loss: 17.3091 | Val AUC: 0.6597
Validation AUC: 0.65973

Predicting on TEST...
TEST AUC: 0.59380
Saved → submissions/submission_mlp_all-mpnet-base-v2.csv

Running encoder: paraphrase-multilingual-mpnet-base-v2
Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding TRAIN...
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding TEST...
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...

Combining TRAIN features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Combining TEST features...


  0%|          | 0/54059 [00:00<?, ?it/s]


Training MLP...
Epoch 1/6 | Loss: 18.4477 | Val AUC: 0.6492
Epoch 2/6 | Loss: 17.8084 | Val AUC: 0.6478
Epoch 3/6 | Loss: 17.7809 | Val AUC: 0.6582
Epoch 4/6 | Loss: 17.7283 | Val AUC: 0.6644
Epoch 5/6 | Loss: 17.4395 | Val AUC: 0.6623
Epoch 6/6 | Loss: 17.4207 | Val AUC: 0.6581
Validation AUC: 0.65813

Predicting on TEST...
TEST AUC: 0.60039
Saved → submissions/submission_mlp_paraphrase-multilingual-mpnet-base-v2.csv

Running encoder: sentence-transformers/all-roberta-large-v1
Embedding unique rules...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embedding TRAIN...
Loading cached train body...
Loading cached train pos1...
Loading cached train pos2...
Loading cached train neg1...
Loading cached train neg2...

Embedding TEST...
Loading cached test body...
Loading cached test pos1...
Loading cached test pos2...
Loading cached test neg1...
Loading cached test neg2...

Combining TRAIN features...


  0%|          | 0/2029 [00:00<?, ?it/s]

Combining TEST features...


  0%|          | 0/54059 [00:00<?, ?it/s]


Training MLP...
Epoch 1/6 | Loss: 18.3385 | Val AUC: 0.6381
Epoch 2/6 | Loss: 17.8691 | Val AUC: 0.6449
Epoch 3/6 | Loss: 17.7179 | Val AUC: 0.6519
Epoch 4/6 | Loss: 17.6116 | Val AUC: 0.6615
Epoch 5/6 | Loss: 17.2910 | Val AUC: 0.6638
Epoch 6/6 | Loss: 17.3950 | Val AUC: 0.6682
Validation AUC: 0.66816

Predicting on TEST...
TEST AUC: 0.60845
Saved → submissions/submission_mlp_sentence-transformers_all-roberta-large-v1.csv
