# 05 — Fusion head
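Load the cached artifacts from the previous steps (text probabilities and embeddings, plus any available face, lexicon, emotion and audio probabilities) and train a late-fusion or hybrid-fusion head: first a logistic-regression baseline, then a modal-attention model.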
**TODO**: verify file presence in `cache/` before training.


In [2]:
import numpy as np
import json
import pandas as pd
from pathlib import Path
import sys

sys.path.append(str(Path('..').resolve()))  # <-- make utils importable
from utils.paths import CACHE_DIR


# Make sure the cache directory exists and report where artifacts are read from.
CACHE_DIR.mkdir(parents=True, exist_ok=True)
print(f'Reading cache from: {CACHE_DIR.resolve()}')

# Value written into feature cells that are still missing after the modal joins.
_DEF_FILL = 0.0

# Metadata columns stripped from every modal frame before it is joined.
_DROP_COLS = {'label', 'split', 'text', 'raw_text', 'file_path', 'path'}

LEX_PROBS_PATH = CACHE_DIR / 'lex_probs.parquet'
# LEX_PROBS_PATH = CACHE_DIR / 'lex_probs_lightgbm.parquet'  # Uncomment to use LightGBM export

# Audio probabilities: use the phase-1 artifact when it exists, else the cache copy.
_audio_artifact_path = Path('../artifacts/phase1/audio_cnn/per_sample_probs.parquet')
AUDIO_PROBS_PATH = (
    _audio_artifact_path
    if _audio_artifact_path.exists()
    else CACHE_DIR / 'audio_probs.parquet'
)

# Modality prefix -> parquet file. Insertion order is the order the files are joined in.
_MODAL_FILES = dict(
    face=CACHE_DIR / 'face_probs_resnet50_gpu.parquet',
    lex=LEX_PROBS_PATH,
    emotion=CACHE_DIR / 'emotion_probs.parquet',
    audio=AUDIO_PROBS_PATH,
)

def _maybe_read(path: Path) -> pd.DataFrame:
    """Read a cached modal parquet file, tolerating its absence.

    Returns the parquet contents when ``path`` exists. Otherwise a warning is
    printed and an empty frame with a single ``sample_id`` column is returned.
    """
    if not path.exists():
        print(f"[WARN] Missing cached modal features at {path}")
        return pd.DataFrame(columns=['sample_id'])
    return pd.read_parquet(path)


def _prepare_modal(df: pd.DataFrame, prefix: str, sample_ids: pd.Index) -> pd.DataFrame:
    """Restrict a modal frame to the requested samples and namespace its columns.

    Parameters
    ----------
    df : pd.DataFrame
        Modal features; must contain a ``sample_id`` column unless it is empty.
    prefix : str
        Modality name. Every column other than ``sample_id`` that does not
        already start with ``f"{prefix}_"`` is renamed to ``f"{prefix}_{col}"``.
    sample_ids : pd.Index
        Identifiers to keep.

    Returns
    -------
    pd.DataFrame
        Rows sorted by ``sample_id`` with one row per id and the columns listed
        in ``_DROP_COLS`` removed. An empty input is returned unchanged.
    """
    if df.empty:
        return df
    df = df[df['sample_id'].isin(sample_ids)].copy()
    drop_cols = [c for c in _DROP_COLS if c in df.columns]
    if drop_cols:
        df = df.drop(columns=drop_cols)
    # A stable sort keeps duplicate ids in their original relative order, so
    # keep='last' always selects the row that appeared last in the file.
    # The default quicksort makes no such guarantee.
    df = df.sort_values('sample_id', kind='mergesort').drop_duplicates(subset='sample_id', keep='last')
    own_prefix = f"{prefix}_"
    rename_map = {
        c: f"{prefix}_{c}"
        for c in df.columns
        if c != 'sample_id' and not c.startswith(own_prefix)
    }
    return df.rename(columns=rename_map)


def load_feats(split: str = 'train'):
    """Build the fusion feature table and load the text embeddings for one split.

    The text-probability parquet of the split is the base table. Every file in
    ``_MODAL_FILES`` that has rows is left-joined onto it by ``sample_id``,
    remaining gaps are filled with ``_DEF_FILL``, and the ``label`` column is
    attached after the fill so it is never altered by it.

    Parameters
    ----------
    split : str
        Case-insensitive split name; ``'val'`` is accepted for ``'validation'``.

    Returns
    -------
    tuple
        ``(features, embeddings)`` where ``features`` is a DataFrame with
        ``sample_id``, the feature columns and ``label``, and ``embeddings`` is
        the array loaded from ``text_emb_{split}.npy``. ``(None, None)`` is
        returned when the split is ``'test'`` and one of its two files is missing.

    Raises
    ------
    FileNotFoundError
        A required file is missing for a split other than ``'test'``.
    ValueError
        The number of embeddings differs from the number of feature rows.
    """
    split = split.lower()
    if split in {'val', 'validation'}:
        split = 'validation'
    text_path = CACHE_DIR / f"text_probs_{split}.parquet"
    emb_path = CACHE_DIR / f"text_emb_{split}.npy"
    if not text_path.exists():
        if split == 'test':
            print(f"[WARN] Missing text probabilities for split='{split}' at {text_path}; skipping.")
            return None, None
        raise FileNotFoundError(f"Missing text probabilities for split='{split}' at {text_path}")
    if not emb_path.exists():
        if split == 'test':
            print(f"[WARN] Missing text embeddings for split='{split}' at {emb_path}; skipping.")
            return None, None
        raise FileNotFoundError(f"Missing text embeddings for split='{split}' at {emb_path}")

    base = pd.read_parquet(text_path).copy()

    labels = base.set_index('sample_id')['label'].copy()
    drop_from_base = [c for c in ('split', 'text', 'raw_text') if c in base.columns]
    if drop_from_base:
        base = base.drop(columns=drop_from_base)
    features = base.set_index('sample_id').drop(columns=['label'])
    sample_ids = features.index

    for prefix, path in _MODAL_FILES.items():
        raw_modal = _maybe_read(path)
        if raw_modal.empty:
            # Missing or empty file: it contributes nothing to any split.
            continue
        modal = _prepare_modal(raw_modal, prefix, sample_ids)
        # Join even when no row of this modality belongs to the current split.
        # The columns are then created and filled with _DEF_FILL below, so every
        # split exposes the same feature columns. Skipping here would leave this
        # split without columns that other splits have.
        features = features.join(modal.set_index('sample_id'), how='left')

    features = features.fillna(_DEF_FILL)
    features['label'] = labels
    features = features.reset_index().rename(columns={'index': 'sample_id'})

    # Embeddings carry no ids; they are matched to the feature rows by position only.
    embeddings = np.load(emb_path)
    if len(features) != len(embeddings):
        raise ValueError(
            f"Embedding count ({len(embeddings)}) does not match dataframe rows ({len(features)})."
        )

    return features, embeddings


# Load the three splits in order. Only 'test' may come back as (None, None);
# a missing train/validation file raises inside load_feats.
(train_feats, train_emb), (val_feats, val_emb), (test_feats, test_emb) = (
    load_feats(split_name) for split_name in ('train', 'validation', 'test')
)

if train_feats is None or val_feats is None:
    raise RuntimeError('Training and validation splits are required to train the fusion model.')

split_sizes = {'train_rows': len(train_feats), 'val_rows': len(val_feats)}
if test_feats is None:
    print('[WARN] Test split artifacts not found; fusion evaluation will skip test metrics.')
else:
    split_sizes['test_rows'] = len(test_feats)

print(split_sizes)


Reading cache from: /Users/mixberries13/Desktop/is424/G2T2-emojis-to-emotions/cache
[WARN] Missing cached modal features at /Users/mixberries13/Desktop/is424/G2T2-emojis-to-emotions/cache/audio_probs.parquet
[WARN] Missing cached modal features at /Users/mixberries13/Desktop/is424/G2T2-emojis-to-emotions/cache/audio_probs.parquet
[WARN] Missing cached modal features at /Users/mixberries13/Desktop/is424/G2T2-emojis-to-emotions/cache/audio_probs.parquet
{'train_rows': 36601, 'val_rows': 4575, 'test_rows': 4576}


In [3]:
from pathlib import Path
import json
# Optional sanity check: show the headline numbers of the audio CNN if they were exported.
audio_metrics_path = Path('../artifacts/phase1/audio_cnn/validation_metrics.json')
_audio_summary_keys = ['average_val_accuracy', 'std_val_accuracy', 'average_clean_train_accuracy']
if not audio_metrics_path.exists():
    print('[WARN] audio validation metrics not found at', audio_metrics_path)
else:
    with audio_metrics_path.open() as f:
        audio_metrics = json.load(f)
    print('Loaded audio validation metrics:')
    # Only the keys that are actually present are shown.
    print({key: audio_metrics[key] for key in _audio_summary_keys if key in audio_metrics})


[WARN] audio validation metrics not found at ../artifacts/phase1/audio_cnn/validation_metrics.json


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import joblib

# NOTE: despite its name this is the label of the *negative* (non-bullying)
# class. Targets are built as `label != BULLY_LABEL`, so every other label
# maps to 1 (bullying).
BULLY_LABEL = 'not_cyberbullying'

# Columns that must never be used as model inputs: the target, and the row
# identifier, which the numeric selection below would pick up whenever the
# ids are stored as integers.
_NON_FEATURE_COLS = {'label', 'sample_id'}

numeric_cols = train_feats.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in numeric_cols if c not in _NON_FEATURE_COLS]


def build_matrix(df, emb, cols=None):
    """Assemble the dense float32 design matrix for the fusion classifier.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table containing every column named in ``cols``.
    emb : array-like or None
        Optional embeddings appended to the right of the tabular features.
        A 1-D input is treated as a single embedding column.
    cols : sequence of str, optional
        Columns to take from ``df``. Defaults to the notebook-level
        ``feature_cols``. Pass it explicitly to reuse the function with a
        saved column list.

    Returns
    -------
    np.ndarray
        Array of shape ``(len(df), len(cols) + embedding_width)``, dtype float32.

    Raises
    ------
    ValueError
        ``emb`` does not have one row per row of ``df``.
    """
    if cols is None:
        cols = feature_cols
    modal = df[list(cols)].to_numpy(dtype='float32', copy=True)
    if emb is None:
        return modal
    emb = np.asarray(emb).astype('float32')
    if emb.ndim == 1:
        emb = emb[:, None]
    if emb.shape[0] != modal.shape[0]:
        # Rows are matched by position, so a length mismatch means the inputs are misaligned.
        raise ValueError(
            f"Embedding rows ({emb.shape[0]}) do not match feature rows ({modal.shape[0]})."
        )
    return np.concatenate([modal, emb], axis=1)


def _binary_target(frame):
    """Return int32 targets: 1 for every label other than BULLY_LABEL, else 0."""
    return (frame['label'].astype(str) != BULLY_LABEL).to_numpy(dtype='int32')


def _print_eval(header, tag, y_true, y_pred, y_prob):
    """Print the classification report, ROC-AUC and confusion matrix for one split."""
    print(header)
    print(classification_report(y_true, y_pred, target_names=['non_bullying', 'bullying']))
    print(f'ROC-AUC ({tag}):', roc_auc_score(y_true, y_prob))
    print(f'Confusion matrix ({tag}):')
    print(confusion_matrix(y_true, y_pred))


# Design matrices; the test split is optional.
X_train = build_matrix(train_feats, train_emb)
X_val = build_matrix(val_feats, val_emb)
if test_feats is not None and test_emb is not None:
    X_test = build_matrix(test_feats, test_emb)
else:
    X_test = None

y_train = _binary_target(train_feats)
y_val = _binary_target(val_feats)
y_test = None if test_feats is None else _binary_target(test_feats)

# Scale to unit variance without centring; statistics are fitted on train only.
scaler = StandardScaler(with_mean=False)
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = None if X_test is None else scaler.transform(X_test)

fusion_clf = LogisticRegression(max_iter=1000, class_weight='balanced')
fusion_clf.fit(X_train_scaled, y_train)

val_pred = fusion_clf.predict(X_val_scaled)
val_prob = fusion_clf.predict_proba(X_val_scaled)[:, 1]
_print_eval('Validation metrics:', 'val', y_val, val_pred, val_prob)

if X_test_scaled is None:
    test_pred = None
    test_prob = None
else:
    test_pred = fusion_clf.predict(X_test_scaled)
    test_prob = fusion_clf.predict_proba(X_test_scaled)[:, 1]

print()
if test_pred is None:
    print('[WARN] Test split unavailable; skipping test evaluation.')
else:
    _print_eval('Test metrics:', 'test', y_test, test_pred, test_prob)


Validation metrics:
              precision    recall  f1-score   support

non_bullying       0.62      0.75      0.68       784
    bullying       0.95      0.90      0.92      3791

    accuracy                           0.88      4575
   macro avg       0.78      0.83      0.80      4575
weighted avg       0.89      0.88      0.88      4575

ROC-AUC (val): 0.9252879402882228
Confusion matrix (val):
[[ 586  198]
 [ 366 3425]]

Test metrics:
              precision    recall  f1-score   support

non_bullying       0.60      0.74      0.66       785
    bullying       0.94      0.90      0.92      3791

    accuracy                           0.87      4576
   macro avg       0.77      0.82      0.79      4576
weighted avg       0.88      0.87      0.88      4576

ROC-AUC (test): 0.9288077528575052
Confusion matrix (test):
[[ 579  206]
 [ 382 3409]]


In [5]:
from pathlib import Path

fusion_artifact_dir = Path('../artifacts/phase1/fusion')
fusion_artifact_dir.mkdir(parents=True, exist_ok=True)

# Persist the scaler, the classifier and the column order they were fitted on.
_logreg_path = fusion_artifact_dir / 'fusion_logreg.joblib'
joblib.dump({'scaler': scaler, 'model': fusion_clf, 'feature_cols': feature_cols}, _logreg_path)
print(f"Saved fusion model to {_logreg_path}")


def _summarise(y_true, y_pred, y_prob):
    """Return ROC-AUC and accuracy as plain floats so they serialise to JSON."""
    return {
        'roc_auc': float(roc_auc_score(y_true, y_prob)),
        'accuracy': float((y_pred == y_true).mean()),
    }


val_results = _summarise(y_val, val_pred, val_prob)
metrics = {'validation': val_results}
if test_pred is not None and test_prob is not None:
    test_results = _summarise(y_test, test_pred, test_prob)
    metrics['test'] = test_results

(fusion_artifact_dir / 'metrics.json').write_text(json.dumps(metrics, indent=2))
print(
    'Saved validation and test metrics.'
    if 'test' in metrics
    else 'Saved validation metrics (test split unavailable).'
)


Saved fusion model to ../artifacts/phase1/fusion/fusion_logreg.joblib
Saved validation and test metrics.


In [6]:
# Modal attention fusion (TensorFlow)
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, roc_auc_score
import json

# Width of each modality token and of the fusion dense layer.
ATTN_DIM = 128
# Number of attention heads; the per-head key size is ATTN_DIM // NUM_HEADS.
NUM_HEADS = 4
# Dropout rate inside the attention layer.
ATTN_DROPOUT = 0.1
# Dropout rate applied before and after the fusion dense layer.
DENSE_DROPOUT = 0.3


def _starts_with(prefix):
    """Return a predicate that is true for column names beginning with ``prefix``."""
    def matcher(c):
        return c.startswith(prefix)
    return matcher


# Modality name -> predicate selecting that modality's feature columns.
modal_prefixes = {
    name: _starts_with(prefix)
    for name, prefix in (
        ('text_prob', 'prob_'),
        ('face', 'face_'),
        ('audio', 'audio_'),
        ('lex', 'lex_'),
        ('emotion', 'emotion_'),
    )
}

def make_modal_arrays(df, emb, cols=None, matchers=None):
    """Split the feature table into one float32 array per modality.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table containing every column named in ``cols``.
    emb : array-like or None
        Optional text embeddings, returned under the key ``'text_emb'``.
    cols : sequence of str, optional
        Feature columns to distribute. Defaults to the notebook-level
        ``feature_cols``.
    matchers : mapping of str to callable, optional
        Modality name -> predicate on a column name. Defaults to the
        notebook-level ``modal_prefixes``.

    Returns
    -------
    dict
        Modality name -> array of that modality's columns, in ``matchers``
        order. Modalities with no matching column are omitted. Columns matched
        by no predicate are gathered under ``'other'``. ``'text_emb'`` is added
        last when ``emb`` is given.
    """
    if cols is None:
        cols = feature_cols
    if matchers is None:
        matchers = modal_prefixes
    cols = list(cols)

    arrays = {}
    for name, matcher in matchers.items():
        selected = [c for c in cols if matcher(c)]
        if selected:
            arrays[name] = df[selected].to_numpy(dtype='float32')

    # Anything no modality claims is kept rather than silently dropped.
    residual_cols = [c for c in cols if not any(m(c) for m in matchers.values())]
    if residual_cols:
        arrays['other'] = df[residual_cols].to_numpy(dtype='float32')

    if emb is not None:
        arrays['text_emb'] = np.asarray(emb).astype('float32')
    return arrays

train_modal = make_modal_arrays(train_feats, train_emb)
val_modal = make_modal_arrays(val_feats, val_emb)
if test_feats is not None and test_emb is not None:
    test_modal = make_modal_arrays(test_feats, test_emb)
else:
    test_modal = None

# Keep only modalities that have at least one column in train and are also
# present in validation (and in test, when a test split is available).
shared_modalities = []
for name, train_array in train_modal.items():
    if name not in val_modal or train_array.shape[1] <= 0:
        continue
    if test_modal is not None and name not in test_modal:
        continue
    shared_modalities.append(name)


def _as_keras_inputs(modal_arrays):
    """Key each shared modality's array by the name of its Keras input layer."""
    return {f"{name}_input": modal_arrays[name] for name in shared_modalities}


train_inputs = _as_keras_inputs(train_modal)
val_inputs = _as_keras_inputs(val_modal)
test_inputs = None if test_modal is None else _as_keras_inputs(test_modal)

# Seed Python, NumPy and the backend before any layer is created so that weight
# initialisation, dropout masks and batch shuffling are repeatable across
# Restart & Run All. Some GPU kernels may still be non-deterministic.
FUSION_SEED = 42
keras.utils.set_random_seed(FUSION_SEED)

# One input per shared modality. Each is projected to ATTN_DIM and reshaped
# into a single token of shape (1, ATTN_DIM).
inputs = []
modal_tokens = []
for name in shared_modalities:
    input_layer = layers.Input(shape=(train_modal[name].shape[1],), name=f"{name}_input")
    projected = layers.Dense(ATTN_DIM, activation='relu', name=f"{name}_proj")(input_layer)
    token = layers.Reshape((1, ATTN_DIM), name=f"{name}_token")(projected)
    inputs.append(input_layer)
    modal_tokens.append(token)

if not modal_tokens:
    raise RuntimeError('No modality features available for attention fusion.')

# Stack the tokens along axis 1. Concatenate needs at least two inputs, so a
# single modality is used as-is.
if len(modal_tokens) > 1:
    modal_sequence = layers.Concatenate(axis=1, name='modal_sequence')(modal_tokens)
else:
    modal_sequence = modal_tokens[0]

# Self-attention over the modality tokens (query and value are the same
# sequence), followed by a residual connection and mean pooling over tokens.
attn_output = layers.MultiHeadAttention(
    num_heads=NUM_HEADS,
    key_dim=ATTN_DIM // NUM_HEADS,
    dropout=ATTN_DROPOUT,
    name='modal_attention',
)(modal_sequence, modal_sequence)
residual = layers.Add(name='attn_residual')([modal_sequence, attn_output])
pooled = layers.GlobalAveragePooling1D(name='modal_pool')(residual)

# Classification head: dropout -> dense -> dropout -> sigmoid probability.
x = layers.Dropout(DENSE_DROPOUT, name='pre_dense_dropout')(pooled)
x = layers.Dense(ATTN_DIM, activation='relu', name='fusion_dense')(x)
x = layers.Dropout(DENSE_DROPOUT, name='post_dense_dropout')(x)
output = layers.Dense(1, activation='sigmoid', name='bullying_prob')(x)

attention_model = keras.Model(inputs=inputs, outputs=output, name='modal_attention_fusion')
attention_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    # The AUC metric is named 'roc_auc' so that it is logged as 'val_roc_auc' on validation data.
    metrics=[keras.metrics.AUC(name='roc_auc'), 'accuracy'],
)

# Balanced class weights, keyed by plain ints/floats as expected by `fit`.
_classes_present = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=_classes_present, y=y_train)
class_weight_map = {
    int(label): float(weight) for label, weight in zip(_classes_present, class_weights)
}

# Stop once validation ROC-AUC has not improved for 5 epochs and roll back to the best weights.
_early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_roc_auc',
    mode='max',
    patience=5,
    restore_best_weights=True,
)
callbacks = [_early_stopping]

history = attention_model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=128,
    epochs=40,
    verbose=2,
    callbacks=callbacks,
    validation_data=(val_inputs, y_val),
    class_weight=class_weight_map,
)

def _attention_scores(model_inputs):
    """Return the predicted probabilities and their 0.5-thresholded int32 labels."""
    probs = attention_model.predict(model_inputs, batch_size=256)
    return probs, (probs >= 0.5).astype('int32')


def _report_attention(header, tag, y_true, probs, preds):
    """Print the classification report and ROC-AUC of the attention model for one split."""
    print(header)
    print(classification_report(y_true, preds.ravel(), target_names=['non_bullying', 'bullying']))
    print(f'ROC-AUC ({tag}):', roc_auc_score(y_true, probs.ravel()))


val_probs_attn, val_pred_attn = _attention_scores(val_inputs)
_report_attention('Validation metrics (modal attention):', 'val', y_val, val_probs_attn, val_pred_attn)

if test_inputs is None:
    test_probs_attn = None
else:
    test_probs_attn, test_pred_attn = _attention_scores(test_inputs)
    _report_attention('\nTest metrics (modal attention):', 'test', y_test, test_probs_attn, test_pred_attn)

attention_dir = fusion_artifact_dir / 'modal_attention'
attention_dir.mkdir(parents=True, exist_ok=True)

# Persist the trained model and the flattened per-sample probabilities.
_attention_model_path = attention_dir / 'fusion_modal_attention.keras'
attention_model.save(_attention_model_path)
np.save(attention_dir / 'val_probabilities.npy', val_probs_attn.ravel())
if test_probs_attn is not None:
    np.save(attention_dir / 'test_probabilities.npy', test_probs_attn.ravel())


def _attention_summary(y_true, probs, preds):
    """Return ROC-AUC and accuracy as plain floats so they serialise to JSON."""
    return {
        'roc_auc': float(roc_auc_score(y_true, probs.ravel())),
        'accuracy': float((preds.ravel() == y_true).mean()),
    }


metrics = {'validation': _attention_summary(y_val, val_probs_attn, val_pred_attn)}
if test_probs_attn is not None:
    metrics['test'] = _attention_summary(y_test, test_probs_attn, test_pred_attn)

(attention_dir / 'metrics.json').write_text(json.dumps(metrics, indent=2))
print(f"Saved modal attention model to {_attention_model_path}")


2025-11-16 15:24:51.670625: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-11-16 15:24:51.670665: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-11-16 15:24:51.670668: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-11-16 15:24:51.670893: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-16 15:24:51.670910: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/40


2025-11-16 15:24:53.057677: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


286/286 - 17s - 60ms/step - accuracy: 0.9258 - loss: 0.1950 - roc_auc: 0.9769 - val_accuracy: 0.8745 - val_loss: 0.3338 - val_roc_auc: 0.9290
Epoch 2/40
286/286 - 12s - 44ms/step - accuracy: 0.9349 - loss: 0.1665 - roc_auc: 0.9824 - val_accuracy: 0.8798 - val_loss: 0.3480 - val_roc_auc: 0.9332
Epoch 3/40
286/286 - 12s - 43ms/step - accuracy: 0.9387 - loss: 0.1568 - roc_auc: 0.9844 - val_accuracy: 0.8667 - val_loss: 0.3543 - val_roc_auc: 0.9348
Epoch 4/40
286/286 - 12s - 43ms/step - accuracy: 0.9406 - loss: 0.1513 - roc_auc: 0.9855 - val_accuracy: 0.8785 - val_loss: 0.3270 - val_roc_auc: 0.9350
Epoch 5/40
286/286 - 12s - 43ms/step - accuracy: 0.9399 - loss: 0.1476 - roc_auc: 0.9863 - val_accuracy: 0.8739 - val_loss: 0.3380 - val_roc_auc: 0.9354
Epoch 6/40
286/286 - 13s - 44ms/step - accuracy: 0.9430 - loss: 0.1426 - roc_auc: 0.9872 - val_accuracy: 0.8726 - val_loss: 0.3425 - val_roc_auc: 0.9351
Epoch 7/40
286/286 - 13s - 44ms/step - accuracy: 0.9433 - loss: 0.1449 - roc_auc: 0.9867 - va