# Traditional ML (SVM Features) for Chinese NLI

Extract hand-crafted features and train classifiers (LogReg, SGD, SVM, MLP).

**Instructions:**
1. Upload `NNP.zip` to your Google Drive at `MyDrive/NNP/NNP.zip` (i.e. inside a Drive folder named `NNP`)
   - Create locally: `cd ~/Desktop/uni && zip -r NNP.zip NNP/ -x 'NNP/.venv/*' 'NNP/.git/*' 'NNP/results/*'`
2. Run all cells (a GPU is not required; jieba segmentation and the scikit-learn models run on the CPU)

In [None]:
# Mount Google Drive so the project archive stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

import os
import sys
import zipfile

ZIP_PATH = '/content/drive/MyDrive/NNP/NNP.zip'
LOCAL_DIR = '/content/NNP'

# Unpack the archive into /content; LOCAL_DIR is the folder it is expected
# to create there.
with zipfile.ZipFile(ZIP_PATH, mode='r') as archive:
    archive.extractall(path='/content')

# Work from inside the project and put it first on the import path so the
# project modules (config, features, ...) can be imported by later cells.
os.chdir(LOCAL_DIR)
sys.path.insert(0, LOCAL_DIR)
print(f'Working directory: {os.getcwd()}')

In [None]:
!pip install -q jieba scikit-learn tqdm scipy

In [None]:
import time
import numpy as np
import pandas as pd

import config
from data_loader import load_and_split
from features import build_features
from models.svm import build_classifier, grid_search_svm
from evaluate import compute_metrics

## Load Data

In [None]:
# Load the three splits plus the label encoder object; its `classes_`
# attribute supplies the class names used when computing metrics.
print('Loading data...')
train, val, test, le = load_and_split()
label_names = [*le.classes_]

# One-line summary of split sizes and the number of classes.
summary = {
    'Train': len(train),
    'Val': len(val),
    'Test': len(test),
    'Classes': len(label_names),
}
print('  '.join(f'{title}: {count}' for title, count in summary.items()))

## Extract Features

In [None]:
# Switches for the optional feature families, forwarded to build_features.
USE_RADICALS = True   # set to False if data/radical_map.json is missing
USE_DEPENDENCY = False  # requires spaCy zh_core_web_sm

print('Extracting features...')
started = time.time()
# Texts are passed in train / val / test order, matching the order in which
# the three feature matrices are unpacked below.
split_texts = [split['text'].tolist() for split in (train, val, test)]
X_train, X_val, X_test, _ = build_features(
    *split_texts,
    use_radicals=USE_RADICALS,
    use_dependency=USE_DEPENDENCY,
)
print(f'Feature extraction took {time.time() - started:.1f}s')
print(f'Feature matrix shape: {X_train.shape}')

# Label arrays for each split, taken from the 'label' column.
y_train, y_val, y_test = (split['label'].values for split in (train, val, test))

## Train All Classifiers

In [None]:
# Train each classifier on the same feature matrices and collect one row of
# validation/test metrics per model in `rows` for the results table.
CLASSIFIERS = ['logreg', 'sgd', 'svm', 'mlp']

rows = []
for name in CLASSIFIERS:
    banner = '=' * 50
    print(f'\n{banner}')
    print(f'Training: {name}')
    print(banner)

    clf = build_classifier(name)
    t0 = time.time()
    # Fit on the matrices returned by build_features (X_train / X_val /
    # X_test); no *_raw variants are defined anywhere in this notebook.
    clf.fit(X_train, y_train)
    train_time = time.time() - t0
    print(f'Training took {train_time:.1f}s')

    val_metrics = compute_metrics(y_val, clf.predict(X_val), label_names)
    test_metrics = compute_metrics(y_test, clf.predict(X_test), label_names)

    row = {
        'model': name,
        'val_acc': val_metrics['accuracy'],
        'val_f1': val_metrics['macro_f1'],
        'val_wf1': val_metrics['weighted_f1'],
        'test_acc': test_metrics['accuracy'],
        'test_f1': test_metrics['macro_f1'],
        'test_wf1': test_metrics['weighted_f1'],
        # Stored pre-formatted (one decimal) so it appears as-is in the table/CSV.
        'time_s': f'{train_time:.1f}',
    }
    rows.append(row)
    print(f"  val_acc={row['val_acc']:.4f}  val_f1={row['val_f1']:.4f}")
    print(f"  test_acc={row['test_acc']:.4f}  test_f1={row['test_f1']:.4f}")

## Results

In [None]:
# One row per classifier, built from the dictionaries collected during training.
results = pd.DataFrame(rows)

# Every metric column is rendered with four decimals in the notebook view.
metric_columns = ['val_acc', 'val_f1', 'val_wf1', 'test_acc', 'test_f1', 'test_wf1']
styled = results.style.format({column: '{:.4f}' for column in metric_columns})
display(styled.set_caption('Traditional ML Results (hand-crafted features)'))

# Save CSV
config.RESULTS_DIR.mkdir(exist_ok=True)
csv_path = config.RESULTS_DIR / 'svm_results.csv'
results.to_csv(csv_path, index=False)
print(f'Saved {csv_path}')

In [None]:
# ── Bar chart ─────────────────────────────────────────────────────────
import matplotlib.pyplot as plt

# Grouped bars: test accuracy on the left and test macro-F1 on the right of
# each model's tick position.
bar_width = 0.35
half = bar_width / 2
positions = range(len(results))
test_acc = results['test_acc']
test_f1 = results['test_f1']

fig, ax = plt.subplots(figsize=(8, 4))
ax.bar([pos - half for pos in positions], test_acc, bar_width, label='Test Accuracy')
ax.bar([pos + half for pos in positions], test_f1, bar_width, label='Test Macro-F1')
ax.set_xticks(positions)
ax.set_xticklabels(results['model'])
ax.set_ylabel('Score')
ax.set_title('Traditional ML — Hand-crafted Features')
ax.legend()

# Print each score slightly above the top of its bar.
for pos, (acc, f1) in enumerate(zip(test_acc, test_f1)):
    ax.text(pos - half, acc + 0.01, f'{acc:.3f}', ha='center', fontsize=8)
    ax.text(pos + half, f1 + 0.01, f'{f1:.3f}', ha='center', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# ── LaTeX table ───────────────────────────────────────────────────────
# Assemble the table as preamble + one row per model + closing lines.
table_head = [
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Traditional ML results (hand-crafted features).}',
    r'\label{tab:svm-results}',
    r'\begin{tabular}{lrrrr}',
    r'\toprule',
    r'\textbf{Model} & \textbf{Val Acc} & \textbf{Val F1} & \textbf{Test Acc} & \textbf{Test F1} \\',
    r'\midrule',
]
table_rows = [
    f"{rec.model} & {rec.val_acc:.4f} & {rec.val_f1:.4f} & "
    f"{rec.test_acc:.4f} & {rec.test_f1:.4f} \\\\"
    for rec in results.itertuples(index=False)
]
table_tail = [r'\bottomrule', r'\end{tabular}', r'\end{table}']
lines = table_head + table_rows + table_tail

tex = '\n'.join(lines)
print(tex)

# Write the same text that was printed next to the CSV results.
tex_path = config.RESULTS_DIR / 'svm_results.tex'
tex_path.write_text(tex)
print(f'\nSaved {tex_path}')

In [None]:
# Copy results to Drive
from pathlib import Path
import shutil

# Mirror everything in the results directory into a Drive folder so it
# survives the Colab session.
drive_results = Path('/content/drive/MyDrive/NNP_results')
drive_results.mkdir(parents=True, exist_ok=True)
for entry in config.RESULTS_DIR.iterdir():
    destination = drive_results / entry.name
    if entry.is_dir():
        # copy2 only handles regular files and would raise on a folder;
        # copy sub-directories recursively, overwriting earlier copies.
        shutil.copytree(entry, destination, dirs_exist_ok=True)
    else:
        shutil.copy2(entry, destination)
print(f'Copied results to {drive_results}')

## Feature Ablation Study

Extract each of the 9 feature groups independently, then run 19 MLP trainings:
- 1 full baseline (all 9 groups)
- 9 leave-one-out (drop one group at a time, measure F1 drop)
- 9 individual (each group alone)

Pipeline per run: `hstack → TruncatedSVD(300) → StandardScaler → MLPClassifier(512, 256)`

In [None]:
from collections import OrderedDict
import json
import scipy.sparse as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

from features.ngrams import char_ngram_vectorizer, word_ngram_vectorizer
from features.pos_tags import extract_pos_features
from features.pos_ngrams import pos_ngram_vectorizer, pos_to_sequences
from features.function_words import extract_function_word_features
from features.particles import extract_particle_features
from features.discourse import extract_discourse_features
from features.lexical_richness import extract_lexical_richness_features
from features.segmentation import extract_segmentation_features

# Feature-group key -> display name. Insertion order is the order in which
# the groups are listed in the ablation tables.
GROUPS = OrderedDict(
    char='Char n-grams',
    word='Word n-grams',
    pos='POS tags',
    pos_ng='POS n-grams',
    func='Function words',
    part='Particles',
    disc='Discourse',
    lex='Lexical richness',
    seg='Segmentation',
)

# Plain Python lists of the raw texts; only the train and test splits are
# extracted here.
train_texts, test_texts = (split['text'].tolist() for split in (train, test))

In [None]:
# ── Extract all 9 feature groups independently ───────────────────────
def _sparse(arr):
    """Return *arr* converted to a SciPy CSR sparse matrix."""
    as_csr = sp.csr_matrix(arr)
    return as_csr

# blocks[key] = (train_matrix, test_matrix) for each feature group in GROUPS.
blocks = {}
print('Extracting 9 feature groups independently...')
extract_start = time.time()


def _dense_pair(extractor):
    """Apply *extractor* to the train texts, then the test texts, as CSR matrices."""
    return _sparse(extractor(train_texts)), _sparse(extractor(test_texts))


print('  [1/9] Character n-gram TF-IDF...')
char_vec = char_ngram_vectorizer()
# Vectorizers are fitted on the train texts only and then applied to test.
blocks['char'] = (char_vec.fit_transform(train_texts), char_vec.transform(test_texts))

print('  [2/9] Word n-gram TF-IDF (jieba)...')
word_vec, jieba_tok = word_ngram_vectorizer()
# NOTE(review): set_total/close look like progress reporting around each
# pass of the tokenizer - confirm in features.ngrams.
jieba_tok.set_total(len(train_texts), desc='    fit (train)')
word_train = word_vec.fit_transform(train_texts)
jieba_tok.close()
jieba_tok.set_total(len(test_texts), desc='    transform (test)')
word_test = word_vec.transform(test_texts)
jieba_tok.close()
blocks['word'] = (word_train, word_test)

print('  [3/9] POS tag distributions...')
blocks['pos'] = _dense_pair(extract_pos_features)

print('  [4/9] POS n-gram TF-IDF...')
pos_train_seq = pos_to_sequences(train_texts, desc='    POS seq (train)')
pos_test_seq = pos_to_sequences(test_texts, desc='    POS seq (test)')
pos_vec = pos_ngram_vectorizer()
blocks['pos_ng'] = (pos_vec.fit_transform(pos_train_seq), pos_vec.transform(pos_test_seq))

print('  [5/9] Function word frequencies...')
blocks['func'] = _dense_pair(extract_function_word_features)

print('  [6/9] Particle context features...')
blocks['part'] = _dense_pair(extract_particle_features)

print('  [7/9] Discourse connectives & sentence features...')
blocks['disc'] = _dense_pair(extract_discourse_features)

print('  [8/9] Lexical richness features...')
blocks['lex'] = _dense_pair(extract_lexical_richness_features)

print('  [9/9] Segmentation-derived features...')
blocks['seg'] = _dense_pair(extract_segmentation_features)

print(f'\nExtraction took {time.time() - extract_start:.1f}s')
# Report the width (number of columns) of each group's train matrix.
for group_key, group_label in GROUPS.items():
    n_dims = blocks[group_key][0].shape[1]
    print(f'  {group_label:<20s} {n_dims:>6d} dims')

In [None]:
# ── Helper: combine selected blocks and train MLP ────────────────────
def combine_blocks(blocks, keys):
    """Horizontally stack the selected feature groups.

    ``blocks`` maps a group key to a ``(train_matrix, test_matrix)`` pair and
    ``keys`` lists the groups to include, in column order. Returns the
    stacked ``(train, test)`` matrices in CSR format.
    """
    def _stack(split_index):
        # split_index 0 selects the train matrices, 1 the test matrices.
        selected = [blocks[key][split_index] for key in keys]
        return sp.hstack(selected, format='csr')

    return _stack(0), _stack(1)

def train_and_predict(X_train, y_train, X_test, run_label=''):
    """Fit the SVD -> scaler -> MLP pipeline on the train data and predict the test data.

    The SVD size is capped at 300 and never exceeds the number of train rows
    or columns. ``run_label`` is only used in the one-line progress message.
    Returns the predicted labels for ``X_test``.
    """
    n_samples, n_features = X_train.shape
    n_components = min(300, n_features, n_samples)

    # Both transformers are fitted on the train matrix only and then applied
    # unchanged to the test matrix.
    reducer = TruncatedSVD(n_components=n_components, random_state=config.RANDOM_SEED)
    scaler = StandardScaler()
    train_ready = scaler.fit_transform(reducer.fit_transform(X_train))
    test_ready = scaler.transform(reducer.transform(X_test))
    explained = reducer.explained_variance_ratio_.sum()

    model = MLPClassifier(hidden_layer_sizes=(512, 256), early_stopping=True,
                          random_state=config.RANDOM_SEED, max_iter=300)
    model.fit(train_ready, y_train)

    print(f'  {run_label}: {n_features} feats -> {n_components} SVD '
          f'({explained:.1%} var), MLP epoch {model.n_iter_}')
    return model.predict(test_ready)

# ── Run all 19 experiments ───────────────────────────────────────────
keys = list(GROUPS)
ablation = {}


def _score_subset(subset, run_label):
    """Train on the given feature groups and return the metrics on the test split."""
    X_sub_train, X_sub_test = combine_blocks(blocks, subset)
    predictions = train_and_predict(X_sub_train, y_train, X_sub_test, run_label)
    return compute_metrics(y_test, predictions, label_names)


# Full baseline
print('== Full baseline (all 9 groups) ==')
full_scores = _score_subset(keys, 'ALL')
ablation['full'] = {'accuracy': full_scores['accuracy'],
                    'macro_f1': full_scores['macro_f1']}
print(f'  -> Acc={full_scores["accuracy"]:.4f}  F1={full_scores["macro_f1"]:.4f}')
full_f1 = full_scores['macro_f1']

# Leave-one-out: retrain without one group; delta_f1 < 0 means the score
# fell when that group was removed.
print('\n== Leave-one-out ==')
loo = {}
for dropped in keys:
    remaining = [k for k in keys if k != dropped]
    scores = _score_subset(remaining, f'LOO-{dropped}')
    delta = scores['macro_f1'] - full_f1
    loo[dropped] = {'accuracy': scores['accuracy'],
                    'macro_f1': scores['macro_f1'],
                    'delta_f1': delta}
    print(f'  -> Acc={scores["accuracy"]:.4f}  F1={scores["macro_f1"]:.4f}  dF1={delta:+.4f}')
ablation['loo'] = loo

# Individual: train on each group by itself.
print('\n== Individual groups ==')
ind = {}
for solo in keys:
    scores = _score_subset([solo], f'SOLO-{solo}')
    ind[solo] = {'accuracy': scores['accuracy'], 'macro_f1': scores['macro_f1']}
    print(f'  -> Acc={scores["accuracy"]:.4f}  F1={scores["macro_f1"]:.4f}')
ablation['individual'] = ind

In [None]:
# ── Results table ─────────────────────────────────────────────────────
def _ablation_row(group_key, group_name):
    """Build one table row from the individual and leave-one-out results of a group."""
    solo = ablation['individual'][group_key]
    without = ablation['loo'][group_key]
    return {
        'Group': group_name,
        'Ind Acc':  solo['accuracy'],
        'Ind F1':   solo['macro_f1'],
        'LOO Acc':  without['accuracy'],
        'LOO F1':   without['macro_f1'],
        'dF1':      without['delta_f1'],
    }


abl_rows = [_ablation_row(key, name) for key, name in GROUPS.items()]
abl_df = pd.DataFrame(abl_rows)

baseline = ablation['full']
print(f"Full baseline: Acc={baseline['accuracy']:.4f}  "
      f"F1={baseline['macro_f1']:.4f}\n")

# Four decimals everywhere; the delta column always shows its sign.
column_formats = {col: '{:.4f}' for col in ('Ind Acc', 'Ind F1', 'LOO Acc', 'LOO F1')}
column_formats['dF1'] = '{:+.4f}'
display(abl_df.style.format(column_formats).set_caption('Feature Group Ablation (MLP)'))

In [None]:
# ── Bar chart: LOO F1 drop per group ─────────────────────────────────
# Order groups by leave-one-out delta, most negative first.
ranked = sorted(ablation['loo'].items(), key=lambda item: item[1]['delta_f1'])
names = [GROUPS[group_key] for group_key, _scores in ranked]
deltas = [scores['delta_f1'] for _group_key, scores in ranked]

# Red bars mark a negative delta, green bars a zero or positive one.
colors = []
for delta in deltas:
    colors.append('#d62728' if delta < 0 else '#2ca02c')

fig, ax = plt.subplots(figsize=(8, 5))
y_pos = range(len(names))
ax.barh(y_pos, deltas, color=colors, edgecolor='none', height=0.6)
ax.set_yticks(y_pos)
ax.set_yticklabels(names, fontsize=9)
ax.set_xlabel('dF1 (drop from full model)')
ax.set_title('Feature Group Importance (Leave-One-Out F1 Drop)')
ax.axvline(0, color='black', linewidth=0.5)

# Put each value just beyond the end of its bar, on the side the bar points to.
for row, delta in enumerate(deltas):
    if delta < 0:
        align, nudge = 'right', -0.001
    else:
        align, nudge = 'left', 0.001
    ax.text(delta + nudge, row, f'{delta:+.4f}', va='center', ha=align, fontsize=8)
fig.tight_layout()

# Save a raster copy (150 dpi) and a PDF copy before showing the figure.
path_png = config.RESULTS_DIR / 'ablation_feature_importance.png'
path_pdf = config.RESULTS_DIR / 'ablation_feature_importance.pdf'
fig.savefig(path_png, dpi=150, bbox_inches='tight')
fig.savefig(path_pdf, bbox_inches='tight')
plt.show()
print(f'Saved {path_png}')

In [None]:
# ── LaTeX table + JSON ────────────────────────────────────────────────
# The full-model scores go into the table caption.
full_acc = ablation['full']['accuracy']
full_f1  = ablation['full']['macro_f1']

table_head = [
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Feature group ablation study (MLP). '
    r'Full model: Acc=%.4f, F1=%.4f.}' % (full_acc, full_f1),
    r'\label{tab:feature-ablation}',
    r'\begin{tabular}{lrrrrr}',
    r'\toprule',
    r'\textbf{Feature Group} & \textbf{Ind.\ Acc} & \textbf{Ind.\ F1} '
    r'& \textbf{LOO Acc} & \textbf{LOO F1} & \textbf{$\Delta$F1} \\',
    r'\midrule',
]

# One row per feature group, in GROUPS order.
table_rows = []
for group_key, group_name in GROUPS.items():
    solo = ablation['individual'][group_key]
    without = ablation['loo'][group_key]
    table_rows.append(
        f"{group_name} & {solo['accuracy']:.4f} & {solo['macro_f1']:.4f} "
        f"& {without['accuracy']:.4f} & {without['macro_f1']:.4f} "
        f"& {without['delta_f1']:+.4f} \\\\"
    )

lines = table_head + table_rows + [r'\bottomrule', r'\end{tabular}', r'\end{table}']
tex = '\n'.join(lines)
print(tex)

tex_path = config.RESULTS_DIR / 'ablation_feature_importance.tex'
tex_path.write_text(tex)
print(f'\nSaved {tex_path}')

# Save JSON
json_path = config.RESULTS_DIR / 'ablation_results.json'
json_path.write_text(json.dumps(ablation, indent=2))
print(f'Saved {json_path}')

In [None]:
# Copy updated results to Drive
# Re-sync the results directory (now including the ablation outputs) into
# the Drive folder created earlier in the notebook.
for entry in config.RESULTS_DIR.iterdir():
    destination = drive_results / entry.name
    if entry.is_dir():
        # copy2 only handles regular files and would raise on a folder;
        # copy sub-directories recursively, overwriting earlier copies.
        shutil.copytree(entry, destination, dirs_exist_ok=True)
    else:
        shutil.copy2(entry, destination)
print(f'Copied results to {drive_results}')