# Traditional ML (SVM Features) for Chinese NLI

Extract hand-crafted features and train classifiers (LogReg, SGD, SVM, MLP).

**Instructions:**
1. Upload `NNP.zip` to the top level of your Google Drive, so that it is available at `MyDrive/NNP.zip`
   - Create locally: `cd ~/Desktop/uni && zip -r NNP.zip NNP/ -x 'NNP/.venv/*' 'NNP/.git/*' 'NNP/results/*'`
2. Run all cells (no GPU is required — jieba and scikit-learn run on the CPU)

In [None]:
# Mount Google Drive, unpack the project archive under /content, and make the
# project importable from this notebook. The cell is safe to re-run.
from google.colab import drive
drive.mount('/content/drive')

import os
import sys
import zipfile

ZIP_PATH = '/content/drive/MyDrive/NNP.zip'
LOCAL_DIR = '/content/NNP'

# config.py acts as the marker that the archive has already been unpacked,
# so a re-run does not extract everything again.
if not os.path.exists(os.path.join(LOCAL_DIR, 'config.py')):
    # Fail early with an actionable message instead of a bare zipfile error.
    if not os.path.isfile(ZIP_PATH):
        raise FileNotFoundError(
            f'{ZIP_PATH} not found. Upload NNP.zip to the top level of your '
            'Google Drive and re-run this cell.'
        )
    print('Extracting project to local disk...')
    with zipfile.ZipFile(ZIP_PATH, 'r') as zf:
        zf.extractall('/content')
    print('Done.')
else:
    print('Already extracted.')

os.chdir(LOCAL_DIR)
# Avoid stacking duplicate entries on sys.path when the cell is run again.
if LOCAL_DIR not in sys.path:
    sys.path.insert(0, LOCAL_DIR)
print(f'Working directory: {os.getcwd()}')

In [None]:
# Quietly (-q) install the packages named below into the current runtime.
!pip install -q jieba scikit-learn tqdm scipy

In [None]:
import time
import numpy as np
import pandas as pd

import config
from data_loader import load_and_split
from features import build_features
from models.svm import build_classifier, grid_search_svm
from evaluate import compute_metrics

## Load Data

In [None]:
# Fetch the train/validation/test splits from the project loader, together
# with an object exposing `classes_` (presumably a fitted label encoder —
# confirm in data_loader), then report the split sizes.
print('Loading data...')
train, val, test, le = load_and_split()
label_names = [*le.classes_]
n_train, n_val, n_test = map(len, (train, val, test))
print(f'Train: {n_train}  Val: {n_val}  Test: {n_test}  Classes: {len(label_names)}')

## Extract Features

In [None]:
# Feature-family switches forwarded to build_features().
USE_RADICALS = True   # set to False if data/radical_map.json is missing
USE_DEPENDENCY = False  # requires spaCy zh_core_web_sm

print('Extracting features...')
t0 = time.time()
# The 'text' column of each split is passed as a plain list, in
# train / val / test order; the fourth return value is not needed here.
X_train_raw, X_val_raw, X_test_raw, _ = build_features(
    *(frame['text'].tolist() for frame in (train, val, test)),
    use_radicals=USE_RADICALS,
    use_dependency=USE_DEPENDENCY,
)
elapsed = time.time() - t0
print(f'Feature extraction took {elapsed:.1f}s')
print(f'Feature matrix shape: {X_train_raw.shape}')

# Targets: the 'label' column of each split as an array.
y_train, y_val, y_test = (frame['label'].values for frame in (train, val, test))

## Dimensionality Reduction (TruncatedSVD)

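The raw TF-IDF features have ~120k dimensions. TruncatedSVD (similar to PCA, but it does not center the data, so it can operate directly on sparse matrices) reduces this to a dense, lower-dimensional representation (300 components in the cell below) that trains much faster. The reduced features are then standardized with `StandardScaler`.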

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

# Switch and target dimensionality for the optional SVD reduction step.
USE_SVD = True
SVD_COMPONENTS = 300

if not USE_SVD:
    # Hand the raw feature matrices to the classifiers unchanged.
    X_train, X_val, X_test = X_train_raw, X_val_raw, X_test_raw
    print(f'Skipping SVD — using raw features: {X_train.shape}')
else:
    print(f'Applying TruncatedSVD: {X_train_raw.shape[1]} → {SVD_COMPONENTS} dims...')
    t0 = time.time()
    svd = TruncatedSVD(n_components=SVD_COMPONENTS, random_state=42)
    scaler = StandardScaler()

    # Both transformers are fitted on the training split only; validation
    # and test data are merely transformed with the fitted parameters.
    X_train = svd.fit_transform(X_train_raw)
    X_train = scaler.fit_transform(X_train)
    X_val, X_test = (
        scaler.transform(svd.transform(raw)) for raw in (X_val_raw, X_test_raw)
    )

    explained = svd.explained_variance_ratio_.sum()
    print(f'Variance retained: {explained:.1%}')
    print(f'SVD + scaling took {time.time() - t0:.1f}s')
    print(f'Reduced shape: {X_train.shape}')

## Train All Classifiers

In [None]:
# Classifier names handed to build_classifier(), trained in this order.
CLASSIFIERS = ['logreg', 'sgd', 'svm', 'mlp']

# Result-column suffix -> key looked up in what compute_metrics() returns.
METRIC_KEYS = {'acc': 'accuracy', 'f1': 'macro_f1', 'wf1': 'weighted_f1'}
banner = '=' * 50

rows = []
for name in CLASSIFIERS:
    print(f'\n{banner}')
    print(f'Training: {name}')
    print(banner)

    # Time only the fit itself.
    clf = build_classifier(name)
    t0 = time.time()
    clf.fit(X_train, y_train)
    train_time = time.time() - t0
    print(f'Training took {train_time:.1f}s')

    # Score the fitted model on validation first, then on test.
    val_metrics, test_metrics = (
        compute_metrics(y_true, clf.predict(X_split), label_names)
        for X_split, y_true in ((X_val, y_val), (X_test, y_test))
    )

    # One table row per model: name, the val_* columns, the test_* columns,
    # and the training time formatted as text.
    row = {'model': name}
    for split, metrics in (('val', val_metrics), ('test', test_metrics)):
        row.update(
            {f'{split}_{short}': metrics[key] for short, key in METRIC_KEYS.items()}
        )
    row['time_s'] = f'{train_time:.1f}'
    rows.append(row)

    print(f"  val_acc={row['val_acc']:.4f}  val_f1={row['val_f1']:.4f}")
    print(f"  test_acc={row['test_acc']:.4f}  test_f1={row['test_f1']:.4f}")

## Results

In [None]:
# Turn the per-model rows into a table and show it with every score column
# rendered to four decimal places.
results = pd.DataFrame(rows)

score_columns = [
    f'{split}_{metric}'
    for split in ('val', 'test')
    for metric in ('acc', 'f1', 'wf1')
]
styled = results.style.format({column: '{:.4f}' for column in score_columns})
display(styled.set_caption('Traditional ML Results (hand-crafted features)'))

# Save CSV
config.RESULTS_DIR.mkdir(exist_ok=True)
csv_path = config.RESULTS_DIR / 'svm_results.csv'
results.to_csv(csv_path, index=False)
print(f'Saved {csv_path}')

In [None]:
# ── Bar chart ─────────────────────────────────────────────────────────
# Grouped bars: test accuracy and test macro-F1 side by side for each model.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 4))
x = range(len(results))
w = 0.35
half = w / 2

# (results column, legend label, horizontal shift from the model's tick)
bar_specs = (
    ('test_acc', 'Test Accuracy', -half),
    ('test_f1', 'Test Macro-F1', half),
)
for column, legend_label, shift in bar_specs:
    ax.bar([pos + shift for pos in x], results[column], w, label=legend_label)

ax.set_xticks(x)
ax.set_xticklabels(results['model'])
ax.set_ylabel('Score')
ax.set_title('Traditional ML — Hand-crafted Features')
ax.legend()

# Print each bar's value just above its top.
for pos, scores in enumerate(zip(results['test_acc'], results['test_f1'])):
    for score, shift in zip(scores, (-half, half)):
        ax.text(pos + shift, score + 0.01, f'{score:.3f}', ha='center', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# ── LaTeX table ───────────────────────────────────────────────────────
# Assemble the table as header + one row per model + footer, print it, and
# write it next to the CSV in the results directory.
header = [
    r'\begin{table}[htbp]',
    r'\centering',
    r'\caption{Traditional ML results (hand-crafted features).}',
    r'\label{tab:svm-results}',
    r'\begin{tabular}{lrrrr}',
    r'\toprule',
    r'\textbf{Model} & \textbf{Val Acc} & \textbf{Val F1} & \textbf{Test Acc} & \textbf{Test F1} \\',
    r'\midrule',
]
body = [
    f"{rec.model} & {rec.val_acc:.4f} & {rec.val_f1:.4f} & "
    f"{rec.test_acc:.4f} & {rec.test_f1:.4f} \\\\"
    for rec in results.itertuples(index=False)
]
footer = [r'\bottomrule', r'\end{tabular}', r'\end{table}']
lines = header + body + footer

tex = '\n'.join(lines)
print(tex)

tex_path = config.RESULTS_DIR / 'svm_results.tex'
tex_path.write_text(tex)
print(f'\nSaved {tex_path}')

In [None]:
# Copy results to Drive
# Mirrors everything in config.RESULTS_DIR into a Drive folder.
from pathlib import Path
import shutil

drive_results = Path('/content/drive/MyDrive/NNP_results')
# No parents=True on purpose: if Drive is not mounted this should fail loudly
# rather than silently create a look-alike folder on the local disk.
drive_results.mkdir(exist_ok=True)
for entry in config.RESULTS_DIR.iterdir():
    target = drive_results / entry.name
    if entry.is_dir():
        # copy2 cannot copy a directory; copy sub-folders recursively and
        # merge into the target if it is left over from an earlier run.
        shutil.copytree(entry, target, dirs_exist_ok=True)
    else:
        shutil.copy2(entry, target)
print(f'Copied results to {drive_results}')