# 02 — Feature Engineering

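Explore and visualise different feature representations of the two sequence classes: the full configured feature pipeline, 3-mer frequencies, physicochemical descriptors, and a 2-D PCA projection of the combined feature space.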

In [None]:
import sys
sys.path.insert(0, "../src")
import numpy as np
import matplotlib.pyplot as plt
from feature_extractors import *
from data_loading import build_dataset
from utils import load_config

# Load the default configuration shared by the rest of the notebook.
config_path = "../configs/default.yaml"
config = load_config(config_path)

# Build the dataset from the two per-class FASTA files.
class_a_fasta = "../data/class_a.fasta"
class_b_fasta = "../data/class_b.fasta"
dataset = build_dataset(class_a_fasta, class_b_fasta, config)

In [None]:
# Build the feature pipeline described by the config and apply it to every
# sequence in the dataset; X is reused by the PCA cell further down.
pipeline = build_feature_pipeline(config)
X = pipeline.fit_transform(dataset.sequences)

# Sanity check: overall matrix shape plus the first ten feature names.
print(f"Feature matrix shape: {X.shape}")
feature_name_preview = pipeline.feature_names[:10]
print(f"Feature names sample: {feature_name_preview}")

In [None]:
# Visualise k-mer features: plot the k-mers whose mean value differs most
# between class 1 and class 0.
kmer_k = 3     # k-mer length used for this comparison
max_top = 20   # upper bound on the number of k-mers shown in the bar chart

kmer_ext = KmerExtractor(k=[kmer_k], normalize=True)
X_kmer = kmer_ext.fit_transform(dataset.sequences)
print(f"{kmer_k}-mer feature shape: {X_kmer.shape}")

# Mean k-mer per class (averaging over axis 0 of the rows selected per label)
# NOTE(review): assumes dataset.labels is a NumPy array aligned with the rows
# of X_kmer, so the comparisons below produce boolean row masks — confirm.
mask0 = dataset.labels == 0
mask1 = dataset.labels == 1
mean0 = X_kmer[mask0].mean(axis=0)
mean1 = X_kmer[mask1].mean(axis=0)

# Signed difference, computed once and reused for ranking and plotting.
mean_diff = mean1 - mean0

# Indices of the largest absolute differences, in descending order.
top_diff = np.argsort(np.abs(mean_diff))[::-1][:max_top]
# The extractor may yield fewer than max_top features; size the plot from what
# was actually selected so bar positions, heights and tick labels always agree.
n_top = len(top_diff)

labels = [kmer_ext.feature_names[i] for i in top_diff]
fig, ax = plt.subplots(figsize=(12, 4))
ax.bar(range(n_top), mean_diff[top_diff])
ax.set_xticks(range(n_top))
ax.set_xticklabels(labels, rotation=45, ha="right", fontsize=9)
ax.set_title(f"Top-{n_top} Differentiating {kmer_k}-mers (Class1 - Class0)")
plt.tight_layout()
plt.show()

In [None]:
# Physicochemical features: print the per-class mean of every feature column.
phys_ext = PhysicochemicalExtractor()
X_phys = phys_ext.fit_transform(dataset.sequences)

print("Physicochemical features per class:")
for label_id, class_name in dataset.label_names.items():
    # Select the rows belonging to this class, then average over axis 0.
    class_rows = X_phys[dataset.labels == label_id]
    class_mean = class_rows.mean(axis=0).round(3)
    print(f"  {class_name}: {class_mean}")

In [None]:
# PCA of full feature space: project X onto two components and plot by class.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# Explained variance of each component, expressed as a percentage for the
# axis labels.
variance_pct = pca.explained_variance_ratio_ * 100

fig, ax = plt.subplots(figsize=(7, 5))
for class_id, class_name in dataset.label_names.items():
    in_class = dataset.labels == class_id
    pc1_values = X_pca[in_class, 0]
    pc2_values = X_pca[in_class, 1]
    ax.scatter(pc1_values, pc2_values, label=class_name, alpha=0.6, s=15)

ax.set(
    xlabel=f"PC1 ({variance_pct[0]:.1f}%)",
    ylabel=f"PC2 ({variance_pct[1]:.1f}%)",
    title="PCA of Feature Space",
)
ax.legend()
plt.tight_layout()
plt.show()