# 01 — Data Exploration

Explore your FASTA datasets before training.

In [None]:
import sys
# Put ../src at the front of the module search path before the imports below
# (presumably that is where fasta_utils, data_loading and utils live — confirm).
sys.path.insert(0, "../src")
from fasta_utils import load_fasta, compute_sequence_stats
from data_loading import build_dataset
from utils import load_config

# Load the default configuration; it is passed to build_dataset in a later cell.
config = load_config("../configs/default.yaml")

In [None]:
# Set your paths here
# Two input FASTA files (judging by the names, one per class — adjust to your data).
PATH_A = "../data/class_a.fasta"
PATH_B = "../data/class_b.fasta"

# Build the dataset from both files plus the loaded config, then print it.
dataset = build_dataset(PATH_A, PATH_B, config)
print(dataset)

In [None]:
# Print the summary returned by dataset.summary(), which is iterated here as a
# mapping of group name -> mapping of statistic name -> value.
stats = dataset.summary()
for name, s in stats.items():
    # Blank line before each group header. The "\n" escape must stay inside the
    # literal: a raw line break in a single-quoted f-string is a syntax error.
    print(f"\n{name}:")
    for k, v in s.items():
        # Floats are shown with two decimals; every other value is printed as-is.
        print(f"  {k}: {v:.2f}" if isinstance(v, float) else f"  {k}: {v}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Split record lengths by class label: label 0 is plotted under
# dataset.label_names[0], label 1 under dataset.label_names[1].
lengths_a = [len(rec) for rec, lab in zip(dataset.records, dataset.labels) if lab == 0]
lengths_b = [len(rec) for rec, lab in zip(dataset.records, dataset.labels) if lab == 1]

# Compute one set of 40 bin edges over both classes. Passing bins=40 to each
# hist() call separately would give each class its own edges (derived from its
# own min/max), making the overlaid bar heights not directly comparable.
bin_edges = np.histogram_bin_edges(lengths_a + lengths_b, bins=40)

fig, ax = plt.subplots(figsize=(9, 4))
ax.hist(lengths_a, bins=bin_edges, alpha=0.6, label=dataset.label_names[0], color="steelblue")
ax.hist(lengths_b, bins=bin_edges, alpha=0.6, label=dataset.label_names[1], color="coral")
ax.set_xlabel("Sequence Length")
ax.set_ylabel("Count")
ax.set_title("Sequence Length Distribution")
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Amino acid composition
from feature_extractors import CompositionExtractor, AMINO_ACIDS
import pandas as pd

# Extract composition features for every sequence in the dataset.
ext = CompositionExtractor()
X = ext.fit_transform(dataset.sequences)

# One column per entry of AMINO_ACIDS, plus the class label of each row.
df = pd.DataFrame(X, columns=AMINO_ACIDS).assign(label=dataset.labels)

# Mean of each feature column per label, printed with features as rows.
comps = df.groupby("label").mean()
print(comps.transpose().to_string())