# Feature Exploration Notebook

This notebook is for exploring the features generated by `build_features.py`. We will:
1.  Load the saved preprocessing transformers (`TfidfVectorizer`s and `OneHotEncoder`).
2.  Inspect the vocabularies of the N-gram vectorizers.
3.  Examine how a sample antibody sequence is transformed into a feature vector.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

import pandas as pd
import joblib
import sys
from pathlib import Path
import numpy as np

# Make the project's 'src' directory importable so that `import config` below resolves.
# 'src' is a relative entry, so Python resolves it against the kernel's current working
# directory: this only works when the notebook is run from the project root.
# The membership check keeps re-runs of this cell from appending duplicate entries.
if 'src' not in sys.path:
    sys.path.append('src')

# Must come after the sys.path change above, otherwise the module cannot be found.
import config

# --- Load Artefacts ---
# Load the three fitted transformers we created in the build_features script:
# one vectorizer for VH sequences, one for VL sequences, and the one-hot encoder.
# NOTE(review): joblib.load unpickles the file contents; only load artefacts from a trusted source.
# NOTE(review): ARTEFACTS_DIR is joined with `/`, so it is presumably a pathlib.Path; confirm in config.
try:
    vectorizer_vh = joblib.load(config.ARTEFACTS_DIR / "vectorizer_vh.joblib")
    vectorizer_vl = joblib.load(config.ARTEFACTS_DIR / "vectorizer_vl.joblib")
    encoder_ohe = joblib.load(config.ARTEFACTS_DIR / "encoder_ohe.joblib")
    print("Successfully loaded all transformers.")
    # Quick sanity summary of what was loaded.
    print(f"VH Vocabulary Size: {len(vectorizer_vh.vocabulary_)}")
    print(f"VL Vocabulary Size: {len(vectorizer_vl.vocabulary_)}")
    print(f"OHE Categories: {encoder_ohe.categories_}")
except FileNotFoundError as e:
    # Only a message is printed here. Any transformer that failed to load stays undefined,
    # so later cells that use it will raise NameError until the artefacts are generated.
    print(f"Error loading transformers: {e}")
    print("Please run the `src/features/build_features.py` script first to generate the artefacts.")


Successfully loaded all transformers.
VH Vocabulary Size: 10000
VL Vocabulary Size: 10000
OHE Categories: [array(['IgG1', 'IgG2', 'IgG4'], dtype=object)]


## 1. Inspecting N-gram Vocabularies

Let's examine the N-grams that were selected for the vocabularies. We can look at the N-grams with the highest and lowest IDF (Inverse Document Frequency) scores.

*   **High IDF**: N-grams that are rare across all sequences. These are highly specific.
*   **Low IDF**: N-grams that are very common across all sequences.

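We'll create a helper function to display the top and bottom N-grams by their IDF weight. Both listings are printed in descending IDF order, so the most common N-gram is the last row of the "Bottom" table.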

In [2]:
def inspect_vocabulary(vectorizer: TfidfVectorizer, top_n: int = 20):
    """
    Print the rarest and the most common N-grams of a fitted TfidfVectorizer.

    Every vocabulary entry is paired with its IDF weight and the rows are ranked
    from the highest weight to the lowest. The first `top_n` rows of that ranking
    are printed as the most specific N-grams and the last `top_n` rows as the most
    common ones; both listings run from higher to lower IDF.

    Args:
        vectorizer: Fitted vectorizer exposing `get_feature_names_out()` and `idf_`.
        top_n: Number of rows to print from each end of the ranking.

    Returns:
        None. The two tables are written to standard output.
    """
    # One row per vocabulary entry: the n-gram text next to its IDF weight
    vocabulary_table = pd.DataFrame({
        'ngram': vectorizer.get_feature_names_out(),
        'idf_score': vectorizer.idf_
    })

    # Largest IDF first, so the rarest n-grams lead and the most common ones trail
    ranked = vocabulary_table.sort_values(by='idf_score', ascending=False)
    rarest = ranked.head(top_n)
    most_common = ranked.tail(top_n)

    print(f"--- Top {top_n} N-grams (Most Specific/Rare) ---")
    print(rarest.to_string(index=False))

    print(f"\n--- Bottom {top_n} N-grams (Most Common) ---")
    print(most_common.to_string(index=False))

# VH vocabulary: banner, then the rarest and most common N-grams
print("=" * 20, " VH Vocabulary ", "=" * 20, sep="")
inspect_vocabulary(vectorizer_vh)

# VL vocabulary: same listing, with a blank line separating it from the VH output
print("\n", "=" * 20, " VL Vocabulary ", "=" * 20, sep="")
inspect_vocabulary(vectorizer_vl)

NameError: name 'TfidfVectorizer' is not defined

## 2. Vectorizing a Sample Sequence

Now, let's see how a single antibody's heavy and light chains are converted into sparse vectors. We will:
1.  Load the original dataset to get a sample sequence.
2.  Use the `.transform()` method of our fitted vectorizers.
3.  Inspect the resulting sparse matrix to see which N-grams were found and what their TF-IDF scores are.

In [None]:
def display_vectorized_sequence(sequence: str, vectorizer: TfidfVectorizer):
    """
    Transform one sequence and print the vocabulary N-grams found in it.

    The sequence is passed through `vectorizer.transform`, and every non-zero
    entry of the resulting single-row sparse vector is listed with its N-gram
    and TF-IDF score, highest score first.

    Args:
        sequence: The raw sequence string to vectorize.
        vectorizer: Fitted vectorizer exposing `transform()` and
            `get_feature_names_out()`.

    Returns:
        None. A summary and the score table are written to standard output.
        If the vector has no non-zero entries, only a short notice is printed.
    """
    # transform() expects an iterable of documents, so wrap the single sequence.
    vector = vectorizer.transform([sequence])

    # Column positions of the non-zero entries in the single output row.
    non_zero_indices = vector.nonzero()[1]

    if len(non_zero_indices) == 0:
        print("No N-grams from the vocabulary were found in this sequence.")
        return

    # Read all scores with one indexed lookup on the row instead of indexing the
    # sparse matrix element by element inside a Python loop.
    feature_names = np.asarray(vectorizer.get_feature_names_out(), dtype=object)
    tfidf_scores = vector.toarray()[0, non_zero_indices]

    result_df = pd.DataFrame({
        'ngram': feature_names[non_zero_indices],
        'tfidf_score': tfidf_scores,
    }).sort_values(by='tfidf_score', ascending=False)

    # Only add the ellipsis when the preview really is truncated.
    preview = sequence[:60]
    ellipsis = "..." if len(sequence) > 60 else ""

    print(f"Original Sequence (first 60 chars): '{preview}{ellipsis}'")
    print(f"Vector Shape: {vector.shape}")
    print(f"Number of non-zero N-grams found: {len(result_df)}")
    print("\n--- Found N-grams and their TF-IDF Scores ---")
    print(result_df.to_string(index=False))

# --- Read the dataset and pick one antibody as the worked example ---
try:
    df = pd.read_csv(config.DATA_FILE)
    # The row at the midpoint of the frame serves as the sample
    sample_antibody = df.iloc[len(df) // 2]

    vh_sequence = sample_antibody[config.VH_SEQUENCE_COL]
    vl_sequence = sample_antibody[config.VL_SEQUENCE_COL]
    subtype = sample_antibody[config.HC_SUBTYPE_COL]

    # --- VH sequence: show the N-grams found and their TF-IDF scores ---
    print("=" * 20, " Vectorizing VH Sequence ", "=" * 20, sep="")
    display_vectorized_sequence(vh_sequence, vectorizer_vh)

    # --- VL sequence: same display, preceded by a blank line ---
    print("\n", "=" * 20, " Vectorizing VL Sequence ", "=" * 20, sep="")
    display_vectorized_sequence(vl_sequence, vectorizer_vl)

    # --- Subtype: encode the single value with the fitted encoder ---
    print("\n", "=" * 20, " Encoding Subtype ", "=" * 20, sep="")
    print(f"Original Subtype: {subtype}")
    # The nested list passes the value as one row with one column
    subtype_vector = encoder_ohe.transform([[subtype]])
    print(f"Encoded Vector (sparse): {subtype_vector}")
    print(f"Feature names: {encoder_ohe.get_feature_names_out()}")


except FileNotFoundError:
    # Report the configured path instead of surfacing a traceback
    print(f"Could not load data file at {config.DATA_FILE} to get a sample.")
