In [None]:
import pandas as pd
import spacy
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from utils import load_nli_data
from sklearn.manifold import TSNE
import numpy as np
import cupy as cp


# Request GPU execution from spaCy; the call's return value is not inspected here.
spacy.prefer_gpu()

# Pipeline components that are left out when the model is loaded below.
exclude_pipelines = [
    "parser", "tagger", "ner", "textcat",
    "lemmatizer", "attribute_ruler", "tok2vec",
]

# Shared spaCy pipeline used by every cell and function in this notebook.
nlp = spacy.load("en_core_web_lg", exclude=exclude_pipelines)

In [None]:
# Load the SNLI file through the project helper from `utils` and preview the first rows.
snli_test_path = "../data/snli_1.0_test.jsonl"
snli = load_nli_data(snli_test_path)

snli.head()

# 1- Token Size Analysis per Label

In [None]:
# Token counts per sentence, summarised per gold label.
# Sentences go through spaCy in batches (nlp.pipe) rather than one call per row.
print("Processing sentences to get token counts...")

# Premises -> list of Docs (materialised because the list is iterated below)
docs1 = list(
    tqdm(
        nlp.pipe(snli["sentence1"].tolist(), batch_size=128, n_process=-1),
        desc="Processing sentence1",
        total=len(snli),
    )
)

# Hypotheses -> list of Docs
docs2 = list(
    tqdm(
        nlp.pipe(snli["sentence2"].tolist(), batch_size=128, n_process=-1),
        desc="Processing sentence2",
        total=len(snli),
    )
)

# len(doc) is the number of tokens in the Doc
snli["sentence1_token_count"] = [len(doc) for doc in docs1]
snli["sentence2_token_count"] = [len(doc) for doc in docs2]

# Full describe() output per label; not displayed in this cell, kept for inspection.
token_stats = snli.groupby("gold_label")[
    ["sentence1_token_count", "sentence2_token_count"]
].describe()

# Build the readable summary: one record per (label, sentence type).
# Records are collected in a list and turned into a DataFrame once, instead of
# growing a DataFrame with pd.concat on every iteration.
count_columns = {
    "sentence1_token_count": "Premise",
    "sentence2_token_count": "Hypothesis",
}
summary_rows = []
for label in snli["gold_label"].unique():
    label_data = snli[snli["gold_label"] == label]
    for col, col_name in count_columns.items():
        stats = label_data[col].describe()
        summary_rows.append(
            {
                "Label": label,
                "Type": col_name,
                "Mean": stats["mean"],
                "Std": stats["std"],
                "Min": stats["min"],
                "Max": stats["max"],
            }
        )
summary = pd.DataFrame(
    summary_rows, columns=["Label", "Type", "Mean", "Std", "Min", "Max"]
)

# Labels as rows, (statistic, Premise/Hypothesis) as columns
display(
    summary.pivot(index="Label", columns="Type")[["Mean", "Std", "Min", "Max"]]
    .style.background_gradient(cmap="viridis")
    .set_caption("Token Count Statistics by Label")
)

* Create box plots for each label to visualize the distribution of token counts.

In [None]:
# Side-by-side box plots of token counts per label: premises left, hypotheses right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    ("sentence1_token_count", "Premise Token Counts by Label"),
    ("sentence2_token_count", "Hypothesis Token Counts by Label"),
]
for ax, (count_col, panel_title) in zip(axes, panels):
    sns.boxplot(x="gold_label", y=count_col, data=snli, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Label")
    ax.set_ylabel("Token Count")

fig.tight_layout()
# NOTE(review): the dataset is read from "../data" but the figure is written under
# "data/" relative to the working directory - confirm that directory exists.
fig.savefig("data/token_counts_by_label.png", dpi=300, bbox_inches="tight")
plt.show()

# 2- Similarity Analysis per Label
* This section computes the cosine similarity between the premise and the hypothesis of each pair, then compares the resulting scores across labels.

In [None]:
# Premise/hypothesis similarity using spaCy's en_core_web_lg vectors:
# Doc.similarity compares the vectors of the two documents (cosine similarity).
#
# docs1 / docs2 are the Doc lists produced in the token-count cell from these same
# sentence columns with this same `nlp` object, so they are reused here instead of
# pushing both columns through nlp.pipe a second time.
# If they ever get out of step with `snli`, the column assignment below fails on
# the length mismatch rather than silently misaligning rows.
snli["similarity"] = [
    doc1.similarity(doc2)
    for doc1, doc2 in tqdm(
        zip(docs1, docs2), desc="Computing similarities", total=len(snli)
    )
]

snli.head()

* A box plot is used to visualize the distribution of similarity scores for each label.

In [None]:
# Box plot of the per-pair similarity score, one box per gold label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="gold_label", y="similarity", data=snli, ax=ax)
ax.set_title("Similarity Scores by NLI Label")
ax.set_xlabel("Label")
ax.set_ylabel("Cosine Similarity")
ax.grid(True, linestyle="--", alpha=0.7)
fig.savefig("data/similarity_by_label.png", dpi=300, bbox_inches="tight")
plt.show()

* Overlaid histograms (with KDE curves) are used to compare the distribution of similarity scores across the three labels.

In [None]:
# Overlaid similarity histograms (with KDE), one colour per label.
label_colors = {
    "entailment": "#66b3ff",
    "contradiction": "#ff9999",
    "neutral": "#99ff99",
}

fig, ax = plt.subplots(figsize=(12, 6))
for label, color in label_colors.items():
    label_scores = snli.loc[snli["gold_label"] == label, "similarity"]
    sns.histplot(
        label_scores, label=label, alpha=0.6, color=color, bins=30, kde=True, ax=ax
    )

ax.set_title("Distribution of Similarity Scores by Label")
ax.set_xlabel("Similarity Score")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.7)
# NOTE(review): this figure is saved to the working directory while the other
# figures are saved under "data/" - confirm which location is intended.
fig.savefig("similarity_distributions.png", dpi=300, bbox_inches="tight")
plt.show()

# 3- Lexical Overlap per Label

In [None]:
def lexical_overlap_by_label(
    df,
    text_col1="sentence1",
    text_col2="sentence2",
    label_col="gold_label",
    batch_size=128,
):
    """
    Analyzes the lexical overlap between pairs of text using Jaccard similarity.

    For every row, both texts are tokenized with the notebook-level ``nlp``
    pipeline, each text is reduced to the set of its lower-cased token strings,
    and the Jaccard coefficient ``|A & B| / |A | B|`` of the two sets is computed.
    No tokens are filtered out, so every token the tokenizer emits counts.
    A pair whose token sets are both empty scores 0.0.

    Besides returning the scores, the function displays per-label summary
    statistics (``describe()``) and draws a box plot of the scores by label.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing the text columns and label column
    text_col1 : str, default="sentence1"
        Column name for the first text (premise in NLI tasks)
    text_col2 : str, default="sentence2"
        Column name for the second text (hypothesis in NLI tasks)
    label_col : str, default="gold_label"
        Column name for the labels
    batch_size : int, default=128
        Number of texts handed to ``nlp.pipe`` per batch

    Returns:
    --------
    pandas.DataFrame
        One row per input row, with two columns: ``label_col`` (the row's label)
        and ``"jaccard"`` (the overlap score). The index is a fresh RangeIndex,
        not the index of ``df``.

    Notes:
    ------
    Higher Jaccard scores indicate greater word overlap between the texts.
    In NLI tasks, this can reveal patterns like entailment pairs having
    higher lexical overlap than contradiction pairs.
    """
    # Stream both columns through nlp.pipe (batched, order-preserving) instead of
    # calling nlp() once per sentence inside the loop.
    docs1 = nlp.pipe(df[text_col1].tolist(), batch_size=batch_size)
    docs2 = nlp.pipe(df[text_col2].tolist(), batch_size=batch_size)

    overlaps = []
    for d1, d2 in tqdm(zip(docs1, docs2), total=len(df), desc="Lexical Overlap"):
        t1 = {tok.text.lower() for tok in d1}
        t2 = {tok.text.lower() for tok in d2}
        union = t1 | t2
        # Guard against 0/0 when both texts produce no tokens
        overlaps.append(len(t1 & t2) / len(union) if union else 0.0)

    out = pd.DataFrame({label_col: list(df[label_col]), "jaccard": overlaps})
    # summary stats (statistics as rows, labels as columns)
    display(out.groupby(label_col)["jaccard"].describe().T)
    # boxplot
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=label_col, y="jaccard", data=out)
    plt.title("Lexical Overlap (Jaccard) by Label")
    plt.show()
    return out

In [None]:
# Run the Jaccard overlap analysis on the SNLI frame with the default column names.
lexical_overlap_by_label(df=snli)

# 4- Embedding Space Visualization

In [None]:
def embedding_tsne_pairwise_by_label(
    df,
    text_cols=("sentence1", "sentence2"),
    label_col="gold_label",
    batch_size=128,
    random_state=42,
):
    """
    Visualize sentence pairs in embedding space using t-SNE dimensionality reduction.

    Both text columns are encoded with the notebook-level ``nlp`` pipeline and each
    pair is turned into one "relationship" vector by concatenating:
      - The premise vector (v1)
      - The hypothesis vector (v2)
      - The absolute difference between vectors (|v1-v2|)
      - The element-wise product of vectors (v1*v2)

    The resulting matrix (one row per pair, four times the vector width) is
    projected to 2D with t-SNE and drawn as a scatter plot coloured by label.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame containing text pairs and labels
    text_cols : tuple of str, default=("sentence1", "sentence2")
        Column names for the text pairs to compare
    label_col : str, default="gold_label"
        Column name for the categorical labels
    batch_size : int, default=128
        Batch size passed to ``nlp.pipe``
    random_state : int, default=42
        Random seed for t-SNE to ensure reproducibility

    Returns:
    --------
    None
        Displays a scatter plot of the sentence pair embeddings projected to 2D space

    Raises:
    -------
    ValueError
        If ``df`` has fewer than two rows (nothing meaningful to project)

    Notes:
    ------
    The composite representation follows the feature combination used by
    InferSent (Conneau et al., 2017), which describes the relationship between
    the two texts rather than only each sentence on its own.
    t-SNE's perplexity is left at 30 (scikit-learn's default) and only lowered to
    ``len(df) - 1`` when the frame has 30 rows or fewer, so small samples can
    still be plotted.
    """
    n_pairs = len(df)
    if n_pairs < 2:
        raise ValueError(
            f"Need at least 2 sentence pairs for a t-SNE projection, got {n_pairs}"
        )

    docs1 = list(
        tqdm(
            nlp.pipe(df[text_cols[0]].tolist(), batch_size=batch_size, n_process=-1),
            total=n_pairs,
            desc=f"Encoding {text_cols[0]}",
        )
    )
    docs2 = list(
        tqdm(
            nlp.pipe(df[text_cols[1]].tolist(), batch_size=batch_size, n_process=-1),
            total=n_pairs,
            desc=f"Encoding {text_cols[1]}",
        )
    )

    # Stack the document vectors once, then build all four feature groups with
    # whole-matrix operations instead of one concatenate call per pair.
    premise_vecs = np.vstack([doc.vector for doc in docs1])
    hypothesis_vecs = np.vstack([doc.vector for doc in docs2])
    X = np.concatenate(
        [
            premise_vecs,  # premise
            hypothesis_vecs,  # hypothesis
            np.abs(premise_vecs - hypothesis_vecs),  # difference
            premise_vecs * hypothesis_vecs,  # elementwise product
        ],
        axis=1,
    )  # shape (n_pairs, 4*vector_dim)

    # The matrix may be a CuPy array (see the check below); TSNE is handed NumPy.
    if isinstance(X, cp.ndarray):
        X = cp.asnumpy(X)

    labels = df[label_col].values

    # Keep perplexity below the number of samples; unchanged (30) for n_pairs > 30.
    perplexity = float(min(30, n_pairs - 1))

    X2d = TSNE(
        n_components=2,
        random_state=random_state,  # fixes the seed used by the random init
        init="random",
        learning_rate="auto",
        perplexity=perplexity,
    ).fit_transform(X)

    emb_df = pd.DataFrame({"x": X2d[:, 0], "y": X2d[:, 1], label_col: labels})
    plt.figure(figsize=(8, 8))
    sns.scatterplot(
        data=emb_df, x="x", y="y", hue=label_col, palette="tab10", s=15, alpha=0.7
    )
    plt.title("t-SNE of spaCy Relationship Embeddings")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.show()

In [None]:
# Project the SNLI premise/hypothesis pairs to 2D with the default settings.
embedding_tsne_pairwise_by_label(df=snli)