In [None]:
import json
import pandas as pd
import spacy
import numpy as np
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from utils import load_nli_data

# Ask spaCy to run on a GPU if one can be used.
spacy.prefer_gpu()

# Components left out when the model is loaded. The cells visible in this
# notebook only read token counts and vector-based similarities from the docs.
exclude_pipelines = [
    "parser", "tagger", "ner", "textcat",
    "lemmatizer", "attribute_ruler", "tok2vec",
]

nlp = spacy.load("en_core_web_lg", exclude=exclude_pipelines)

In [None]:
# Read the SNLI development split through the project helper.
snli_dev_path = "data/snli_1.0_dev.jsonl"
snli = load_nli_data(snli_dev_path)

# Preview the first rows as the cell output.
snli.head()

# 1 - Token Count Analysis per Label

In [None]:
# Tokenize every premise and hypothesis with spaCy's batched pipeline and
# store the per-sentence token counts on the dataframe.
print("Processing sentences to get token counts...")

# Process premises (n_process=-1 asks spaCy to use all available CPU cores)
docs1 = list(
    tqdm(
        nlp.pipe(snli["sentence1"].tolist(), batch_size=128, n_process=-1),
        desc="Processing sentence1",
        total=len(snli),
    )
)

# Process hypotheses
docs2 = list(
    tqdm(
        nlp.pipe(snli["sentence2"].tolist(), batch_size=128, n_process=-1),
        desc="Processing sentence2",
        total=len(snli),
    )
)

# Token count = number of tokens in each processed doc
snli["sentence1_token_count"] = [len(doc) for doc in docs1]
snli["sentence2_token_count"] = [len(doc) for doc in docs2]

# Full describe() output per label. It is computed for ad-hoc inspection and
# is not displayed by this cell.
token_stats = snli.groupby("gold_label")[["sentence1_token_count", "sentence2_token_count"]].describe()

# Build the readable summary as a list of row dicts and create the dataframe
# once. Growing a dataframe with pd.concat inside the loop re-copies the whole
# frame on every iteration.
token_columns = {
    "sentence1_token_count": "Premise",
    "sentence2_token_count": "Hypothesis",
}
summary_rows = []
for label in snli["gold_label"].unique():
    label_data = snli[snli["gold_label"] == label]
    for col, col_name in token_columns.items():
        stats = label_data[col].describe()
        summary_rows.append(
            {
                "Label": label,
                "Type": col_name,
                "Mean": stats["mean"],
                "Std": stats["std"],
                "Min": stats["min"],
                "Max": stats["max"],
            }
        )
summary = pd.DataFrame(
    summary_rows, columns=["Label", "Type", "Mean", "Std", "Min", "Max"]
)

# One row per label, one (statistic, sentence type) column pair per value
display(
    summary.pivot(index="Label", columns="Type")[["Mean", "Std", "Min", "Max"]]
    .style.background_gradient(cmap="viridis")
    .set_caption("Token Count Statistics by Label")
)

* Box plots show the distribution of token counts for each label.

In [None]:
# Side-by-side box plots of token counts per label: premises on the left,
# hypotheses on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panels = [
    ("sentence1_token_count", "Premise Token Counts by Label"),
    ("sentence2_token_count", "Hypothesis Token Counts by Label"),
]
for ax, (count_col, panel_title) in zip(axes, panels):
    sns.boxplot(x="gold_label", y=count_col, data=snli, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Label")
    ax.set_ylabel("Token Count")

fig.tight_layout()
fig.savefig("data/token_counts_by_label.png", dpi=300, bbox_inches="tight")
plt.show()

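# 2 - Similarity Analysis per Label
* This section computes the cosine similarity between the premise (`sentence1`) and the hypothesis (`sentence2`) of each pair, so that the scores can be compared across labels.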

In [None]:
# Cosine similarity between each premise and its hypothesis, computed by
# spaCy's Doc.similarity on the vectors of the en_core_web_lg model.
#
# docs1 / docs2 are the processed premises and hypotheses created by the
# token-count cell in section 1 with exactly the same nlp.pipe(...) calls, so
# they are reused here instead of processing the whole dataset a second time.
if not (len(docs1) == len(docs2) == len(snli)):
    raise RuntimeError(
        "docs1/docs2 do not line up with snli; re-run the token count cell first."
    )

# Compute similarities using vectors
snli["similarity"] = [
    doc1.similarity(doc2)
    for doc1, doc2 in tqdm(
        zip(docs1, docs2), desc="Computing similarities", total=len(snli)
    )
]

snli.head()

* A box plot is used to visualize the distribution of similarity scores for each label.

In [None]:
# Box plot of premise/hypothesis similarity, one box per gold label.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=snli, x="gold_label", y="similarity", ax=ax)
ax.set_title("Similarity Scores by NLI Label")
ax.set_xlabel("Label")
ax.set_ylabel("Cosine Similarity")
ax.grid(True, linestyle="--", alpha=0.7)
fig.savefig("data/similarity_by_label.png", dpi=300, bbox_inches="tight")
plt.show()

* Overlaid histograms (with KDE curves) are used to compare the distribution of similarity scores for each label.

In [None]:
# Overlaid similarity histograms (with KDE curves), one per label.
label_colors = {
    "entailment": "#66b3ff",
    "contradiction": "#ff9999",
    "neutral": "#99ff99",
}

fig, ax = plt.subplots(figsize=(12, 6))
for nli_label, hist_color in label_colors.items():
    label_scores = snli.loc[snli["gold_label"] == nli_label, "similarity"]
    sns.histplot(
        label_scores,
        label=nli_label,
        alpha=0.6,
        color=hist_color,
        bins=30,
        kde=True,
        ax=ax,
    )

ax.set_title("Distribution of Similarity Scores by Label")
ax.set_xlabel("Similarity Score")
ax.set_ylabel("Frequency")
ax.legend()
ax.grid(True, linestyle="--", alpha=0.7)
# NOTE(review): this figure is written to the working directory, while the
# other figures in this notebook are saved under data/ - confirm this is intended.
fig.savefig("similarity_distributions.png", dpi=300, bbox_inches="tight")
plt.show()