# Exploring the FollowBench Dataset

This notebook explores the **FollowBench** dataset from HuggingFace, which is designed to evaluate LLM instruction-following capabilities with varying constraint levels.

**Dataset:** `YuxinJiang/FollowBench`

**Key Fields:**
- `example_id`: Unique identifier for each test case
- `category`: Constraint type (content, situation, style, format, example)
- `source`: Source dataset or origin
- `instruction`: The prompt/instruction given to the model
- `level`: Constraint complexity level (0-5, where 0 is the base instruction before any constraint is added and 5 is the most complex)
- `target`: Expected constraint/target for evaluation

In [1]:
# Import required libraries
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: apply the matplotlib style sheet, then set seaborn's default palette
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
COLOR_PALETTE = "husl"

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Load the FollowBench Dataset

In [2]:
# Load the "train" split of FollowBench from HuggingFace
DATASET_ID = "YuxinJiang/FollowBench"
dataset = load_dataset(DATASET_ID, split="train")

# Report the number of rows and the declared feature schema
num_samples = len(dataset)
feature_schema = dataset.features
print(f"Dataset size: {num_samples} samples")
print(f"\nDataset features: {feature_schema}")

Dataset size: 1852 samples

Dataset features: {'example_id': Value(dtype='int64', id=None), 'category': Value(dtype='string', id=None), 'source': Value(dtype='string', id=None), 'instruction': Value(dtype='string', id=None), 'level': Value(dtype='int64', id=None), 'target': Value(dtype='string', id=None)}


In [None]:
# Convert to a pandas DataFrame for easier exploration.
# `Dataset.to_pandas()` is the conversion provided by the `datasets` library,
# used here instead of building the frame by iterating over the dataset.
df = dataset.to_pandas()

# Sanity check: the conversion must keep every row of the loaded split
assert len(df) == len(dataset), "row count changed during DataFrame conversion"

# Show full cell text and every column.
# These are global pandas options, so they also affect every later cell.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,example_id,category,source,instruction,level,target
0,1,content,t0_zsnoopt_data,Pick one category for the following text. The ...,0,
1,1,content,t0_zsnoopt_data,Identify one category from the list below for ...,1,
2,1,content,t0_zsnoopt_data,Identify one category and the sentiment convey...,2,
3,1,content,t0_zsnoopt_data,Analyze the provided text to pinpoint a catego...,3,
4,1,content,t0_zsnoopt_data,Analyze the supplied text to discern a categor...,4,


## 2. Dataset Structure Overview

In [None]:
# Structural overview: dimensions, column names, dtypes and per-column null counts
frame_shape = df.shape
column_names = df.columns.tolist()
column_dtypes = df.dtypes
null_counts = df.isnull().sum()

print("DataFrame Info:")
print(f"Shape: {frame_shape}")
print(f"\nColumns: {column_names}")
print(f"\nData types:\n{column_dtypes}")
print(f"\nMissing values:\n{null_counts}")

In [None]:
# Summary statistics across all columns, numeric and non-numeric alike
summary_stats = df.describe(include='all')
summary_stats

## 3. Analyze Constraint Categories

FollowBench categorizes constraints into 5 types:
- **content**: Requirements about what information to include
- **situation**: Context or scenario requirements
- **style**: Writing style requirements (formal, casual, etc.)
- **format**: Structural requirements (lists, paragraphs, length)
- **example**: Requirements to follow given examples

In [None]:
# Number of samples per constraint category, plus the number of distinct categories
category_counts = df['category'].value_counts()
num_categories = df['category'].nunique()

print("Category Distribution:")
print(category_counts)
print(f"\nTotal categories: {num_categories}")

In [None]:
# Visualize category distribution as a bar chart, one bar per category
fig, ax = plt.subplots(figsize=(10, 6))
colors = sns.color_palette("husl", len(category_counts))
bars = ax.bar(category_counts.index, category_counts.values, color=colors)
ax.set_xlabel('Category', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Distribution of Constraint Categories', fontsize=14)
plt.xticks(rotation=45, ha='right')

# Annotate each bar with its count. `bar_label` positions the text a fixed
# number of points above the bar, so the gap stays the same whatever the
# y-axis scale is (unlike an offset expressed in data units).
ax.bar_label(bars, fmt='%d', padding=3, fontsize=10)

# Leave headroom above the tallest bar so its label is not clipped
ax.margins(y=0.1)

plt.tight_layout()
plt.show()

## 4. Examine Constraint Levels

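FollowBench adds one constraint per level, from 1 to 5. The loaded data also contains level 0 rows (see the `df.head()` output above), which hold the base instruction before any constraint is added:
- **Level 0**: Base instruction, no added constraint
- **Level 1**: Single constraint (easiest)
- **Level 2**: Two constraints
- **Level 3**: Three constraints
- **Level 4**: Four constraints
- **Level 5**: Five constraints (hardest)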

In [None]:
# Number of samples per constraint level, ordered by level rather than by frequency
level_column = df['level']
level_counts = level_column.value_counts().sort_index()
lowest_level = level_column.min()
highest_level = level_column.max()

print("Level Distribution:")
print(level_counts)
print(f"\nLevel range: {lowest_level} to {highest_level}")

In [None]:
# Visualize level distribution as a bar chart, one bar per constraint level
fig, ax = plt.subplots(figsize=(8, 5))
colors = sns.color_palette("viridis", len(level_counts))
# Levels are cast to strings so the x-axis is treated as categorical
bars = ax.bar(level_counts.index.astype(str), level_counts.values, color=colors)
ax.set_xlabel('Constraint Level', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
ax.set_title('Distribution of Constraint Levels', fontsize=14)

# Annotate each bar with its count. `bar_label` positions the text a fixed
# number of points above the bar, so the gap stays the same whatever the
# y-axis scale is (unlike an offset expressed in data units).
ax.bar_label(bars, fmt='%d', padding=3, fontsize=10)

# Leave headroom above the tallest bar so its label is not clipped
ax.margins(y=0.1)

plt.tight_layout()
plt.show()

## 5. Category vs Level Analysis

In [None]:
# Contingency table: one row per category, one column per level, cells hold sample counts
cross_tab = pd.crosstab(index=df['category'], columns=df['level'])

print("Category x Level Cross-tabulation:")
print(cross_tab)

In [None]:
# Heatmap of the category-by-level table; every cell is annotated with its integer count
fig, ax = plt.subplots(figsize=(10, 6))
heatmap_options = {'annot': True, 'fmt': 'd', 'cmap': 'YlOrRd'}
sns.heatmap(cross_tab, ax=ax, **heatmap_options)

axis_label_size = 12
ax.set_xlabel('Constraint Level', fontsize=axis_label_size)
ax.set_ylabel('Category', fontsize=axis_label_size)
ax.set_title('Samples per Category and Level', fontsize=14)

plt.tight_layout()
plt.show()

## 6. Source Analysis

In [None]:
# Number of samples contributed by each source, plus the number of distinct sources
source_counts = df['source'].value_counts()
num_sources = df['source'].nunique()

print("Source Distribution:")
print(source_counts)
print(f"\nTotal unique sources: {num_sources}")

In [None]:
# A pie chart is only drawn when there is more than one source;
# otherwise the single source name is printed instead.
has_multiple_sources = len(source_counts) > 1

if not has_multiple_sources:
    print(f"Only one source: {source_counts.index[0]}")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = sns.color_palette("Set2", len(source_counts))
    ax.pie(
        source_counts.values,
        labels=source_counts.index,
        autopct='%1.1f%%',
        colors=colors,
        startangle=90,
    )
    ax.set_title('Distribution of Data Sources', fontsize=14)
    plt.tight_layout()
    plt.show()

## 7. Instruction Length Analysis

In [None]:
# Add two length measures per instruction: character count and
# whitespace-separated word count
instruction_text = df['instruction']
df['instruction_chars'] = instruction_text.str.len()
df['instruction_words'] = instruction_text.str.split().str.len()

print("Instruction Length Statistics:")
for heading, length_column in (
    ("\nCharacter count:", 'instruction_chars'),
    ("\nWord count:", 'instruction_words'),
):
    print(heading)
    print(df[length_column].describe())

In [None]:
# Side-by-side box plots of instruction length per constraint level
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column to plot, y-axis label, panel title) for the left and right panels
panels = [
    ('instruction_chars', 'Instruction Length (characters)', 'Instruction Length by Constraint Level'),
    ('instruction_words', 'Instruction Length (words)', 'Word Count by Constraint Level'),
]

# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue`; consider `hue='level', legend=False` once the minimum seaborn version
# used by this notebook is confirmed.
for panel_ax, (length_column, y_label, panel_title) in zip(axes, panels):
    sns.boxplot(x='level', y=length_column, data=df, ax=panel_ax, palette='viridis')
    panel_ax.set_xlabel('Constraint Level', fontsize=12)
    panel_ax.set_ylabel(y_label, fontsize=12)
    panel_ax.set_title(panel_title, fontsize=14)

plt.tight_layout()
plt.show()

## 8. Sample Data Inspection

Let's look at some example instructions at different constraint levels.

In [None]:
# Show one sample instruction per constraint level (the first row found at each level)
MAX_INSTRUCTION_CHARS = 500
MAX_TARGET_CHARS = 300


def _preview(text: str, limit: int) -> str:
    """Return `text` cut to `limit` characters, appending '...' only if something was cut."""
    if len(text) <= limit:
        return text
    return f"{text[:limit]}..."


for level in sorted(df['level'].unique()):
    print(f"\n{'='*80}")
    print(f"LEVEL {level} EXAMPLE:")
    print('='*80)

    sample = df[df['level'] == level].iloc[0]
    print(f"\n📌 Example ID: {sample['example_id']}")
    print(f"📂 Category: {sample['category']}")
    print(f"📝 Source: {sample['source']}")
    print(f"\n📋 INSTRUCTION:\n{_preview(sample['instruction'], MAX_INSTRUCTION_CHARS)}")
    # The target is only printed when it is non-empty
    if sample['target']:
        print(f"\n🎯 TARGET/CONSTRAINT:\n{_preview(sample['target'], MAX_TARGET_CHARS)}")

In [None]:
# View one complete, untruncated sample per category (the first row found for each)
for category in df['category'].unique():
    print(f"\n{'='*80}")
    print(f"CATEGORY: {category.upper()}")
    print('='*80)

    sample = df[df['category'] == category].iloc[0]
    print(f"\n📌 Example ID: {sample['example_id']}")
    print(f"📊 Level: {sample['level']}")
    print(f"\n📋 INSTRUCTION:\n{sample['instruction']}")
    print(f"\n🎯 TARGET:\n{sample['target']}")

## 9. Summary Statistics

In [None]:
# Final summary of the figures computed in the earlier cells.
# Relies on `category_counts`, `level_counts` and the `instruction_words`
# column having been created by previous cells.
total_samples = len(df)
word_counts = df['instruction_words']
separator = "=" * 60

print(separator)
print("FOLLOWBENCH DATASET SUMMARY")
print(separator)
print(f"\n📊 Total Samples: {total_samples}")

# Per-category counts with their share of the whole dataset
print(f"\n📁 Categories ({df['category'].nunique()}):")
for cat, count in category_counts.items():
    print(f"   - {cat}: {count} ({count/total_samples*100:.1f}%)")

# Per-level counts with their share of the whole dataset
print(f"\n📈 Levels ({df['level'].nunique()}):")
for level, count in level_counts.items():
    print(f"   - Level {level}: {count} ({count/total_samples*100:.1f}%)")

print("\n📝 Instruction Length:")
print(f"   - Average: {word_counts.mean():.1f} words")
print(f"   - Min: {word_counts.min()} words")
print(f"   - Max: {word_counts.max()} words")

print(f"\n🔗 Sources: {df['source'].nunique()}")
print(separator)