In [1]:
%pip install -U datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np


In [3]:

# Tokenizer used to measure every token count in this notebook.
model_name = "NousResearch/Llama-2-7b-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Benchmark dataset; only its "train" split is read by the cells below.
dataset = load_dataset("DONG19/EffiBench")
train_data = dataset["train"]

# Per-example token counts (description, solution, full prompt), filled in
# by the processing loop.
desc_lengths, soln_lengths, combined_lengths = [], [], []
    

In [8]:
# Process each example
#
# Measures, for every training example, the token length of the problem
# description, of the canonical solution, and of the two combined in the
# "### Input / ### Output / ### End" prompt template.

# Start from empty lists so that re-running this cell does not append a
# second copy of every example to lists created by an earlier cell.
desc_lengths = []
soln_lengths = []
combined_lengths = []

# Combined token count above which an example is reported as "very long".
LONG_EXAMPLE_THRESHOLD = 1000
# Upper bound on printed previews, so the cell output stays readable even
# when many examples exceed the threshold.
MAX_LONG_PREVIEWS = 5
long_example_count = 0

for example in train_data:
    # Get raw text
    desc = example["markdown_description"]
    soln = example["canonical_solution"]
    combined = f"### Input:\n{desc}\n\n### Output:\n{soln}\n\n### End"

    # Tokenize without padding or truncation so the full length is measured.
    # NOTE(review): add_special_tokens is left at its default, so each count
    # presumably includes any special tokens the tokenizer adds - confirm if
    # exact budgets matter.
    desc_tokens = tokenizer(desc, truncation=False, padding=False)
    soln_tokens = tokenizer(soln, truncation=False, padding=False)
    combined_tokens = tokenizer(combined, truncation=False, padding=False)

    # Compute each length once and reuse it for storage and reporting.
    desc_len = len(desc_tokens["input_ids"])
    soln_len = len(soln_tokens["input_ids"])
    combined_len = len(combined_tokens["input_ids"])

    # Store lengths
    desc_lengths.append(desc_len)
    soln_lengths.append(soln_len)
    combined_lengths.append(combined_len)

    # Print a bounded number of very long examples for inspection
    if combined_len > LONG_EXAMPLE_THRESHOLD:
        long_example_count += 1
        if long_example_count <= MAX_LONG_PREVIEWS:
            print("\nFound very long example:")
            print(f"Description length: {desc_len} tokens")
            print(f"Solution length: {soln_len} tokens")
            print(f"Combined length: {combined_len} tokens")
            print("\nDescription preview (first 100 chars):", desc[:100])
            print("Solution preview (first 100 chars):", soln[:100])

# Report the total so the capped previews above are not mistaken for the
# complete set of long examples.
print(
    f"\n{long_example_count} of {len(combined_lengths)} examples exceed "
    f"{LONG_EXAMPLE_THRESHOLD} combined tokens "
    f"(showing at most {MAX_LONG_PREVIEWS} previews)."
)



Found very long example:
Description length: 563 tokens
Solution length: 429 tokens
Combined length: 1006 tokens

Description preview (first 100 chars): 
A **transformation sequence** from word `beginWord` to word `endWord` using a dictionary `wordList`
Solution preview (first 100 chars): class Solution:
    def findLadders(
        self, beginWord: str, endWord: str, wordList: List[str]

Found very long example:
Description length: 752 tokens
Solution length: 252 tokens
Combined length: 1018 tokens

Description preview (first 100 chars): 
A city's **skyline** is the outer contour of the silhouette formed by all the buildings in that cit
Solution preview (first 100 chars): from queue import PriorityQueue


class Solution:
    def getSkyline(self, buildings: List[List[int]

Found very long example:
Description length: 1109 tokens
Solution length: 244 tokens
Combined length: 1367 tokens

Description preview (first 100 chars): 
Suppose we have a file system that stores both files and dir

In [9]:
# Calculate statistics
def _print_length_summary(title, lengths):
    """Print mean, median, max and 95th percentile for a list of token counts.

    Args:
        title: Heading printed above the statistics (without the trailing colon).
        lengths: Sequence of per-example token counts.

    If ``lengths`` is empty a short notice is printed instead of the
    statistics, because ``np.max`` raises ``ValueError`` on empty input.
    """
    print(f"\n{title}:")
    if len(lengths) == 0:
        print("No examples available; nothing to summarize.")
        return
    print(f"Mean: {np.mean(lengths):.1f}")
    print(f"Median: {np.median(lengths):.1f}")
    print(f"Max: {np.max(lengths)}")
    print(f"95th percentile: {np.percentile(lengths, 95):.1f}")


print("\nToken Length Statistics:")
_print_length_summary("Description Lengths", desc_lengths)
_print_length_summary("Solution Lengths", soln_lengths)
_print_length_summary("Combined Lengths (with prompt format)", combined_lengths)

# Print length distribution
print("\nLength Distribution (Combined):")
percentiles = [50, 75, 90, 95, 99, 100]
if len(combined_lengths) == 0:
    print("No examples available; nothing to summarize.")
else:
    # One vectorised call returns the value for every requested percentile.
    percentile_values = np.percentile(combined_lengths, percentiles)
    for p, value in zip(percentiles, percentile_values):
        print(f"{p}th percentile: {value:.1f}")


Token Length Statistics:

Description Lengths:
Mean: 584.6
Median: 535.0
Max: 2274
95th percentile: 1058.0

Solution Lengths:
Mean: 165.3
Median: 148.0
Max: 948
95th percentile: 334.3

Combined Lengths (with prompt format):
Mean: 763.9
Median: 708.0
Max: 2561
95th percentile: 1344.1

Length Distribution (Combined):
50th percentile: 708.0
75th percentile: 918.2
90th percentile: 1147.1
95th percentile: 1344.1
99th percentile: 1653.0
100th percentile: 2561.0
