# Tokenization Analysis

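This notebook measures the tokenized sequence length of the essays for each model/tokenizer configuration and each grade index (C1–C5).
The statistics help validate which models can accommodate each tokenization strategy (for example, the full-context prompts).
Note: lengths are taken from the padded `input_ids`, so whenever the tokenizer pads every example to a common length all samples report the same value (std = 0, min = max).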

In [1]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
from datasets import load_dataset
import logging

# Make the sibling ``scripts`` directory (``../scripts`` relative to the current
# working directory) importable so the project-local imports below resolve.
parent_dir = str(Path(".").resolve().parent / "scripts")
# Guard the insertion so that re-running this cell does not stack duplicate
# entries on sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from preprocess import load_tokenizer, get_tokenize_function #NOQA
from models.fine_tuning_models.model_types_enum import ModelTypesEnum #NOQA

# Root logging configuration for the notebook: INFO level with timestamped,
# logger-named records.
_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
# Logger named after this module, used by the cells below.
logger = logging.getLogger(name=__name__)

# Function to clear logs
def clear_logs():
    """Close and detach every handler on every registered logger and on the root logger.

    Called between runs so that log output does not accumulate.
    """
    # Resolve each registered logger name to its logger object, then add the
    # root logger explicitly.
    known_loggers = [
        logging.getLogger(registered_name)
        for registered_name in logging.root.manager.loggerDict
    ]
    known_loggers.append(logging.getLogger())

    for current_logger in known_loggers:
        # Walk a copy of the handler list: removeHandler mutates the original.
        for attached in list(current_logger.handlers):
            attached.close()
            current_logger.removeHandler(attached)

## Configuration

In [2]:
# Configuration parameters: dataset identifier, the value passed as the second
# argument to load_dataset, and the local cache location.
dataset_name = "kamel-usp/aes_enem_dataset"
dataset_split = "JBCS2025"
cache_dir = "/tmp/"

# One row per configuration to test:
# (model type enum member, base model id, display name, use_full_context).
# The standard variants come first, followed by the full-context variants.
_model_specs = [
    (ModelTypesEnum.PHI4_CLASSIFICATION_LORA, "microsoft/phi-4", "Phi-4", False),
    (ModelTypesEnum.PHI35_CLASSIFICATION_LORA, "microsoft/Phi-3.5-mini-instruct", "Phi-3.5", False),
    (ModelTypesEnum.LLAMA31_CLASSIFICATION_LORA, "meta-llama/Llama-3.1-8B", "Llama-3.1", False),
    (ModelTypesEnum.ENCODER_CLASSIFICATION, "neuralmind/bert-base-portuguese-cased", "Bertimbau Base", False),
    (ModelTypesEnum.ENCODER_CLASSIFICATION, "google-bert/bert-base-multilingual-cased", "Bert Multilingual", False),
    (ModelTypesEnum.PHI4_CLASSIFICATION_LORA, "microsoft/phi-4", "Phi-4-FullContext", True),
    (ModelTypesEnum.PHI35_CLASSIFICATION_LORA, "microsoft/Phi-3.5-mini-instruct", "Phi-3.5-FullContext", True),
    (ModelTypesEnum.LLAMA31_CLASSIFICATION_LORA, "meta-llama/Llama-3.1-8B", "Llama-3.1-FullContext", True),
]

# Model configurations to test, expanded into the dict form read by the
# analysis loop (keys: model_type, base_model, name, use_full_context).
model_configs = [
    {
        "model_type": type_member.value,
        "base_model": base_model_id,
        "name": display_name,
        "use_full_context": full_context,
    }
    for type_member, base_model_id, display_name, full_context in _model_specs
]

# Grade indices to test (0-4 for C1-C5)
grade_indices = list(range(5))

## Load Dataset

In [3]:
# Fetch the dataset (or reuse the copy under cache_dir). The value held in
# `dataset_split` is handed to load_dataset as its `name` argument.
dataset = load_dataset(path=dataset_name, name=dataset_split, cache_dir=cache_dir)

# Quick sanity report: size of the training split and which splits exist.
print(f"Dataset loaded with {len(dataset['train'])} training samples")
print(f"Dataset splits: {list(dataset.keys())}")

Dataset loaded with 500 training samples
Dataset splits: ['train', 'validation', 'test']


## Tokenization Analysis Functions

In [4]:
def analyze_tokenization(dataset, tokenizer, model_type, grade_index, text_column="essay_text", logger=None, use_full_context=False, count_padding=True):
    """
    Analyze tokenization for a specific model and grade index.

    Builds the project tokenize function via ``get_tokenize_function``, applies it
    to ``dataset['train']`` with ``map(..., batched=True)``, and summarizes the
    per-example sequence lengths.

    Args:
        dataset: Mapping with a ``'train'`` entry that supports
            ``.map(fn, batched=True)`` and iteration over examples.
        tokenizer: Tokenizer forwarded to ``get_tokenize_function``.
        model_type: Model type identifier forwarded to ``get_tokenize_function``.
        grade_index: Grade index forwarded to ``get_tokenize_function``.
        text_column: Name of the text column forwarded to ``get_tokenize_function``.
        logger: Optional logger forwarded to ``get_tokenize_function``.
        use_full_context: Flag forwarded to ``get_tokenize_function``.
        count_padding: When True (default, the original behavior) an example's
            length is ``len(example['input_ids'])``, so any padding added by the
            tokenize function is included. When False and the example carries an
            ``attention_mask``, the length is the sum of that mask instead; this
            assumes the usual convention of 1 for real tokens and 0 for padding
            (TODO confirm for the project tokenize function). Examples without
            an ``attention_mask`` fall back to ``len(example['input_ids'])``.

    Returns:
        Tuple ``(stats, sequence_lengths)`` where ``stats`` is a dict with keys
        ``mean``, ``std``, ``min``, ``max``, ``median``, ``p95`` and ``p99``, and
        ``sequence_lengths`` is the list of per-example lengths.

    Raises:
        ValueError: If no tokenized example contains ``input_ids``, since no
            statistics can be computed from an empty list.
    """
    # Get the tokenization function
    tokenize_function = get_tokenize_function(
        model_type=model_type,
        tokenizer=tokenizer,
        text_column=text_column,
        grade_index=grade_index,
        logger=logger,
        use_full_context=use_full_context
    )

    # Apply tokenization
    tokenized_sample = dataset['train'].map(tokenize_function, batched=True)

    # Extract sequence lengths, skipping examples that carry no input_ids
    sequence_lengths = []
    for example in tokenized_sample:
        if 'input_ids' not in example:
            continue
        if not count_padding and 'attention_mask' in example:
            # Count only the positions flagged in the attention mask.
            sequence_lengths.append(int(sum(example['attention_mask'])))
        else:
            sequence_lengths.append(len(example['input_ids']))

    # Fail with a clear message instead of numpy's "zero-size array" error.
    if not sequence_lengths:
        raise ValueError(
            "No 'input_ids' found after tokenization; cannot compute sequence length statistics"
        )

    # Calculate statistics
    stats = {
        'mean': np.mean(sequence_lengths),
        'std': np.std(sequence_lengths),
        'min': np.min(sequence_lengths),
        'max': np.max(sequence_lengths),
        'median': np.median(sequence_lengths),
        'p95': np.percentile(sequence_lengths, 95),
        'p99': np.percentile(sequence_lengths, 99)
    }

    return stats, sequence_lengths

## Run Tokenization Analysis

In [5]:
# Store results
results = []
all_sequence_lengths = {}
# Models whose analysis raised, mapped to a description of the error, so that
# configurations missing from `results` are visible after the run.
failed_models = {}

for config in model_configs:
    model_type = config['model_type']
    base_model = config['base_model']
    model_name = config['name']
    use_full_context = config.get('use_full_context', False)

    print(f"\nAnalyzing {model_name} ({base_model})...")

    try:
        # Load tokenizer
        tokenizer = load_tokenizer(model_type, base_model, cache_dir)

        for grade_index in grade_indices:
            grade_label = f"C{grade_index + 1}"
            print(f"  Processing grade {grade_label}...")

            # Clear logs before processing
            clear_logs()

            # Re-setup logger after clearing
            logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            logger = logging.getLogger(__name__)

            # Analyze tokenization
            stats, seq_lengths = analyze_tokenization(
                dataset=dataset,
                tokenizer=tokenizer,
                model_type=model_type,
                grade_index=grade_index,
                text_column="essay_text",
                logger=logger,
                use_full_context=use_full_context
            )

            # Store results
            results.append({
                'model': model_name,
                'model_type': model_type,
                'grade': grade_label,
                **stats
            })

            # Store sequence lengths for visualization
            all_sequence_lengths[f"{model_name}_{grade_label}"] = seq_lengths

    except Exception as e:
        # Best-effort run: one failing model must not abort the others, but the
        # failure is recorded (with its exception type) rather than only printed.
        # Rows already stored for earlier grades of this model are kept.
        failed_models[model_name] = repr(e)
        print(f"  Error processing {model_name}: {type(e).__name__}: {e}")
    finally:
        # Clear logs after each model, on the error path as well.
        clear_logs()

if failed_models:
    print(f"\nModels with errors: {sorted(failed_models)}")


Analyzing Phi-4 (microsoft/phi-4)...


2025-06-29 14:55:06,958 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C1...


2025-06-29 14:55:07,797 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C2...


2025-06-29 14:55:08,910 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C3...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-06-29 14:55:10,532 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C4...


2025-06-29 14:55:11,462 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C5...

Analyzing Phi-3.5 (microsoft/Phi-3.5-mini-instruct)...

Analyzing Phi-3.5 (microsoft/Phi-3.5-mini-instruct)...


2025-06-29 14:55:12,744 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C1...


2025-06-29 14:55:13,646 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C2...


2025-06-29 14:55:14,916 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C3...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-06-29 14:55:16,623 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C4...


2025-06-29 14:55:17,655 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C5...

Analyzing Llama-3.1 (meta-llama/Llama-3.1-8B)...

Analyzing Llama-3.1 (meta-llama/Llama-3.1-8B)...


2025-06-29 14:55:19,239 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C1...


2025-06-29 14:55:20,125 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C2...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-06-29 14:55:22,180 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C3...


2025-06-29 14:55:23,155 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C4...


2025-06-29 14:55:24,124 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: False


  Processing grade C5...

Analyzing Bertimbau Base (neuralmind/bert-base-portuguese-cased)...

Analyzing Bertimbau Base (neuralmind/bert-base-portuguese-cased)...


2025-06-29 14:55:25,425 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C1...


2025-06-29 14:55:25,708 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C2...


2025-06-29 14:55:25,992 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C3...


2025-06-29 14:55:26,276 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C4...


2025-06-29 14:55:26,561 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C5...

Analyzing Bert Multilingual (google-bert/bert-base-multilingual-cased)...

Analyzing Bert Multilingual (google-bert/bert-base-multilingual-cased)...


2025-06-29 14:55:27,230 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C1...


2025-06-29 14:55:27,529 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C2...


2025-06-29 14:55:27,825 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C3...


2025-06-29 14:55:28,131 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C4...


2025-06-29 14:55:28,432 - __main__ - INFO - Tokenizer function parameters- Padding:max_length; Truncation: True; Use Full Context: False


  Processing grade C5...

Analyzing Phi-4-FullContext (microsoft/phi-4)...

Analyzing Phi-4-FullContext (microsoft/phi-4)...


2025-06-29 14:55:29,090 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C1...


2025-06-29 14:55:30,251 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C2...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-06-29 14:55:32,930 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C3...


2025-06-29 14:55:34,176 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C4...


2025-06-29 14:55:35,423 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C5...

Analyzing Phi-3.5-FullContext (microsoft/Phi-3.5-mini-instruct)...

Analyzing Phi-3.5-FullContext (microsoft/Phi-3.5-mini-instruct)...


2025-06-29 14:55:37,046 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C1...


2025-06-29 14:55:38,348 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C2...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-06-29 14:55:41,171 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C3...


2025-06-29 14:55:42,607 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C4...


2025-06-29 14:55:44,029 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C5...

Analyzing Llama-3.1-FullContext (meta-llama/Llama-3.1-8B)...

Analyzing Llama-3.1-FullContext (meta-llama/Llama-3.1-8B)...


2025-06-29 14:55:46,005 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C1...


2025-06-29 14:55:47,230 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C2...


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

2025-06-29 14:55:50,001 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C3...


2025-06-29 14:55:51,317 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C4...


2025-06-29 14:55:52,643 - __main__ - INFO - Tokenizer function parameters- Padding:longest; Truncation: False; Use Full Context: True


  Processing grade C5...


In [6]:
# Create DataFrame with results
results_df = pd.DataFrame(results)
# Preview up to five rows. Capping n avoids the ValueError that sample() raises
# when asked for more rows than exist (e.g. after model failures), and the fixed
# random_state keeps the preview stable across re-runs.
results_df.sample(n=min(5, len(results_df)), random_state=42)

Unnamed: 0,model,model_type,grade,mean,std,min,max,median,p95,p99
7,Phi-3.5,phi35_classification_lora,C3,3193.0,0.0,3193,3193,3193.0,3193.0,3193.0
5,Phi-3.5,phi35_classification_lora,C1,2775.0,0.0,2775,2775,2775.0,2775.0,2775.0
3,Phi-4,phi4_classification_lora,C4,2780.0,0.0,2780,2780,2780.0,2780.0,2780.0
33,Phi-3.5-FullContext,phi35_classification_lora,C4,4474.0,0.0,4474,4474,4474.0,4474.0,4474.0
31,Phi-3.5-FullContext,phi35_classification_lora,C2,5335.0,0.0,5335,5335,5335.0,5335.0,5335.0


## Results Summary

In [7]:
# Display summary statistics
print("Average Sequence Length by Model and Grade:")
# One row per model, one column per grade; each cell holds the 'mean' value of
# the first results row found for that model/grade pair.
pivot_table = pd.pivot_table(
    results_df,
    index='model',
    columns='grade',
    values='mean',
    aggfunc='first',
)
pivot_table.round(2)

Average Sequence Length by Model and Grade:


grade,C1,C2,C3,C4,C5
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bert Multilingual,512.0,512.0,512.0,512.0,512.0
Bertimbau Base,512.0,512.0,512.0,512.0,512.0
Llama-3.1,2479.0,3470.0,2774.0,2768.0,2921.0
Llama-3.1-FullContext,3600.0,4591.0,3895.0,3889.0,4042.0
Phi-3.5,2775.0,4042.0,3193.0,3181.0,3349.0
Phi-3.5-FullContext,4068.0,5335.0,4486.0,4474.0,4642.0
Phi-4,2489.0,3481.0,2785.0,2780.0,2932.0
Phi-4-FullContext,3606.0,4598.0,3902.0,3897.0,4049.0
