In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Change the directory to the Tables folder
TABLE_DIR = '/content/drive/MyDrive/LLM causality/Tables/'
FIG_DIR = '/content/drive/MyDrive/LLM causality/Figures/'

In [3]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer
import seaborn as sns

In [4]:
import pandas as pd
import re

# Define a dictionary mapping base model families to their pretraining token counts
BASE_MODEL_TOKENS = {
    # Llama models
    'llama-3': 15.0,
    'llama-3.1': 15.0,
    'llama-3.2': 15.0,
    'llama-2': 2.0,
    'llama-1': 1.4,

    # Mistral models
    'mistral-7b': 0.8,
    'mixtral': 1.0,

    # Gemma models - corrected values
    'gemma-2-2b': 2.0,
    'gemma-2-9b': 8.0,
    'gemma-2-27b': 13.0,
    'gemma-1': 2.0,

    # Qwen models
    'qwen2.5': 18.0,
    'qwen2': 7.0,
    'qwen1': 2.0,

    # Yi models
    'yi-1.5': 3.0,
    'yi-1': 3.0,

    # Phi models
    'phi-3': 3.3,
    'phi-2': 1.4,
    'phi-1.5': 0.03,
    'phi-1': 0.006,

    # Other models
    'falcon-180b': 3.5,
    'falcon-40b': 1.0,
    'falcon-7b': 1.5,
    'glm3': 3.9,
    'glm2': 1.4,
    'deepseek2': 8.0,
    'deepseek': 2.0,
    'mpt-30b': 1.5,
    'mpt-7b': 1.0,
    'stablelm': 1.5,
    'bloom': 0.366,
    'baichuan-3': 3.2,
    'baichuan-2': 2.6
}

# Define specific parameter size ranges for common model families with relaxed boundaries
# Format: (min_size, max_size, model_identifier)
PARAM_SIZE_RANGES = [
    # Llama-2 parameter sizes (relaxed ranges)
    (6.7, 7.5, 'llama-2-7b'),
    (12.5, 13.5, 'llama-2-13b'),
    (64.0, 71.0, 'llama-2-70b'),

    # Llama-3 parameter sizes (relaxed ranges)
    (7.8, 8.3, 'llama-3-8b'),
    (69.0, 72.0, 'llama-3-70b'),

    # Mistral parameter sizes
    (7.0, 7.5, 'mistral-7b'),

    # Mixtral parameter sizes
    (45.0, 48.0, 'mixtral-8x7b'),

    # Gemma-2 parameter sizes with corrected ranges
    (1.8, 2.2, 'gemma-2-2b'),  # Added 2B model
    (8.9, 9.5, 'gemma-2-9b'),
    (26.5, 29.0, 'gemma-2-27b'),

    # Qwen2 parameter sizes
    (0.45, 0.55, 'qwen2-0.5b'),
    (1.4, 1.7, 'qwen2-1.5b'),
    (7.4, 7.8, 'qwen2-7b'),
    (14.0, 15.0, 'qwen2-14b'),
    (72.0, 73.5, 'qwen2-72b'),

    # Phi-3 parameter sizes
    (3.7, 4.0, 'phi-3-4b'),
    (6.8, 7.2, 'phi-3-7b'),
    (13.0, 15.0, 'phi-3-14b')
]

def extract_size_from_name(name):
    """
    Extract the model size (e.g., 7b, 13b, 70b) from a model name

    Args:
        name (str): The model name to analyze

    Returns:
        str: The extracted size or None if not found
    """
    # Look for common size patterns like 7b, 13b, 70b, etc.
    size_patterns = [
        r'[\-_](\d+\.?\d*)b(?!\w)',  # matches -7b, _13b, -70b, etc.
        r'(\d+\.?\d*)b[\-_]',        # matches 7b-, 13b_, etc.
        r'[\-_](\d+\.?\d*)B(?!\w)',  # matches -7B, _13B, -70B, etc.
        r'(\d+\.?\d*)B[\-_]',        # matches 7B-, 13B_, etc.
        r'/(\d+\.?\d*)[bB](?![a-zA-Z0-9])', # matches /7b, /13B, etc.
    ]

    for pattern in size_patterns:
        match = re.search(pattern, name)
        if match:
            size = match.group(1)
            return f"{size}b"

    return None

def match_param_size_to_model(param_size):
    """
    Match parameter size to known model variants

    Args:
        param_size (float): Parameter size in billions

    Returns:
        str or None: Matched model identifier or None if no match
    """
    if pd.isna(param_size):
        return None

    for min_size, max_size, model_id in PARAM_SIZE_RANGES:
        if min_size <= param_size <= max_size:
            return model_id

    return None

def get_explicit_base_model(name):
    """
    Check if the model name explicitly mentions a specific base model

    Args:
        name (str): The model name

    Returns:
        str or None: Identified base model with size or None if not found
    """
    if pd.isna(name):
        return None

    name_lower = name.lower()

    # First, check for exact matches without capture groups
    exact_patterns = {
        r'gemma[\-_]?2[\-_]?2[bB]': 'gemma-2-2b',     # Explicitly match 2B
        r'gemma[\-_]?2[\-_]?9[bB]': 'gemma-2-9b',     # Explicitly match 9B
        r'gemma[\-_]?2[\-_]?27[bB]': 'gemma-2-27b',   # Explicitly match 27B
        r'mixtral[\-_]?8x7[bB]': 'mixtral-8x7b'       # Explicitly match mixtral-8x7b
    }

    # Check exact patterns first
    for pattern, result in exact_patterns.items():
        if re.search(pattern, name_lower):
            return result

    # Then check patterns with capture groups
    # Llama models
    llama_patterns = [
        (r'llama[\-_]?3\.2[\-_]?(\d+\.?\d*)[bB]', 'llama-3.2-{}b'),
        (r'llama[\-_]?3\.1[\-_]?(\d+\.?\d*)[bB]', 'llama-3.1-{}b'),
        (r'llama[\-_]?3(?!\.)[\-_]?(\d+\.?\d*)[bB]', 'llama-3-{}b'),
        (r'llama[\-_]?2[\-_]?(\d+\.?\d*)[bB]', 'llama-2-{}b'),
        (r'llama[\-_]?1[\-_]?(\d+\.?\d*)[bB]', 'llama-1-{}b')
    ]

    # Mistral models
    mistral_patterns = [
        (r'mistral[\-_]?(\d+\.?\d*)[bB]', 'mistral-{}b')
    ]

    # Gemma models - with capture groups for variable sizes
    gemma_patterns = [
        (r'gemma[\-_]?2[\-_]?(\d+\.?\d*)[bB]', 'gemma-2-{}b'),  # For other sizes
        (r'gemma(?![\-_]?2)[\-_]?(\d+\.?\d*)[bB]', 'gemma-1-{}b')
    ]

    # Qwen models
    qwen_patterns = [
        (r'qwen[\-_]?2\.5[\-_]?(\d+\.?\d*)[bB]', 'qwen2.5-{}b'),
        (r'qwen[\-_]?2(?!\.5)[\-_]?(\d+\.?\d*)[bB]', 'qwen2-{}b'),
        (r'qwen(?![\-_]?2)[\-_]?(\d+\.?\d*)[bB]', 'qwen1-{}b')
    ]

    # Yi models
    yi_patterns = [
        (r'yi[\-_]?1\.5[\-_]?(\d+\.?\d*)[bB]', 'yi-1.5-{}b'),
        (r'yi(?![\-_]?1\.5)[\-_]?1[\-_]?(\d+\.?\d*)[bB]', 'yi-1-{}b')
    ]

    # Phi models
    phi_patterns = [
        (r'phi[\-_]?3[\-_]?(\d+\.?\d*)[bB]', 'phi-3-{}b'),
        (r'phi[\-_]?2[\-_]?(\d+\.?\d*)[bB]', 'phi-2-{}b'),
        (r'phi[\-_]?1\.5[\-_]?(\d+\.?\d*)[bB]', 'phi-1.5-{}b'),
        (r'phi[\-_]?1(?!\.5)[\-_]?(\d+\.?\d*)[bB]', 'phi-1-{}b')
    ]

    # Other models
    other_patterns = [
        (r'falcon[\-_]?(\d+\.?\d*)[bB]', 'falcon-{}b'),
        (r'mpt[\-_]?(\d+\.?\d*)[bB]', 'mpt-{}b')
    ]

    # Combine all patterns
    all_patterns = llama_patterns + mistral_patterns + gemma_patterns + qwen_patterns + yi_patterns + phi_patterns + other_patterns

    # Check each pattern
    for pattern, template in all_patterns:
        match = re.search(pattern, name_lower)
        if match:
            size = match.group(1)
            return template.format(size)

    # Special cases that don't follow the standard pattern
    if 'mixtral' in name_lower and '8x7b' in name_lower:
        return 'mixtral-8x7b'

    return None

def extract_base_model_from_name(model_name, param_size=None):
    """
    Extract the base model including size from a model name

    Args:
        model_name (str): The model name to analyze
        param_size (float, optional): Parameter size in billions

    Returns:
        tuple: (base_model_family, full_base_model_with_size)
    """
    if pd.isna(model_name):
        return None, None

    name = model_name.lower()

    # First, check for explicit base model mention
    explicit_base = get_explicit_base_model(name)
    if explicit_base:
        # Extract the family from the explicit base
        family_match = re.match(r'([a-z0-9\.\-]+)-\d+', explicit_base)
        if family_match:
            family = family_match.group(1)

            # Special handling for Gemma-2 with different sizes
            if family == 'gemma-2':
                size_match = re.search(r'gemma-2-(\d+)b', explicit_base)
                if size_match:
                    size = size_match.group(1)
                    if size == '2':
                        return 'gemma-2-2b', explicit_base
                    elif size == '9':
                        return 'gemma-2-9b', explicit_base
                    elif size == '27':
                        return 'gemma-2-27b', explicit_base

            return family, explicit_base

    # If no explicit base, try to identify the family
    base_model_family = None

    # Try to match base models in descending order of specificity
    if re.search(r'llama[\-_]?3\.2|llama[/\\]3\.2', name):
        base_model_family = 'llama-3.2'
    elif re.search(r'llama[\-_]?3\.1|llama[/\\]3\.1', name):
        base_model_family = 'llama-3.1'
    elif re.search(r'llama[\-_]?3(?!\.)|llama[/\\]3(?!\.)', name):
        base_model_family = 'llama-3'
    elif re.search(r'llama[\-_]?2|llama[/\\]2', name):
        base_model_family = 'llama-2'
    elif re.search(r'llama[\-_]?1|llama[/\\]1', name):
        base_model_family = 'llama-1'
    elif re.search(r'mistral[\-_]?7b|mistral[/\\]7b', name):
        base_model_family = 'mistral-7b'
    elif 'mixtral' in name:
        base_model_family = 'mixtral'
    elif re.search(r'gemma[\-_]?2[\-_]?2b|gemma[/\\]2[\-_]?2b', name):
        base_model_family = 'gemma-2-2b'
    elif re.search(r'gemma[\-_]?2[\-_]?9b|gemma[/\\]2[\-_]?9b', name):
        base_model_family = 'gemma-2-9b'
    elif re.search(r'gemma[\-_]?2[\-_]?27b|gemma[/\\]2[\-_]?27b', name):
        base_model_family = 'gemma-2-27b'
    elif re.search(r'gemma[\-_]?2|gemma[/\\]2', name):
        # If gemma-2 without size, we need to determine from param_size
        if param_size:
            if 1.8 <= param_size <= 2.2:
                base_model_family = 'gemma-2-2b'
            elif 8.9 <= param_size <= 9.5:
                base_model_family = 'gemma-2-9b'
            elif 26.5 <= param_size <= 28.5:
                base_model_family = 'gemma-2-27b'
    elif re.search(r'gemma[\-_]?1|gemma[/\\]1', name):
        base_model_family = 'gemma-1'
    elif re.search(r'qwen[\-_]?2\.5|qwen[/\\]2\.5', name):
        base_model_family = 'qwen2.5'
    elif re.search(r'qwen[\-_]?2(?!\.)|qwen[/\\]2(?!\.)', name):
        base_model_family = 'qwen2'
    elif re.search(r'qwen[\-_]?1|qwen[/\\]1', name):
        base_model_family = 'qwen1'
    elif re.search(r'yi[\-_]?1\.5|yi[/\\]1\.5', name):
        base_model_family = 'yi-1.5'
    elif re.search(r'yi[\-_]?1(?!\.)|yi[/\\]1(?!\.)', name):
        base_model_family = 'yi-1'
    elif re.search(r'phi[\-_]?3|phi[/\\]3', name):
        base_model_family = 'phi-3'
    elif re.search(r'phi[\-_]?2|phi[/\\]2', name):
        base_model_family = 'phi-2'
    elif re.search(r'phi[\-_]?1\.5|phi[/\\]1\.5', name):
        base_model_family = 'phi-1.5'
    elif re.search(r'phi[\-_]?1(?!\.)|phi[/\\]1(?!\.)', name):
        base_model_family = 'phi-1'
    elif re.search(r'falcon[\-_]?180b|falcon[/\\]180b', name):
        base_model_family = 'falcon-180b'
    elif re.search(r'falcon[\-_]?40b|falcon[/\\]40b', name):
        base_model_family = 'falcon-40b'
    elif re.search(r'falcon[\-_]?7b|falcon[/\\]7b', name):
        base_model_family = 'falcon-7b'
    elif re.search(r'chatglm3|glm3', name):
        base_model_family = 'glm3'
    elif re.search(r'chatglm2|glm2', name):
        base_model_family = 'glm2'
    elif re.search(r'deepseek[\-_]?llm[\-_]?2|deepseek2', name):
        base_model_family = 'deepseek2'
    elif 'deepseek' in name:
        base_model_family = 'deepseek'
    elif re.search(r'mpt[\-_]?30b|mpt[/\\]30b', name):
        base_model_family = 'mpt-30b'
    elif re.search(r'mpt[\-_]?7b|mpt[/\\]7b', name):
        base_model_family = 'mpt-7b'
    elif 'stablelm' in name:
        base_model_family = 'stablelm'
    elif 'bloom' in name:
        base_model_family = 'bloom'
    elif re.search(r'baichuan[\-_]?3|baichuan[/\\]3', name):
        base_model_family = 'baichuan-3'
    elif re.search(r'baichuan[\-_]?2|baichuan[/\\]2', name):
        base_model_family = 'baichuan-2'

    # If we identified a base model family
    if base_model_family:
        # Check if the base model already includes size info (like mistral-7b)
        if re.search(r'\d+b$', base_model_family):
            return base_model_family, base_model_family

        # If size is in the name, use it
        size_from_name = extract_size_from_name(name)
        if size_from_name:
            return base_model_family, f"{base_model_family}-{size_from_name}"

        # If size isn't in the name but we have param_size, use that
        if param_size:
            # Try to match to known parameter size ranges
            param_match = match_param_size_to_model(param_size)
            if param_match and param_match.startswith(base_model_family):
                return base_model_family, param_match

            # Fallback to generic size format
            return base_model_family, f"{base_model_family}-{int(param_size)}b"

        # Return just the family if we can't determine size
        return base_model_family, base_model_family

    # No base model identified
    return None, None

def architecture_to_base_model(architecture, param_size, model_name):
    """
    Map architecture and parameter size to a likely base model with size

    Args:
        architecture (str): Model architecture
        param_size (float): Parameter size in billions
        model_name (str): The model name (to check for Qwen version)

    Returns:
        tuple: (base_model_family, full_base_model_with_size)
    """
    if pd.isna(architecture) or pd.isna(param_size):
        return None, None

    arch = architecture.lower()
    name_lower = str(model_name).lower() if not pd.isna(model_name) else ""

    # Check if we can match the parameter size to a known model
    param_match = match_param_size_to_model(param_size)

    # For LlamaForCausalLM, be more careful
    if 'llama' in arch:
        # Only match if parameter size fits known Llama models
        if param_match and ('llama-2' in param_match or 'llama-3' in param_match):
            family = param_match.rsplit('-', 1)[0]  # Extract family from match
            return family, param_match
        return None, None  # Don't guess if size doesn't match

    # For non-Llama architectures, we can be more confident
    if 'mistral' in arch and not 'mixtral' in arch:
        if 7.0 <= param_size <= 7.5:
            return 'mistral-7b', 'mistral-7b'

    if 'mixtral' in arch or 'mistral' in arch and 'moe' in name_lower:
        if 45.0 <= param_size <= 48.0:
            return 'mixtral', 'mixtral-8x7b'

    if 'gemma2' in arch:
        if 1.8 <= param_size <= 2.2:
            return 'gemma-2-2b', 'gemma-2-2b'
        elif 8.9 <= param_size <= 9.5:
            return 'gemma-2-9b', 'gemma-2-9b'
        elif 27.0 <= param_size <= 29.0:
            return 'gemma-2-27b', 'gemma-2-27b'

    if 'gemma' in arch and 'gemma2' not in arch:
        if 2.0 <= param_size <= 3.0:
            return 'gemma-1', 'gemma-1-2b'
        elif 7.0 <= param_size <= 9.0:
            return 'gemma-1', 'gemma-1-7b'

    # For Qwen2, check the name first to disambiguate between Qwen2 and Qwen2.5
    if 'qwen2' in arch:
        if 'qwen2.5' in name_lower:
            base_family = 'qwen2.5'
        elif 3.0 <= param_size <= 3.2 or 14.0 <= param_size <= 15.0 or 32.0 <= param_size <= 35.0:
            base_family = 'qwen2.5'
        elif 'qwen2' in name_lower:
            base_family = 'qwen2'
        else:
            # If name doesn't clarify, default to Qwen2 (the architecture name)
            base_family = 'qwen2'

        # Now determine the size
        if param_match and param_match.startswith(base_family):
            return base_family, param_match
        elif 0.45 <= param_size <= 0.55:
            return base_family, f"{base_family}-0.5b"
        elif 1.4 <= param_size <= 1.7:
            return base_family, f"{base_family}-1.5b"
        elif 3.0 <= param_size <= 3.2:
            return base_family, f"{base_family}-3b"
        elif 7.4 <= param_size <= 7.8:
            return base_family, f"{base_family}-7b"
        elif 14.0 <= param_size <= 15.0:
            return base_family, f"{base_family}-14b"
        elif 72.0 <= param_size <= 73.5:
            return base_family, f"{base_family}-72b"

    if 'falcon' in arch:
        if 175.0 <= param_size <= 185.0:
            return 'falcon-180b', 'falcon-180b'
        elif 39.0 <= param_size <= 41.0:
            return 'falcon-40b', 'falcon-40b'
        elif 6.5 <= param_size <= 7.5:
            return 'falcon-7b', 'falcon-7b'

    if 'phi' in arch:
        if 'phi-3' in name_lower:
            base_family = 'phi-3'
        elif 'phi-2' in name_lower:
            base_family = 'phi-2'
        elif 'phi-1.5' in name_lower:
            base_family = 'phi-1.5'
        elif 'phi-1' in name_lower and 'phi-1.5' not in name_lower:
            base_family = 'phi-1'
        else:
            # If name doesn't clarify, try to infer from param size
            if param_size > 10:
                base_family = 'phi-3'
            elif param_size > 2:
                base_family = 'phi-2'
            elif param_size > 1:
                base_family = 'phi-1.5'
            else:
                base_family = 'phi-1'

        if param_match and param_match.startswith(base_family):
            return base_family, param_match
        else:
            return base_family, f"{base_family}-{int(param_size)}b"

    # No match found
    return None, None

def determine_pretraining_tokens(row):
    """
    Comprehensive approach to determine pretraining token count

    Args:
        row: A pandas DataFrame row containing model information

    Returns:
        tuple: (token_count, base_model_family, full_base_model_with_size)
    """
    # Extract relevant fields
    model_name = row.get('fullname', '')
    architecture = row.get('Architecture', '')
    param_size = row.get('#Params (B)', 0)
    base_model_field = row.get('Base Model', '')

    # Check for NaN values
    if pd.isna(model_name):
        return None, None, None

    # 1. First check if the name explicitly mentions a base model
    explicit_base = get_explicit_base_model(str(model_name))
    if explicit_base:
        family_match = re.match(r'([a-z0-9\.\-]+)-\d+', explicit_base)
        if family_match:
            family = family_match.group(1)
            if family in BASE_MODEL_TOKENS:
                return BASE_MODEL_TOKENS[family], family, explicit_base

    # 2. Check if Base Model field contains useful information
    if not pd.isna(base_model_field):
        explicit_base = get_explicit_base_model(str(base_model_field))
        if explicit_base:
            family_match = re.match(r'([a-z0-9\.\-]+)-\d+', explicit_base)
            if family_match:
                family = family_match.group(1)
                if family in BASE_MODEL_TOKENS:
                    return BASE_MODEL_TOKENS[family], family, explicit_base

        base_family, base_with_size = extract_base_model_from_name(str(base_model_field), param_size)
        if base_family and base_family in BASE_MODEL_TOKENS:
            return BASE_MODEL_TOKENS[base_family], base_family, base_with_size

    # 3. Try to extract base model from the model name
    base_family, base_with_size = extract_base_model_from_name(model_name, param_size)
    if base_family and base_family in BASE_MODEL_TOKENS:
        return BASE_MODEL_TOKENS[base_family], base_family, base_with_size

    # 4. Use architecture and parameter size
    if not pd.isna(architecture) and not pd.isna(param_size):
        base_family, base_with_size = architecture_to_base_model(architecture, param_size, model_name)
        if base_family and base_family in BASE_MODEL_TOKENS:
            return BASE_MODEL_TOKENS[base_family], base_family, base_with_size

    # No match found
    return None, None, None

def process_csv(input_file, output_file):
    """
    Process a CSV file to add pretraining token information

    Args:
        input_file (str): Path to input CSV file
        output_file (str): Path to save the output CSV file
    """
    # Read the CSV file
    df = pd.read_csv(input_file)

    # Apply the function to determine pretraining tokens and base models
    results = df.apply(determine_pretraining_tokens, axis=1)

    # Unpack the results
    df['Pretraining tokens (T)'] = [r[0] for r in results]
    df['Base model family'] = [r[1] for r in results]
    df['Identified base model'] = [r[2] for r in results]

    # Save the updated dataframe to a new CSV file
    df.to_csv(output_file, index=False)

    # Print summary statistics
    token_counts = df['Pretraining tokens (T)'].value_counts().sort_index()
    matched = df['Pretraining tokens (T)'].notna().sum()
    total = len(df)

    print(f"Matched {matched} out of {total} models ({matched/total*100:.2f}%)")
    print("\nDistribution of pretraining token counts:")
    for tokens, count in token_counts.items():
        print(f"{tokens} trillion: {count} models")

    # Print base model statistics
    base_model_counts = df['Base model family'].value_counts().sort_values(ascending=False)
    print("\nDistribution of identified base model families:")
    for base_model, count in base_model_counts.items():
        if pd.isna(base_model):
            print(f"Unknown: {count} models")
        else:
            print(f"{base_model}: {count} models")

    # Print some examples for verification
    print("\nSample models for each token value:")
    for tokens in token_counts.index:
        examples = df[df['Pretraining tokens (T)'] == tokens][['fullname', 'Identified base model', 'Architecture', '#Params (B)']].head(3)
        print(f"\n{tokens} trillion tokens examples:")
        for _, example in examples.iterrows():
            print(f"  - {example['fullname']} (Base: {example['Identified base model']}, Arch: {example['Architecture']}, Params: {example['#Params (B)']}B)")

process_csv(TABLE_DIR + 'open_llm_leaderboard.csv', TABLE_DIR + 'open_llm_leaderboard_with_tokens.csv')
# process_csv(TABLE_DIR + 'open_llm_leaderboard_old.csv', TABLE_DIR + 'open_llm_leaderboard_old_with_tokens.csv')


Matched 3360 out of 4576 models (73.43%)

Distribution of pretraining token counts:
0.006 trillion: 4 models
0.366 trillion: 6 models
0.8 trillion: 306 models
1.0 trillion: 26 models
1.4 trillion: 71 models
1.5 trillion: 12 models
2.0 trillion: 291 models
3.0 trillion: 21 models
3.3 trillion: 69 models
7.0 trillion: 351 models
8.0 trillion: 135 models
13.0 trillion: 25 models
15.0 trillion: 1252 models
18.0 trillion: 791 models

Distribution of identified base model families:
llama-3: 807 models
qwen2.5: 791 models
qwen2: 351 models
mistral-7b: 306 models
llama-3.1: 278 models
llama-3.2: 167 models
gemma-2-9b: 135 models
llama-2: 118 models
qwen1: 89 models
phi-3: 69 models
phi-2: 51 models
gemma-1: 38 models
gemma-2-2b: 29 models
gemma-2-27b: 25 models
mixtral: 24 models
yi-1.5: 20 models
llama-1: 20 models
deepseek: 17 models
stablelm: 7 models
bloom: 6 models
falcon-7b: 5 models
phi-1: 4 models
falcon-40b: 2 models
yi-1: 1 models

Sample models for each token value:

0.006 trillion 