In [1]:
from datasets import load_dataset

# Load the SQuAD dataset
dataset = load_dataset('rajpurkar/squad_v2')

In [None]:
################ for Qwen2.5 ##########################

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    question_tokens_list = []
    answer_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        question = example['question']
        answers = example['answers']['text'] 
        if answers: 
            answer = answers[0] 
        else:
            answer = ""
        
        question_tokens = tokenize_and_count(question)
        answer_tokens = tokenize_and_count(answer)
        total_tokens += question_tokens + answer_tokens
        question_tokens_list.append(question_tokens)
        answer_tokens_list.append(answer_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_question_tokens = np.mean(question_tokens_list)
    average_answer_tokens = np.mean(answer_tokens_list)
    std_dev_tokens_per_sample = np.std([q + a for q, a in zip(question_tokens_list, answer_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per question": average_question_tokens,
        "Average tokens per answer": average_answer_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

results = {}
for split in dataset.keys():
    results[split] = compute_statistics(dataset[split])

for split, stats in results.items():
    print(f"Statistics for {split} split:")
    for key, value in stats.items():
        print(f"{key}: {value}")
    print("\n")

Statistics for train split:
Number of samples: 130319
Total number of tokens: 2082288
Average tokens per sample: 15.978391485508636
Average tokens per question: 12.582186787805309
Average tokens per answer: 3.396204697703328
Standard deviation of tokens per sample: 6.075638479438013


Statistics for validation split:
Number of samples: 11873
Total number of tokens: 179075
Average tokens per sample: 15.082540217299755
Average tokens per question: 12.580308262444202
Average tokens per answer: 2.5022319548555547
Standard deviation of tokens per sample: 5.67542834227307




In [None]:
################ for Pythia-2.8b ##########################

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")

def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    question_tokens_list = []
    answer_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        question = example['question']
        answers = example['answers']['text'] 
        if answers: 
            answer = answers[0] 
        else:
            answer = ""
        
        question_tokens = tokenize_and_count(question)
        answer_tokens = tokenize_and_count(answer)
        total_tokens += question_tokens + answer_tokens
        question_tokens_list.append(question_tokens)
        answer_tokens_list.append(answer_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_question_tokens = np.mean(question_tokens_list)
    average_answer_tokens = np.mean(answer_tokens_list)
    std_dev_tokens_per_sample = np.std([q + a for q, a in zip(question_tokens_list, answer_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per question": average_question_tokens,
        "Average tokens per answer": average_answer_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

results = {}
for split in dataset.keys():
    results[split] = compute_statistics(dataset[split])

for split, stats in results.items():
    print(f"Statistics for {split} split:")
    for key, value in stats.items():
        print(f"{key}: {value}")
    print("\n")

Statistics for train split:
Number of samples: 130319
Total number of tokens: 2024789
Average tokens per sample: 15.53717416493374
Average tokens per question: 12.403678665428679
Average tokens per answer: 3.133495499505061
Standard deviation of tokens per sample: 6.515199273577803


Statistics for validation split:
Number of samples: 11873
Total number of tokens: 174645
Average tokens per sample: 14.709424745220248
Average tokens per question: 12.376400235829193
Average tokens per answer: 2.3330245093910555
Standard deviation of tokens per sample: 5.432816823853024




In [None]:


import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    question_tokens_list = []
    answer_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        question = example['question']
        answers = example['answer']['value'] 
        if isinstance(answers, list): 
            answer = answers[0] if answers else ""
        else:
            answer = answers

        question_tokens = tokenize_and_count(question)
        answer_tokens = tokenize_and_count(answer)
        total_tokens += question_tokens + answer_tokens
        question_tokens_list.append(question_tokens)
        answer_tokens_list.append(answer_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_question_tokens = np.mean(question_tokens_list)
    average_answer_tokens = np.mean(answer_tokens_list)
    std_dev_tokens_per_sample = np.std([q + a for q, a in zip(question_tokens_list, answer_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per question": average_question_tokens,
        "Average tokens per answer": average_answer_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

configs = ["rc", "rc.nocontext", "unfiltered", "unfiltered.nocontext"]
results = {}

for config in configs:
    dataset = load_dataset("mandarjoshi/trivia_qa", config)
    config_results = {}
    for split in dataset.keys():
        config_results[split] = compute_statistics(dataset[split])
    results[config] = config_results

for config, splits in results.items():
    print(f"Results for configuration: {config}")
    for split, stats in splits.items():
        print(f"  Split: {split}")
        for key, value in stats.items():
            print(f"    {key}: {value}")
        print()


Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/55.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.34M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/138384 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/17944 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17210 [00:00<?, ? examples/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/47 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/47 [00:00<?, ?files/s]

Downloading data:   0%|          | 0.00/212M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/226M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/259M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/229M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/235M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/316M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/300M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/266M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/295M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/251M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87622 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11313 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10832 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/36 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/33.2M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.39M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/762k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87622 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11313 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10832 [00:00<?, ? examples/s]

Results for configuration: rc
  Split: train
    Number of samples: 138384
    Total number of tokens: 3387283
    Average tokens per sample: 24.477417909584922
    Average tokens per question: 20.75877269048445
    Average tokens per answer: 3.718645219100474
    Standard deviation of tokens per sample: 11.268288609224047

  Split: validation
    Number of samples: 17944
    Total number of tokens: 439487
    Average tokens per sample: 24.492142220240748
    Average tokens per question: 20.784106107891215
    Average tokens per answer: 3.708036112349532
    Standard deviation of tokens per sample: 11.268767206697019

  Split: test
    Number of samples: 17210
    Total number of tokens: 375978
    Average tokens per sample: 21.846484601975597
    Average tokens per question: 20.846484601975597
    Average tokens per answer: 1.0
    Standard deviation of tokens per sample: 10.875440809538137

Results for configuration: rc.nocontext
  Split: train
    Number of samples: 138384
    Total

In [7]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    question_tokens_list = []
    answer_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        question = example['question']
        answer = example['answers']['text'][0] 
        question_tokens = tokenize_and_count(question)
        answer_tokens = tokenize_and_count(answer)
        total_tokens += question_tokens + answer_tokens
        question_tokens_list.append(question_tokens)
        answer_tokens_list.append(answer_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_question_tokens = np.mean(question_tokens_list)
    average_answer_tokens = np.mean(answer_tokens_list)
    std_dev_tokens_per_sample = np.std([q + a for q, a in zip(question_tokens_list, answer_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per question": average_question_tokens,
        "Average tokens per answer": average_answer_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

# Available configurations
configs = [ 'mlqa.en.en']

results = {}

for config in configs:
    dataset = load_dataset("facebook/mlqa", config)
    config_results = {}
    for split in dataset.keys():
        config_results[split] = compute_statistics(dataset[split])
    results[config] = config_results

for config, splits in results.items():
    print(f"Results for configuration: {config}")
    for split, stats in splits.items():
        print(f"  Split: {split}")
        for key, value in stats.items():
            print(f"    {key}: {value}")
        print()

Results for configuration: mlqa.en.en
  Split: test
    Number of samples: 11590
    Total number of tokens: 184175
    Average tokens per sample: 15.890854184641933
    Average tokens per question: 10.623468507333909
    Average tokens per answer: 5.267385677308024
    Standard deviation of tokens per sample: 6.207366622975092

  Split: validation
    Number of samples: 1148
    Total number of tokens: 18292
    Average tokens per sample: 15.933797909407666
    Average tokens per question: 10.85191637630662
    Average tokens per answer: 5.081881533101045
    Standard deviation of tokens per sample: 6.0972782413295485



In [8]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    question_tokens_list = []
    answer_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        question = example['question']
        answer = example['answers']['text'][0] 
        question_tokens = tokenize_and_count(question)
        answer_tokens = tokenize_and_count(answer)
        total_tokens += question_tokens + answer_tokens
        question_tokens_list.append(question_tokens)
        answer_tokens_list.append(answer_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_question_tokens = np.mean(question_tokens_list)
    average_answer_tokens = np.mean(answer_tokens_list)
    std_dev_tokens_per_sample = np.std([q + a for q, a in zip(question_tokens_list, answer_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per question": average_question_tokens,
        "Average tokens per answer": average_answer_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

# Available configurations
configs = [ 'mlqa.de.de']

results = {}

for config in configs:
    dataset = load_dataset("facebook/mlqa", config)
    config_results = {}
    for split in dataset.keys():
        config_results[split] = compute_statistics(dataset[split])
    results[config] = config_results

for config, splits in results.items():
    print(f"Results for configuration: {config}")
    for split, stats in splits.items():
        print(f"  Split: {split}")
        for key, value in stats.items():
            print(f"    {key}: {value}")
        print()

Results for configuration: mlqa.de.de
  Split: test
    Number of samples: 4517
    Total number of tokens: 101868
    Average tokens per sample: 22.55213637369936
    Average tokens per question: 14.772636705778172
    Average tokens per answer: 7.779499667921186
    Standard deviation of tokens per sample: 10.541430453552449

  Split: validation
    Number of samples: 512
    Total number of tokens: 11547
    Average tokens per sample: 22.552734375
    Average tokens per question: 14.966796875
    Average tokens per answer: 7.5859375
    Standard deviation of tokens per sample: 10.359417531198043



In [6]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")

def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    question_tokens_list = []
    answer_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        question = example['question']
        answer = example['answers']['text'][0] 
        question_tokens = tokenize_and_count(question)
        answer_tokens = tokenize_and_count(answer)
        total_tokens += question_tokens + answer_tokens
        question_tokens_list.append(question_tokens)
        answer_tokens_list.append(answer_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_question_tokens = np.mean(question_tokens_list)
    average_answer_tokens = np.mean(answer_tokens_list)
    std_dev_tokens_per_sample = np.std([q + a for q, a in zip(question_tokens_list, answer_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per question": average_question_tokens,
        "Average tokens per answer": average_answer_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

# Available configurations
configs = [ 'mlqa.en.en']

results = {}

for config in configs:
    dataset = load_dataset("facebook/mlqa", config)
    config_results = {}
    for split in dataset.keys():
        config_results[split] = compute_statistics(dataset[split])
    results[config] = config_results

for config, splits in results.items():
    print(f"Results for configuration: {config}")
    for split, stats in splits.items():
        print(f"  Split: {split}")
        for key, value in stats.items():
            print(f"    {key}: {value}")
        print()

Results for configuration: mlqa.en.en
  Split: test
    Number of samples: 11590
    Total number of tokens: 177089
    Average tokens per sample: 15.279465056082831
    Average tokens per question: 10.537877480586713
    Average tokens per answer: 4.741587575496117
    Standard deviation of tokens per sample: 6.07776618660986

  Split: validation
    Number of samples: 1148
    Total number of tokens: 17611
    Average tokens per sample: 15.340592334494774
    Average tokens per question: 10.774390243902438
    Average tokens per answer: 4.566202090592334
    Standard deviation of tokens per sample: 6.035163251143683



In [1]:
######################### For pythia model in zero-shot dataset ########################

import os
import json
import numpy as np
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")
#tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

# Function to tokenize and count tokens
def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

# Function to compute statistics for a dataset split
def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    prompt_tokens_list = []
    output_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        prompt = example['prompt']
        output = example['output']
        
        prompt_tokens = tokenize_and_count(prompt)
        output_tokens = tokenize_and_count(output)
        
        total_tokens += prompt_tokens + output_tokens
        prompt_tokens_list.append(prompt_tokens)
        output_tokens_list.append(output_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_prompt_tokens = np.mean(prompt_tokens_list)
    average_output_tokens = np.mean(output_tokens_list)
    std_dev_tokens_per_sample = np.std([p + o for p, o in zip(prompt_tokens_list, output_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per prompt": average_prompt_tokens,
        "Average tokens per output": average_output_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

# Load the dataset from a given directory
def load_dataset_from_path(dataset_path, splits):
    dataset = {}
    for split in splits:
        split_path = os.path.join(dataset_path, f"{split}.jsonl")
        if os.path.exists(split_path):
            with open(split_path, 'r', encoding='utf-8') as f:
                dataset[split] = [json.loads(line) for line in f]
    return dataset

# Define the paths
train_val_path = "/home/IAIS/jdatta/distillm-new/processed_data_pythia70m/squad_v2/full/pythia/"  
eval_path = "/home/IAIS/jdatta/distillm-new/data/squad_json/"                

# Load datasets
train_val_splits = ['train', 'valid']
eval_splits = ['eval']

train_val_dataset = load_dataset_from_path(train_val_path, train_val_splits)
eval_dataset = load_dataset_from_path(eval_path, eval_splits)

# Compute statistics
results = {}

# Training and validation splits
for split, data_split in train_val_dataset.items():
    results[split] = compute_statistics(data_split)

# Evaluation split
for split, data_split in eval_dataset.items():
    results[split] = compute_statistics(data_split)

# Print results
for split, stats in results.items():
    print(f"Statistics for split: {split}")
    for key, value in stats.items():
        print(f"  {key}: {value}")
    print()

Statistics for split: train
  Number of samples: 104255
  Total number of tokens: 19312379
  Average tokens per sample: 185.24175339312262
  Average tokens per prompt: 180.7788499352549
  Average tokens per output: 4.4629034578677285
  Standard deviation of tokens per sample: 66.86586606323264

Statistics for split: valid
  Number of samples: 26064
  Total number of tokens: 4838499
  Average tokens per sample: 185.63915745856355
  Average tokens per prompt: 181.1476749539595
  Average tokens per output: 4.491482504604051
  Standard deviation of tokens per sample: 68.11135825870062

Statistics for split: eval
  Number of samples: 11873
  Total number of tokens: 2291926
  Average tokens per sample: 193.03680619893876
  Average tokens per prompt: 188.7009180493557
  Average tokens per output: 4.335888149583088
  Standard deviation of tokens per sample: 75.52868821632072



In [2]:
######################### For pythia model in one-shot dataset ########################

import os
import json
import numpy as np
from transformers import AutoTokenizer

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")
#tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B")

# Function to tokenize and count tokens
def tokenize_and_count(text):
    return len(tokenizer.tokenize(text))

# Function to compute statistics for a dataset split
def compute_statistics(data_split):
    num_samples = 0
    total_tokens = 0
    prompt_tokens_list = []
    output_tokens_list = []
    
    for example in data_split:
        num_samples += 1
        prompt = example['prompt']
        output = example['output']
        
        prompt_tokens = tokenize_and_count(prompt)
        output_tokens = tokenize_and_count(output)
        
        total_tokens += prompt_tokens + output_tokens
        prompt_tokens_list.append(prompt_tokens)
        output_tokens_list.append(output_tokens)
    
    average_tokens_per_sample = total_tokens / num_samples
    average_prompt_tokens = np.mean(prompt_tokens_list)
    average_output_tokens = np.mean(output_tokens_list)
    std_dev_tokens_per_sample = np.std([p + o for p, o in zip(prompt_tokens_list, output_tokens_list)])
    
    return {
        "Number of samples": num_samples,
        "Total number of tokens": total_tokens,
        "Average tokens per sample": average_tokens_per_sample,
        "Average tokens per prompt": average_prompt_tokens,
        "Average tokens per output": average_output_tokens,
        "Standard deviation of tokens per sample": std_dev_tokens_per_sample
    }

# Load the dataset from a given directory
def load_dataset_from_path(dataset_path, splits):
    dataset = {}
    for split in splits:
        split_path = os.path.join(dataset_path, f"{split}.jsonl")
        if os.path.exists(split_path):
            with open(split_path, 'r', encoding='utf-8') as f:
                dataset[split] = [json.loads(line) for line in f]
    return dataset

# Define the paths
train_val_path = "/home/IAIS/jdatta/distillm-new/processed_data_pythia70m/squad_1shot/full/pythia/"  
eval_path = "/home/IAIS/jdatta/distillm-new/data/squad_1shot/"                

# Load datasets
train_val_splits = ['train', 'valid']
eval_splits = ['eval']

train_val_dataset = load_dataset_from_path(train_val_path, train_val_splits)
eval_dataset = load_dataset_from_path(eval_path, eval_splits)

# Compute statistics
results = {}

# Training and validation splits
for split, data_split in train_val_dataset.items():
    results[split] = compute_statistics(data_split)

# Evaluation split
for split, data_split in eval_dataset.items():
    results[split] = compute_statistics(data_split)

# Print results
for split, stats in results.items():
    print(f"Statistics for split: {split}")
    for key, value in stats.items():
        print(f"  {key}: {value}")
    print()

Statistics for split: train
  Number of samples: 104175
  Total number of tokens: 39348648
  Average tokens per sample: 377.71680345572355
  Average tokens per prompt: 373.247813774898
  Average tokens per output: 4.468989680825534
  Standard deviation of tokens per sample: 99.39265197669258

Statistics for split: valid
  Number of samples: 26044
  Total number of tokens: 9840261
  Average tokens per sample: 377.8321686376901
  Average tokens per prompt: 373.36376900629705
  Average tokens per output: 4.468399631393027
  Standard deviation of tokens per sample: 99.85293082755062

Statistics for split: eval
  Number of samples: 11773
  Total number of tokens: 4647037
  Average tokens per sample: 394.71986749341715
  Average tokens per prompt: 390.3873269345112
  Average tokens per output: 4.332540558905971
  Standard deviation of tokens per sample: 109.88386266347925



In [4]:
from datasets import load_dataset

dataset = load_dataset("mandarjoshi/trivia_qa",'rc')

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

In [8]:
print(dataset['train'][0])

{'question': 'Which American-born Sinclair won the Nobel Prize for Literature in 1930?', 'question_id': 'tc_1', 'question_source': 'http://www.triviacountry.com/', 'entity_pages': {'doc_source': [], 'filename': [], 'title': [], 'wiki_context': []}, 'search_results': {'description': ['The Nobel Prize in Literature 1930 Sinclair ... The Nobel Prize in Literature 1930 was awarded to ... nobelprize.org/nobel_prizes/literature/laureates/1930/>', 'Why Don’t More Americans Win the Nobel Prize? By . ... When the Nobel Prize in Literature was awarded to Sinclair ... In 1930, Lewis told his Nobel audience that ...', '... Sauk Centre native Sinclair Lewis became the first American to be awarded a Nobel Prize for Literature. ... in 1930, Sauk Centre native Sinclair Lewis became the ...', 'Sinclair Lewis - Nobel Prize in Literature, 1930 (20 books) Type ... Literature Fiction Classics Short Stories Essays American literature Nobel Prize Uploaded: 2015 ...', "The Nobel Prize in Literature 1930 Sincl

In [5]:
from transformers import AutoTokenizer
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', trust_remote_code=True)

def count_tokens(text):
    return len(tokenizer.encode(text, add_special_tokens=False))

stats = {
    'split': [],
    'num_samples': [],
    'total_tokens': [],
    'average_tokens_per_sample': []
}

for split in dataset.keys():
    num_samples = len(dataset[split])
    total_tokens = 0
    
    for sample in dataset[split]:
        context = ' '.join(sample['search_results']['description']) + ' ' + ' '.join(sample['search_results']['search_context'])
        question_tokens = count_tokens(sample['question'])
        context_tokens = count_tokens(context)
        answer_tokens = count_tokens(sample['answer']['value']) if 'answer' in sample and 'value' in sample['answer'] else 0
        
        total_tokens += context_tokens + question_tokens + answer_tokens
    
    average_tokens = total_tokens / num_samples if num_samples else 0
    
    stats['split'].append(split)
    stats['num_samples'].append(num_samples)
    stats['total_tokens'].append(total_tokens)
    stats['average_tokens_per_sample'].append(average_tokens)

stats_df = pd.DataFrame(stats)
print(stats_df)


KeyError: 'context'

In [None]:
from datasets import load_dataset

dataset = load_dataset("facebook/mlqa",'mlqa.en.en')

Downloading data:   0%|          | 0.00/63.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/80069 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9927 [00:00<?, ? examples/s]

In [12]:
print(dataset['train'][0])

{'context': 'Architektonisch, die Schule hat einen katholischen Charakter. Die goldene Kuppel des Main Building ist eine goldene Statue der Jungfrau Maria. Direkt vor dem Haupt-Gebäude und vor der Tür, ist eine Kupfer-Statue von Christus mit Armen erhobenen mit der Legende Venite ad Me Omnes. Neben dem Main Building befindet sich die Basilika des Heiligen Herzens. Direkt hinter der Basilika befindet sich die Grotte, ein Marian-Ort des Gebets und der Reflexion. Es ist eine Replik der Grotte in Lourdes, Frankreich, wo die Jungfrau Maria angeblich 1858. zu Saint Bernadette Soubirous erschienen ist. Am Ende der Haupt-Fahrt (und in einer direkten Linie, die durch 3 Statuen und die Gold Kuppel verbindet), ist eine einfache, moderne Stein Statue von Mary.', 'question': 'Wem ist die Jungfrau Maria angeblich 1858 in Lourdes Frankreich erschienen?', 'answers': {'answer_start': [547], 'text': ['Saint Bernadette Soubirous']}, 'id': '5733be284776f41900661182'}


In [13]:
from transformers import AutoTokenizer
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', trust_remote_code=True)

def count_tokens(text):
    return len(tokenizer.tokenize(text))

stats = {
    'split': [],
    'num_samples': [],
    'total_tokens': [],
    'average_tokens_per_sample': []
}

for split in dataset.keys():
    num_samples = len(dataset[split])
    total_tokens = sum(
        count_tokens(sample['context']) + count_tokens(sample['question']) + count_tokens(sample['answers']['text'][0])
        for sample in dataset[split]
    )
    average_tokens = total_tokens / num_samples
    
    stats['split'].append(split)
    stats['num_samples'].append(num_samples)
    stats['total_tokens'].append(total_tokens)
    stats['average_tokens_per_sample'].append(average_tokens)

stats_df = pd.DataFrame(stats)
print(stats_df)

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


        split  num_samples  total_tokens  average_tokens_per_sample
0       train        80069      23827329                 297.584945
1  validation         9927       3023402                 304.563514


In [1]:
import pandas as pd
import subprocess

datasets = {
    "doc_check": "/data/share/project/smart_hospital/medical_dataset/doc_check/03_filtered_quality/dataset.jsonl",
    "grascco": "/data/share/project/smart_hospital/medical_dataset/grascco/03_filtered_quality/dataset.jsonl",
    "guidelines": "/data/share/project/smart_hospital/medical_dataset/guidelines/03_filtered_quality/dataset.jsonl",
    "kres": "/data/share/project/smart_hospital/medical_dataset/kres/03_filtered_quality/dataset.jsonl",
    "oscar_med_2040": "/data/share/project/smart_hospital/medical_dataset/oscar_med_2040/03_filtered_quality/dataset.jsonl",
    "oscar_med_2301": "/data/share/project/smart_hospital/medical_dataset/oscar_med_2301/03_filtered_quality/dataset.jsonl",
    "phd_theses": "/data/share/project/smart_hospital/medical_dataset/phd_theses/03_filtered_quality/dataset.jsonl",
    "phd_theses_sudo": "/data/share/project/smart_hospital/medical_dataset/phd_theses_sudo/03_filtered_quality/dataset.jsonl",
    "pubmed_abstracts": "/data/share/project/smart_hospital/medical_dataset/pubmed_abstracts/03_filtered_quality/dataset.jsonl",
    "springer_jsons": "/data/share/project/smart_hospital/medical_dataset/springer_jsons/03_filtered_quality/dataset.jsonl",
    "ufal_medizin": "/data/share/project/smart_hospital/medical_dataset/ufal_medizin/03_filtered_quality/dataset.jsonl",
    "wikipedia_medizin": "/data/share/project/smart_hospital/medical_dataset/wikipedia_medizin/03_filtered_quality/dataset.jsonl",
}

results = []

def run_wc(command, path):
    result = subprocess.run(command.split() + [path], capture_output=True, text=True)
    return int(result.stdout.split()[0])

for name, path in datasets.items():
    lines = run_wc('wc -l', path)
    words = run_wc('wc -w', path)
    characters = run_wc('wc -c', path)
    results.append({"Dataset": name, "Lines": lines, "Words": words, "Characters": characters})

df = pd.DataFrame(results)
print(df)


              Dataset    Lines      Words  Characters
0           doc_check    13110    8019547    71710420
1             grascco       62      33753      300219
2          guidelines      789    8919942    85503512
3                kres        3         60        2121
4      oscar_med_2040  1120689  580976204  5279432594
5      oscar_med_2301     1871     513132     4895319
6          phd_theses     6567   97073431   750519010
7     phd_theses_sudo     6567   93827818   758790213
8    pubmed_abstracts     4194     771699     8782598
9      springer_jsons    13963   44733736   366504420
10       ufal_medizin  1928645   86965612  1667292240
11  wikipedia_medizin    70482   41870648   382511745


In [None]:
"squad": {
    "file_name": "squad_demo.json",
    "formatting": "alpaca",
    "ranking": false,
    "columns": {
      "prompt": "question",
      "response": "answers",
      "history": "context"
    }
  },