In [14]:
from datasets import load_dataset
import matplotlib.pyplot as plt
import os

dataset_path = os.path.abspath("active-projects/textbook-question-generation/data/multirc-v2")

dataset_files = {
    "train": os.path.join(dataset_path, "train_456-fixedlds.json"),
    "valid": os.path.join(dataset_path, "dev_83-fixedlds.json"),
}

print("Dataset files:", dataset_files)  # Debugging

ds = load_dataset("json", data_files=dataset_files)

def get_sequence_lengths(ds):
    paragraph_lengths = [len(item["paragraph"]["text"].split()) for item in ds]
    question_lengths = [len(q["question"].split()) for item in ds for q in item["paragraph"]["questions"]]
    answer_lengths = [len(ans["text"].split()) for item in ds for q in item["paragraph"]["questions"] for ans in q["answers"]]
    return paragraph_lengths, question_lengths, answer_lengths

def plot_histogram(data, title, xlabel, bins=30):
    plt.figure(figsize=(10, 5))
    plt.hist(data, bins=bins, edgecolor="black")
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.title(title)
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

plot_histogram(dev_paragraphs, "Histogram of Paragraph Lengths (Dev)", "Number of Words")
plot_histogram(dev_questions, "Histogram of Question Lengths (Dev)", "Number of Words")
plot_histogram(dev_answers, "Histogram of Answer Lengths (Dev)", "Number of Words")

plot_histogram(train_paragraphs, "Histogram of Paragraph Lengths (Train)", "Number of Words")
plot_histogram(train_questions, "Histogram of Question Lengths (Train)", "Number of Words")
plot_histogram(train_answers, "Histogram of Answer Lengths (Train)", "Number of Words") 

Dataset files: {'train': '/home/jovyan/active-projects/textbook-question-generation/data/multirc-v2/active-projects/textbook-question-generation/data/multirc-v2/train_456-fixedlds.json', 'valid': '/home/jovyan/active-projects/textbook-question-generation/data/multirc-v2/active-projects/textbook-question-generation/data/multirc-v2/dev_83-fixedlds.json'}


FileNotFoundError: Unable to find '/home/jovyan/active-projects/textbook-question-generation/data/multirc-v2/active-projects/textbook-question-generation/data/multirc-v2/train_456-fixedlds.json'

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import os


dataset_path = os.path.abspath("active-projects/textbook-question-generation/data/multirc-v2")
dataset_files = {
    "train": os.path.join(dataset_path, "train_456-fixedlds.json"),
    "valid": os.path.join(dataset_path, "dev_83-fixedlds.json"),
}

print("Dataset files:", dataset_files)  # Debugging

ds = load_dataset("json", data_files=dataset_files)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def get_token_lengths(ds_split, tokenizer):
    paragraph_token_lengths = [
        len(tokenizer.tokenize(item["paragraph"]["text"]))
        for item in ds_split
    ]
    question_token_lengths = [
        len(tokenizer.tokenize(q["question"]))
        for item in ds_split
        for q in item["paragraph"]["questions"]
    ]
    answer_token_lengths = [
        len(tokenizer.tokenize(ans["text"]))
        for item in ds_split
        for q in item["paragraph"]["questions"]
        for ans in q["answers"]
    ]
    return paragraph_token_lengths, question_token_lengths, answer_token_lengths

def plot_histogram(data, title, xlabel, bins=30):
    plt.figure(figsize=(10, 5))
    plt.hist(data, bins=bins, edgecolor="black")
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.title(title)
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.show()

train_paragraph_tokens, train_question_tokens, train_answer_tokens = get_token_lengths(ds["train"], tokenizer)
dev_paragraph_tokens, dev_question_tokens, dev_answer_tokens = get_token_lengths(ds["valid"], tokenizer)

plot_histogram(dev_paragraph_tokens, "Histogram of Paragraph Token Lengths (Dev)", "Number of Tokens")
plot_histogram(dev_question_tokens, "Histogram of Question Token Lengths (Dev)", "Number of Tokens")
plot_histogram(dev_answer_tokens, "Histogram of Answer Token Lengths (Dev)", "Number of Tokens")

plot_histogram(train_paragraph_tokens, "Histogram of Paragraph Token Lengths (Train)", "Number of Tokens")
plot_histogram(train_question_tokens, "Histogram of Question Token Lengths (Train)", "Number of Tokens")
plot_histogram(train_answer_tokens, "Histogram of Answer Token Lengths (Train)", "Number of Tokens")


Dataset files: {'train': '/home/jovyan/active-projects/textbook-question-generation/src/active-projects/textbook-question-generation/data/multirc-v2/train_456-fixedlds.json', 'valid': '/home/jovyan/active-projects/textbook-question-generation/src/active-projects/textbook-question-generation/data/multirc-v2/dev_83-fixedlds.json'}
