## Count Webpages per Split

In [1]:
import os
import sys

# Make the parent directory importable: resolve it to an absolute path and
# register it on sys.path only if it is not already listed there.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from datasets import load_from_disk, Dataset, ClassLabel, Value, Features, concatenate_datasets
from transformers import AutoTokenizer
import pandas as pd 
import numpy as np
import torch
from collections import Counter
import random
from tabulate import tabulate

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Notebook configuration. These three values are interpolated into the
# dataset directory name that the cells below load from disk.

# Topics to report on; one dataset is loaded per entry.
TOPICS = [
    "energie",
    "kinder",
    "cannabis",
]

# Sampling variant of the dataset.
# Options: "random", "stratified", "clustered", "shared_domain"
SAMPLING = "random"

# Suffix of the dataset directory.
# Options: "", "_holdout", "_extended"
SUFFIX = "_extended"

In [4]:
# Collect the number of rows in every split, one record per topic.
results = []

for topic in TOPICS:
    dataset = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_{SAMPLING}{SUFFIX}")

    # Fold the test split into holdout so that "Holdout" reports the combined
    # size. The merge is guarded: the counts below treat every split as
    # optional, so a dataset lacking "holdout" or "test" must not raise a
    # KeyError here before those fallbacks are reached.
    if "holdout" in dataset and "test" in dataset:
        dataset["holdout"] = concatenate_datasets([dataset["holdout"], dataset["test"]])

    # Row count per split; a split that is absent from the dataset counts as 0.
    counts = {
        "Topic": topic,
        "Train": dataset['train'].num_rows if 'train' in dataset else 0,
        "Test": dataset['test'].num_rows if 'test' in dataset else 0,
        "Holdout": dataset['holdout'].num_rows if 'holdout' in dataset else 0,
        "Extended": dataset['extended'].num_rows if 'extended' in dataset else 0
    }

    # Append the counts to the results list
    results.append(counts)

In [5]:
# Turn the per-topic records into a DataFrame and render it as an HTML table
# in the cell output.
df_counts = pd.DataFrame(results)
table_html = tabulate(
    df_counts,
    headers="keys",
    tablefmt="html",
)
display(table_html)

Unnamed: 0,Topic,Train,Test,Holdout,Extended
0,energie,408,46,4210,45925
1,kinder,384,44,3766,53253
2,cannabis,410,46,3494,44432


In [9]:
# Count, for every topic and split, how many rows carry label 1 ("related")
# and how many carry label 0 ("unrelated"). One record per topic.
results = []

for topic in TOPICS:
    # Load the dataset for this topic
    dataset_path = f"../../data/tmp/processed_dataset_{topic}_buffed_{SAMPLING}{SUFFIX}"
    dataset = load_from_disk(dataset_path)

    # Fold the test split into holdout (only when both splits exist)
    if 'holdout' in dataset and 'test' in dataset:
        dataset["holdout"] = concatenate_datasets([dataset["holdout"], dataset["test"]])

    counts = {"Topic": topic}

    for split in ['train', 'test', 'holdout', 'extended']:
        if split in dataset:
            # Materialise only the 'label' column as a pandas Series instead of
            # converting the whole split (all columns) with to_pandas(); the
            # other columns are never used for this tally.
            labels = dataset[split].with_format("pandas")['label']
            label_counts = labels.value_counts().to_dict()
        else:
            # A missing split contributes zero to both counts.
            label_counts = {}

        counts[f'{split}_related'] = label_counts.get(1, 0)
        counts[f'{split}_unrelated'] = label_counts.get(0, 0)

    # Append the counts to the results list
    results.append(counts)

In [10]:
# Print the HTML table
df_counts = pd.DataFrame(results)
table_html = tabulate(df_counts, headers='keys', tablefmt='html')
display(table_html)

Unnamed: 0,Topic,train_related,train_unrelated,test_related,test_unrelated,holdout_related,holdout_unrelated,extended_related,extended_unrelated
0,energie,204,204,23,23,23,4187,31,45894
1,kinder,192,192,22,22,22,3744,45,53208
2,cannabis,205,205,23,23,23,3471,29,44403
