## Sample Negative URLs Stratified by Domain

In [1]:
import os
import sys

# Make the parent directory importable so modules located there can be loaded
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from datasets import load_from_disk, Dataset, ClassLabel, Value, Features, concatenate_datasets, DatasetDict
from datasets import concatenate_datasets
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np
import torch
from collections import Counter
import random
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import logging
from transformers import logging as transformers_logging

# Raise the transformers log level to ERROR so that its warnings are not printed
transformers_logging.set_verbosity_error()


**Load Examples:**

In [4]:
# Topic whose processed dataset is used throughout; alternatives noted by the author: "energie", "kinder"
topic = "cannabis"

In [5]:
# Load the already split dataset of the chosen topic and continue with its training part.
# (An unsplit variant was previously loaded from ../../data/tmp/processed_dataset_{topic}_buffed.)
dataset_split = load_from_disk(
    f"../../data/tmp/processed_dataset_{topic}_buffed_split"
)
dataset = dataset_split["train"]

# Show the schema and size, followed by one example record
print(dataset)
print(dataset[1])

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
    num_rows: 3858
})
{'_id': '64a0946b749484eec84dbbf1', 'batch_id': 16, 'domain': 't-online.de', 'view_url': 'email.t-online.de/em', 'lang': 'de', 'text': 'Wetter DAX Telefonverzeichnisse Lotto Telekom Services Telekom Hilfe & Service Frag Magenta Kundencenter Freemail MagentaCloud Tarife & Produkte PUR-Abo Login Suchen E-Mail Login Politik Deutschland Ausland Corona-Krise Tagesanbruch Ukraine Regional Berlin Hamburg München Köln Frankfurt Alle Städte Sport Bundesliga 2. Bundesliga Zweikampf der Woche Fußball Champions League FC Bayern Newsticker Formel 1 Was macht …? Mehr Sport Liveticker Ergebnisse Anzeigen Sportwetten Wirtschaft & Finanzen Aktuelles Börse Immobilien Die Anleger Ratgeber Versicherungen Publikumspreis Anzeigen Immobilien-Teilverkauf Ver

In [6]:
# Tally how many training examples carry each label
label_counts = Counter(dataset['label'])
print("Class frequencies:", label_counts)

# Size of the rarest class
min_count = min(label_counts[label] for label in label_counts)
print("Minimum class frequency:", min_count)

Class frequencies: Counter({0: 3653, 1: 205})
Minimum class frequency: 205


### Stratified Sampling

In [7]:
# Separate the training data by class: label 1 -> positive examples, label 0 -> negative examples
dataset_positive_examples = dataset.filter(lambda row: row['label'] == 1)
dataset_negative_examples = dataset.filter(lambda row: row['label'] == 0)

In [8]:
# Count the unique values of the domain column among the negative examples
print("Number of distinct domains", len(set(dataset_negative_examples["domain"])))

Number of distinct domains 758


In [9]:
N = 128  # number of most frequent domains to keep

# Count how often each domain occurs among the negatives and keep the N most common ones
domain_counts = Counter(dataset_negative_examples['domain'])
top_domains = {name for name, _ in domain_counts.most_common(N)}

In [10]:
# Show the domains that were kept (a set, so the printed order is arbitrary)
print(top_domains)

{'hkk.de', 'bz-berlin.de', 'daserste.de', 'sn-online.de', 'guildwars2.com', 'sueddeutsche.de', 'computerbild.de', 'haller-kreisblatt.de', 'brigitte.de', 'digitalfernsehen.de', 'hallo-muenchen.de', 'barmer.de', 'fnp.de', 'infranken.de', 'bundesgesundheitsministerium.de', 'ard-text.de', 'bbv-net.de', 'vice.com', 'hitchecker.de', 'quoka.de', 'happy-size.de', 'ardmediathek.de', 'taz.de', 'change.org', 'tk.de', 'mz.de', '24hamburg.de', 'manager-magazin.de', 'snagtights.de', 'arcamax.com', 'gutefrage.net', 'ndr.de', 'derwesten.de', 'faz.net', 'fluter.de', 'saechsische.de', 'rga.de', 'merkur.de', 'presseportal.de', 'bzga.de', 'mainpost.de', 'tag24.de', 'wdr.de', 'tz.de', 'welt.de', 'web.de', 'gegen-hartz.de', 'spiegel.de', 'msn.com', 'berliner-kurier.de', 'waz.de', 'tagesschau.de', 'seeandso.com', 'nn.de', 'freenet.de', 'n-tv.de', 'neuepresse.de', 'radio.de', 'lausitznews.de', 'suedkurier.de', 'bundesregierung.de', 'mdr.de', 'berliner-zeitung.de', 'focus.de', 'nrz.de', 'lvz.de', 'rp-online.de

In [11]:
def mark_other_domains(example):
    """Return `example` with its 'domain' set to 'other' unless it is in `top_domains`.

    The example dict is modified in place and returned.
    """
    if example['domain'] in top_domains:
        return example
    example['domain'] = 'other'
    return example

# Run mark_other_domains over every negative example and keep the transformed dataset
dataset_negative_examples = dataset_negative_examples.map(mark_other_domains)

In [12]:
# Work on a pandas copy so scikit-learn can stratify on the domain column
df_dataset = dataset_negative_examples.to_pandas()
sample_size = min_count
print("Sample size:", sample_size)

# Draw `sample_size` rows stratified by domain; the remaining rows are kept as the discarded part
discarded_sample, stratified_sample = train_test_split(
    df_dataset,
    test_size=sample_size,
    stratify=df_dataset['domain'],
    random_state=42,
)
discarded_sample = discarded_sample.reset_index(drop=True)
stratified_sample = stratified_sample.reset_index(drop=True)

Sample size: 205


In [13]:
# Take over the schema of the positive examples for the converted datasets
new_features = dataset_positive_examples.features.copy()

# Turn both pandas frames back into Hugging Face datasets and cast them to that schema
stratified_dataset = Dataset.from_pandas(stratified_sample).cast(new_features)
discarded_dataset = Dataset.from_pandas(discarded_sample).cast(new_features)
stratified_dataset

Casting the dataset: 100%|██████████| 205/205 [00:00<00:00, 15540.36 examples/s]
Casting the dataset: 100%|██████████| 3448/3448 [00:00<00:00, 61130.03 examples/s]


Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
    num_rows: 205
})

In [14]:
# Preview the first rows of the sampled negatives
stratified_sample.head()

Unnamed: 0,_id,batch_id,domain,view_url,lang,text,text_length,word_count,topic,category,good_for_training,good_for_augmentation,annotation_type,is_topic,label,token_count
0,64a0946d749484eec854d7f7,16,t-online.de,www.t-online.de/nachrichten/deutschland/innenp...,de,Wetter DAX Telefonverzeichnisse Lotto Telekom ...,10515,1387,cannabis,news,True,True,04.urls-with-title,False,0,2213
1,64a0946f749484eec859a5ef,16,bild.de,www.bild.de/regional/thueringen/thueringen-akt...,de,Weiter zum Hauptinhalt ↵ BILD Logo TV-Stream ...,8408,1141,cannabis,news,True,True,04.urls-with-title,False,0,1751
2,64a0946c749484eec85274ac,16,other,magazin.kuechenfinder.com/baugleiche-geschirrs...,de,Küchenstudios Preisrechner Küchenformen Front...,14403,2022,cannabis,other,True,True,04.urls-with-title,False,0,3180
3,64a0946e749484eec8578dfa,16,zdf.de,www.zdf.de/filme,de,Zum Hauptinhalt springen Zur Suche springen Z...,16890,2420,cannabis,news,True,True,06.news-wo-title,False,0,3566
4,64a0946d749484eec8561c0b,16,tvnow.de,tvnow.de/shows/bella-italia-19832,de,RTL+ Musik Stöbern Einloggen Jetzt testen Übe...,5611,837,cannabis,other,True,True,04.urls-with-title,False,0,1249


In [15]:
# Combine the sampled negatives with all positive examples into one dataset
balanced_dataset = concatenate_datasets([stratified_dataset, dataset_positive_examples])

# Re-count the labels of the combined dataset
label_counts = Counter(balanced_dataset['label'])
print("Class frequencies:", label_counts)

Class frequencies: Counter({0: 205, 1: 205})


**Split into Train and Test Datasets**

In [16]:
# Report the overall size, then the size of each subset of interest
print("Number of all annotated samples:", balanced_dataset.num_rows)

# Examples with a positive label
dataset_pos = balanced_dataset.filter(lambda row: row['label'] > 0)
print("Number of positive annotated samples:", dataset_pos.num_rows)

# Examples whose category is "buff"
dataset_buff = balanced_dataset.filter(lambda row: row['category'] == "buff")
print("Number of manually annotated samples:", dataset_buff.num_rows)

# All remaining examples
dataset_not_buff = balanced_dataset.filter(lambda row: row['category'] != "buff")
print("Number of regular annotated samples:", dataset_not_buff.num_rows)

Number of all annotated samples: 410
Number of positive annotated samples: 205
Number of manually annotated samples: 150
Number of regular annotated samples: 260


In [17]:
def train_test_split_balanced(dataset: Dataset, n: int, label_column='label', random_state=None):
    """Split `dataset` into train and test parts with a class-balanced test set.

    The test set receives ``n // 2`` randomly chosen examples from every class found in
    ``label_column`` (an odd ``n`` is first rounded up to the next even number), so with
    two classes it contains ``n`` examples. All remaining examples form the training set,
    kept in their original row order.

    Args:
        dataset: Dataset to split.
        n: Requested test-set size; odd values are incremented by one.
        label_column: Name of the column holding the class labels.
        random_state: Optional seed. When given, sampling uses a private
            ``random.Random(random_state)``, so the result is reproducible and the
            global ``random`` state is not reseeded. When ``None``, the global
            generator is used.

    Returns:
        DatasetDict with the keys ``'train'`` and ``'test'``.

    Raises:
        ValueError: If the per-class sample size is negative or a class has fewer than
            ``n // 2`` examples.
    """
    # Round an odd request up so it divides evenly between the classes
    if n % 2 != 0:
        n += 1
    per_class = n // 2
    if per_class < 0:
        raise ValueError(f"n must be non-negative, got {n}.")

    # Seeded calls get their own generator; unseeded calls keep using the global one
    rng = random.Random(random_state) if random_state is not None else random

    # Read the label column once instead of materialising every full row per class
    class_indices = {}
    for index, label in enumerate(dataset[label_column]):
        class_indices.setdefault(label, []).append(index)

    # Ensure there are enough samples in each class
    for label, indices in class_indices.items():
        if len(indices) < per_class:
            raise ValueError(f"Not enough samples in class {label} to sample {n // 2} examples.")

    # Visit the classes in a stable order so a seeded run always draws the same rows
    try:
        ordered_labels = sorted(class_indices)
    except TypeError:
        # Labels of mixed, non-comparable types: fall back to first-appearance order
        ordered_labels = list(class_indices)

    test_indices = []
    for label in ordered_labels:
        test_indices.extend(rng.sample(class_indices[label], per_class))

    # Everything not drawn for the test set becomes training data
    held_out = set(test_indices)
    train_indices = [index for index in range(len(dataset)) if index not in held_out]

    return DatasetDict({
        'train': dataset.select(train_indices),
        'test': dataset.select(test_indices),
    })

In [18]:
# Use 10% of the balanced dataset (rounded) as the requested test-set size
test_size_int = round(0.1 * balanced_dataset.num_rows)
print("Test size:", test_size_int)

Test size: 41


In [19]:
# Draw a class-balanced test set from the non-"buff" examples only, then append the
# "buff" examples to the training part.
# A fixed seed makes the split reproducible across kernel restarts, matching the seeded
# stratified sampling of the negatives.
split_datasets = train_test_split_balanced(dataset_not_buff, n=test_size_int, label_column='label', random_state=42)
split_datasets['train'] = concatenate_datasets([split_datasets['train'], dataset_buff])
print("Size of training set:", len(split_datasets['train']))
print("Size of testing set:", len(split_datasets['test']))

Size of training set: 368
Size of testing set: 42


In [20]:
# Count the occurrences of each label in the test part of split_datasets
label_counts = Counter(split_datasets["test"]['label'])
print("Class frequencies:", label_counts)

Class frequencies: Counter({0: 21, 1: 21})


In [21]:
# Display the train/test DatasetDict produced by train_test_split_balanced
split_datasets

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
        num_rows: 368
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
        num_rows: 42
    })
})

**Save Splits:**

In [22]:
# Replace the "train" entry of dataset_split with the balanced dataset; no other entry is touched here.
# NOTE(review): split_datasets (the balanced train/test split computed earlier) is not stored
# anywhere in the visible cells — confirm that saving the full balanced set as "train" is intended.
dataset_split["train"] = balanced_dataset

In [23]:
# Write dataset_split to disk under the "_buffed_stratified" path of the current topic
dataset_split.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_stratified")

Saving the dataset (1/1 shards): 100%|██████████| 410/410 [00:00<00:00, 12795.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46/46 [00:00<00:00, 4456.97 examples/s]


In [24]:
# Add the negatives that were not sampled (discarded_dataset) as a "holdout" entry
# and save this variant under its own path
dataset_split["holdout"] = discarded_dataset
dataset_split.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_stratified_holdout")

Saving the dataset (1/1 shards): 100%|██████████| 410/410 [00:00<00:00, 16616.72 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46/46 [00:00<00:00, 4286.75 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3448/3448 [00:00<00:00, 44993.12 examples/s]


In [25]:
# Load the dataset stored under the "_buffed_extended_filtered" path of the current topic
dataset_extented = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_extended_filtered")

In [26]:
# Add the loaded dataset as an "extended" entry and save this variant under its own path
dataset_split["extended"] = dataset_extented
dataset_split.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_stratified_extended")

Saving the dataset (1/1 shards): 100%|██████████| 410/410 [00:00<00:00, 14444.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46/46 [00:00<00:00, 4255.73 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3448/3448 [00:00<00:00, 33631.06 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 44432/44432 [00:00<00:00, 75962.63 examples/s] 
