## Sample Negative Examples from Domains Shared with Positive Examples

In [1]:
import os
import sys

# Put the parent directory on sys.path so that its modules can be imported here
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from datasets import load_from_disk, Dataset, ClassLabel, Value, Features, concatenate_datasets, DatasetDict
from transformers import AutoTokenizer
from datasets import concatenate_datasets
import pandas as pd 
import numpy as np
import torch
from collections import Counter
import random
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import logging
from transformers import logging as transformers_logging

# Raise the transformers log threshold to ERROR so that warning-level
# messages from the library are not shown in the cell outputs
transformers_logging.set_verbosity_error()

**Load Examples:**

In [4]:
topic = "kinder"  # topic key interpolated into the dataset paths; other values noted by the author: "energie", "cannabis"

In [5]:
# Load the pre-split dataset for the selected topic and work on its "train" split.
# (An earlier, commented-out variant loaded "processed_dataset_{topic}_buffed" instead.)
split_path = f"../../data/tmp/processed_dataset_{topic}_buffed_split"
dataset_split = load_from_disk(split_path)
dataset = dataset_split["train"]

# Show the schema/size and one example row
print(dataset)
print(dataset[1])

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
    num_rows: 4106
})
{'_id': '648c2ad88e8cadbd29004e1d', 'batch_id': 15, 'domain': 't-online.de', 'view_url': 'email.t-online.de/em', 'lang': 'de', 'text': 'Wetter DAX Telefonverzeichnisse Lotto Telekom Services Telekom Hilfe & Service Frag Magenta Kundencenter Freemail MagentaCloud Tarife & Produkte PUR-Abo Login Suchen E-Mail Login Politik Deutschland Ausland Corona-Krise Tagesanbruch Ukraine Regional Berlin Hamburg München Köln Frankfurt Alle Städte Sport Bundesliga 2. Bundesliga Zweikampf der Woche Fußball Champions League FC Bayern Newsticker Formel 1 Was macht …? Special Olympics Mehr Sport Liveticker Ergebnisse Anzeigen Sportwetten Wirtschaft & Finanzen Aktuelles Börse Immobilien Die Anleger Ratgeber Versicherungen Publikumspreis Anzeigen Immobilie

In [6]:
# Tally how often each label value occurs in the training split
label_counts = Counter()
label_counts.update(dataset['label'])
print("Class frequencies:", label_counts)

# Size of the rarest class; reused further down as the number of negatives to sample
min_count = min(label_counts.values())
print("Minimum class frequency:", min_count)

Class frequencies: Counter({0: 3914, 1: 192})
Minimum class frequency: 192


## Find Domains That Occur in Both Classes

In [7]:
# Partition the training split into one Dataset per label value (0 and 1)
dataset_negative_examples = dataset.filter(lambda row: row['label'] == 0)
dataset_positive_examples = dataset.filter(lambda row: row['label'] == 1)

In [8]:
# Reuse the per-class Datasets created by the filter cell above instead of
# iterating the full dataset twice and copying every row (including its text)
# into Python lists. The names are kept; they now refer to Datasets, which
# still support len() and iteration over row dicts.
positive_samples = dataset_positive_examples
negative_samples = dataset_negative_examples

# Distinct domains per class, read directly from the 'domain' column.
# NOTE: the *_urls names are kept because later cells use them, but the
# values are domains, not full URLs.
positive_urls = set(positive_samples['domain'])
negative_urls = set(negative_samples['domain'])

# Domains that appear with both labels
common_urls_count = len(positive_urls & negative_urls)

print("Domains in positive class:", len(positive_urls))
print("Domains in negative class:", len(negative_urls))
print("Domains in both classes:", common_urls_count)

Domains in positive class: 89
Domains in negative class: 838
Domains in both classes: 32


In [9]:
# "Hard" negatives come from a domain that also has positive examples;
# "easy" negatives come from domains with no positive example at all.
hard_negative_samples = dataset_negative_examples.filter(lambda row: row['domain'] in positive_urls)
easy_negative_samples = dataset_negative_examples.filter(lambda row: row['domain'] not in positive_urls)

print("Hard negative samples:", hard_negative_samples.num_rows)
print("Easy negative samples:", easy_negative_samples.num_rows)

Hard negative samples: 875
Easy negative samples: 3039


In [10]:
# Per-domain tally of the hard negative samples
hard_negative_counts = Counter(domain for domain in hard_negative_samples['domain'])
print("Hard negative class frequencies:", hard_negative_counts)

Hard negative class frequencies: Counter({'t-online.de': 389, 'wikipedia.org': 184, 'zdf.de': 46, 'google.com': 35, 'arbeitsagentur.de': 29, 'tagesschau.de': 27, 'zeit.de': 21, 'sueddeutsche.de': 18, 'faz.net': 17, 'mainpost.de': 15, 'merkur.de': 14, 'derwesten.de': 12, 'rp-online.de': 10, 'taz.de': 8, 'mdr.de': 6, 'fr.de': 4, 'tagesspiegel.de': 4, 'deutschlandfunkkultur.de': 4, 'bmfsfj.de': 4, 'augsburger-allgemeine.de': 4, 'vdk.de': 4, 'haufe.de': 3, 'bundestag.de': 3, 'rnd.de': 3, 'cdu.de': 2, 'finanztip.de': 2, 'gruene.de': 2, 'ifo.de': 1, 'kinder-grund-sicherung.de': 1, 'paritaet-bw.de': 1, 'savethechildren.de': 1, 'der-paritaetische.de': 1})


In [11]:
# # Count the occurrences of urls in the hard negative samples
# hard_negative_counts = Counter(hard_negative_samples['view_url'])
# print("Hard negative class frequencies:", hard_negative_counts)

### Sample Among Shared Domains

In [12]:
def sample_randomly(dataset: "Dataset", n: int, random_seed: int = 42) -> "tuple[Dataset, Dataset]":
    """Randomly split ``dataset`` into a sample of ``n`` rows and the remainder.

    The dataset is shuffled with ``random_seed``; the first ``n`` rows of the
    shuffled order form the sample and all following rows form the remainder,
    so the two parts are disjoint and together cover the whole dataset.

    Args:
        dataset: Object providing ``shuffle(seed=...)``, ``select(indices)``
            and ``len()``.
        n: Number of rows to sample. Values larger than the dataset size are
            clamped to the dataset size.
        random_seed: Seed passed to ``shuffle`` for reproducibility.

    Returns:
        A ``(sampled_dataset, discarded_dataset)`` tuple.

    Raises:
        ValueError: If ``n`` is negative. A negative ``n`` would make the
            remainder range start at a negative index.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Shuffle once with a fixed seed, then cut the shuffled order at position n
    shuffled = dataset.shuffle(seed=random_seed)
    total = len(shuffled)
    n = min(n, total)

    sampled_dataset = shuffled.select(range(n))
    discarded_dataset = shuffled.select(range(n, total))

    return sampled_dataset, discarded_dataset

In [13]:
# Draw as many hard negatives as the rarest class has examples (fixed seed);
# the hard negatives that were not drawn are returned separately.
sampled_dataset, discarded_dataset = sample_randomly(
    hard_negative_samples,
    min_count,
    random_seed=42,
)

print(f"Sampled dataset size: {len(sampled_dataset)}")
print(f"Discarded dataset size: {len(discarded_dataset)}")

Sampled dataset size: 192
Discarded dataset size: 683


In [14]:
# Everything that is not used for training: the hard negatives that were not
# sampled plus all easy negatives.
# NOTE(review): discarded_dataset is reassigned from itself, so re-running this
# cell without re-running the sampling cell appends easy_negative_samples again.
discarded_dataset = concatenate_datasets([discarded_dataset, easy_negative_samples])
# Balanced set: the sampled hard negatives together with all positive examples.
balanced_dataset = concatenate_datasets([sampled_dataset, dataset_positive_examples])

In [15]:
# Re-count labels on the balanced set to confirm both classes have equal size
label_counts = Counter()
label_counts.update(balanced_dataset['label'])
print("Class frequencies:", label_counts)

Class frequencies: Counter({0: 192, 1: 192})


**Split into Train and Test Datasets**

In [16]:
# Size of the balanced set and of the subsets used for the split below
print("Number of all annotated samples:", balanced_dataset.num_rows)

# Rows with a positive label
dataset_pos = balanced_dataset.filter(lambda row: row['label'] > 0)
print("Number of positive annotated samples:", dataset_pos.num_rows)

# Rows whose category is "buff"
dataset_buff = balanced_dataset.filter(lambda row: row['category'] == "buff")
print("Number of manually annotated samples:", dataset_buff.num_rows)

# All remaining rows (category other than "buff")
dataset_not_buff = balanced_dataset.filter(lambda row: row['category'] != "buff")
print("Number of regular annotated samples:", dataset_not_buff.num_rows)

Number of all annotated samples: 384
Number of positive annotated samples: 192
Number of manually annotated samples: 76
Number of regular annotated samples: 308


In [17]:
def train_test_split_balanced(dataset: "Dataset", n: int, label_column='label', random_state=None) -> "DatasetDict":
    """Split ``dataset`` into train/test parts with a class-balanced test set.

    ``n // 2`` rows are drawn at random from every distinct value found in
    ``label_column`` and form the test split; all other rows form the train
    split, kept in their original order.

    Args:
        dataset: Object supporting ``dataset[label_column]`` (column access),
            ``len()`` and ``select(indices)``.
        n: Requested test size. An odd value is rounded up to the next even
            number so it divides evenly by two.
        label_column: Name of the column holding the class labels.
        random_state: Optional seed. When given, a private random generator is
            used, so the global ``random`` module state is left untouched while
            the drawn indices match those of a freshly seeded global generator.
            When ``None``, the global ``random`` module is used as-is.

    Returns:
        ``DatasetDict`` with the keys ``'train'`` and ``'test'``.

    Raises:
        ValueError: If ``n`` is negative after rounding, or if any class has
            fewer than ``n // 2`` rows.
    """
    # If n is odd, increment by 1 to make it even
    if n % 2 != 0:
        n += 1
    per_class = n // 2
    if per_class < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Private generator when seeded: reproducible without reseeding global state
    rng = random.Random(random_state) if random_state is not None else random

    # Read the label column once and group row indices by label, instead of
    # iterating (and decoding) every full row once per class.
    labels = list(dataset[label_column])
    class_indices = {label: [] for label in set(labels)}
    for index, label in enumerate(labels):
        class_indices[label].append(index)

    # Ensure there are enough samples in each class
    for label, indices in class_indices.items():
        if len(indices) < per_class:
            raise ValueError(f"Not enough samples in class {label} to sample {per_class} examples.")

    # Randomly draw the test indices class by class
    test_indices = []
    for indices in class_indices.values():
        test_indices.extend(rng.sample(indices, per_class))

    # Train rows are all remaining rows, in their original order
    held_out = set(test_indices)
    train_indices = [index for index in range(len(dataset)) if index not in held_out]

    train_dataset = dataset.select(train_indices)
    test_dataset = dataset.select(test_indices)

    return DatasetDict({'train': train_dataset, 'test': test_dataset})

In [18]:
# Reserve a tenth of the balanced data (rounded) for the test split
test_fraction = 0.1
test_size_int = round(test_fraction * len(balanced_dataset))
print("Test size:", test_size_int)

Test size: 38


In [19]:
# Draw a class-balanced test set from the non-"buff" samples only. The seed is
# fixed so that the split is identical after a kernel restart and full re-run.
split_datasets = train_test_split_balanced(dataset_not_buff, n=test_size_int, label_column='label', random_state=42)
# All "buff" samples are added to the training split, never to the test split
split_datasets['train'] = concatenate_datasets([split_datasets['train'], dataset_buff])
print("Size of training set:", len(split_datasets['train']))
print("Size of testing set:", len(split_datasets['test']))

Size of training set: 346
Size of testing set: 38


In [20]:
# Label tally of the test split, to check that it is class-balanced
label_counts = Counter()
label_counts.update(split_datasets["test"]['label'])
print("Class frequencies:", label_counts)

Class frequencies: Counter({0: 19, 1: 19})


In [21]:
# Display the train/test DatasetDict (features and row counts per split)
split_datasets 

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
        num_rows: 346
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count'],
        num_rows: 38
    })
})

**Save Splits:**

In [22]:
# Replace the loaded "train" split with the balanced subset; the other splits
# already present in dataset_split are left unchanged.
# NOTE(review): split_datasets computed above is not referenced in the visible
# cells after this point — confirm that saving balanced_dataset is intended.
dataset_split["train"] = balanced_dataset

In [23]:
# Write the modified DatasetDict to a new directory for this topic
dataset_split.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_shared_domain")

Saving the dataset (1/1 shards): 100%|██████████| 384/384 [00:00<00:00, 15175.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 44/44 [00:00<00:00, 5235.44 examples/s]


In [24]:
# Add the discarded negatives as an extra "holdout" split and save the result
# under a separate path. dataset_split is modified in place, so this save also
# contains the splits it already held.
dataset_split["holdout"] = discarded_dataset
dataset_split.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_shared_domain_holdout")

Saving the dataset (1/1 shards): 100%|██████████| 384/384 [00:00<00:00, 13533.08 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 44/44 [00:00<00:00, 5288.55 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3722/3722 [00:00<00:00, 24906.31 examples/s]


In [25]:
# Load a further dataset for the same topic (the "extended_filtered" variant, per its path).
# NOTE(review): the variable name is misspelled ("extented"); it is kept because
# the following cell refers to it.
dataset_extented = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_extended_filtered")

In [26]:
# Attach the loaded dataset as an "extended" split and save under a third path.
# dataset_split still holds the splits assigned earlier, so they are saved too.
dataset_split["extended"] = dataset_extented
dataset_split.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_buffed_shared_domain_extended")

Saving the dataset (1/1 shards): 100%|██████████| 384/384 [00:00<00:00, 10056.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 44/44 [00:00<00:00, 4121.61 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3722/3722 [00:00<00:00, 24931.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 53253/53253 [00:00<00:00, 88249.80 examples/s] 
