## Combine Dataset with Training Data Buff

In [1]:
import os
import sys

# Needed to import modules from parent directory
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from collections import Counter
from datasets import load_from_disk, Dataset, ClassLabel, Value, Features, load_dataset
from transformers import AutoTokenizer
import numpy as np
import pandas as pd
import random
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import logging
from transformers import logging as transformers_logging

# Set the logging level to error for transformers, which will suppress warnings
transformers_logging.set_verbosity_error()


**Load Examples:**

In [4]:
topic = "cannabis" #"energie" #"kinder" "cannabis"

In [5]:
columns = ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation','annotation_type', 'is_topic']

In [6]:
def select_columns(dataset: Dataset, columns_to_keep: list) -> Dataset:
    """Returns a new dataset containing only the specified columns."""
    # Directly compute columns to remove and apply removal
    return dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])


In [7]:
dataset_orig = load_from_disk(f"../../data/tmp/processed_dataset_{topic}")
dataset_orig = dataset_orig.select_columns(columns)

print(dataset_orig)
print(dataset_orig[1])

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic'],
    num_rows: 49640
})
{'_id': '64a0946b749484eec84dbcd0', 'batch_id': 16, 'domain': 'bilendi.com', 'view_url': "surveyd.bilendi.com/survey/selfserve/53b/2306124#!'", 'lang': 'de', 'text': " Die URL oben enthält nicht die für diese Umfrage erforderlichen Informationen. Die korrekte URL finden Sie in Ihrer Einladungs-E-Mail. Wenn die Probleme weiterhin auftreten, wenden Sie sich bitte an die dort angegebene Person. ERROR: SE-02 Variable list has invalid value '' ", 'text_length': 276, 'word_count': 43, 'topic': 'cannabis', 'category': 'other', 'good_for_training': 'False', 'good_for_augmentation': 'True', 'annotation_type': 'domain_discarded', 'is_topic': False}


In [8]:
dataset_buff = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buff")
dataset_buff

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topics', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic'],
    num_rows: 150
})

In [9]:
dataset_buff = load_from_disk(f"../../data/tmp/processed_dataset_{topic}_buff")
dataset_buff = dataset_buff.select_columns(columns)
print(dataset_buff)
print(dataset_buff[0])

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic'],
    num_rows: 150
})
{'_id': 'dummy_id_0', 'batch_id': 99999, 'domain': 'www.wa.de', 'view_url': 'https://www.wa.de/nordrhein-westfalen/cannabis-legalisierung-nrw-hanf-gras-ibbenbueren-entkriminalisierung-anbau-drogen-polizei-kritik-92064527.html', 'lang': 'de', 'text': 'Legalisierung von Cannabis: Hanf aus NRW könnte Problem lösen Hamm NRW Lokales Politik Sport Stellenmarkt ePaper Kategorien Hamm NRW Münster Essen Lokales Werne Bönen Bergkamen Drensteinfurt Welver Sport Hamm Bönen Drensteinfurt Fußball WA-Tipp Tabellen Bundesliga-Tippspiel Verbraucher Rückrufe & Warnungen Testberichte Supermärkte & Discounter Abo Print-Angebote Digital-Angebote Kombi-Angebote Abo-Services Wirtschaft Politik Karl Lauterbach Fotos & Videos Kultur Veranstaltungen Leben Auto Reise Karriere Gesundhei

**Merge Datasets:**

In [10]:
from datasets import concatenate_datasets

In [11]:
dataset = concatenate_datasets([dataset_orig, dataset_buff])

In [12]:
dataset[0]

{'_id': '64a0946b749484eec84dbcc0',
 'batch_id': 16,
 'domain': 'bilendi.com',
 'view_url': 'surveyd.bilendi.com/survey/selfserve/53b/2306124',
 'lang': 'de',
 'text': " Die URL oben enthält nicht die für diese Umfrage erforderlichen Informationen. Die korrekte URL finden Sie in Ihrer Einladungs-E-Mail. Wenn die Probleme weiterhin auftreten, wenden Sie sich bitte an die dort angegebene Person. ERROR: SE-02 Variable list has invalid value '' ",
 'text_length': 276,
 'word_count': 43,
 'topic': 'cannabis',
 'category': 'other',
 'good_for_training': 'False',
 'good_for_augmentation': 'True',
 'annotation_type': 'domain_discarded',
 'is_topic': False}

In [13]:
# Convert boolean labels to integers (True to 1, False to 0)
dataset = dataset.map(lambda example: {'label': int(example['is_topic'])})

# Define a ClassLabel feature for the converted integer labels
class_label_feature = ClassLabel(num_classes=2, names=['False', 'True'])

# Update the features of the dataset
new_features = dataset.features.copy()
new_features['label'] = class_label_feature

# Cast the dataset to the new features
dataset = dataset.cast(new_features)

# Verify the dataset
print(dataset)

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label'],
    num_rows: 49975
})


In [14]:
# Access the ClassLabel feature for the 'label' column
class_label_feature = dataset.features['label']

# Get the names (string representations)
label_names = class_label_feature.names
print("Label Names:", label_names)

# Create a mapping from integer indices to names
label_mapping = {index: class_label_feature.int2str(index) for index in range(class_label_feature.num_classes)}
print("Label Mapping:", label_mapping)

# To access the name for a specific label index
label_index = 0  # Example index
label_name = class_label_feature.int2str(label_index)
print(f"Label for index {label_index}:", label_name)


Label Names: ['False', 'True']
Label Mapping: {0: 'False', 1: 'True'}
Label for index 0: False


In [15]:
from urllib.parse import urlparse

def extract_domain_from_view_url(example):
    
    # Check if 'view_url' is empty
    if not example['view_url']:
        # Set 'view_url' to the value of 'domain'
        example['view_url'] = example['domain']
        
    # Ensure the URL has a protocol for urlparse to work correctly
    view_url = example['view_url']
    if not urlparse(view_url).scheme:
        view_url = "http://" + view_url  # Prepend with a default protocol
    
    # Extract domain using urlparse on the modified URL
    parsed_url = urlparse(view_url)
    
    # Sometimes the domain might be in 'path' if 'netloc' is empty (missing protocol)
    domain = parsed_url.netloc if parsed_url.netloc else parsed_url.path
    
    # Splitting by "/" in case the URL without protocol directly starts with domain name
    domain = domain.split('/')[0]
    
    # Update the 'domain' field
    example['domain'] = domain
    return example

# Apply the function to each example in the dataset
updated_dataset = dataset.map(extract_domain_from_view_url)

In [16]:
dataset[0]

{'_id': '64a0946b749484eec84dbcc0',
 'batch_id': 16,
 'domain': 'bilendi.com',
 'view_url': 'surveyd.bilendi.com/survey/selfserve/53b/2306124',
 'lang': 'de',
 'text': " Die URL oben enthält nicht die für diese Umfrage erforderlichen Informationen. Die korrekte URL finden Sie in Ihrer Einladungs-E-Mail. Wenn die Probleme weiterhin auftreten, wenden Sie sich bitte an die dort angegebene Person. ERROR: SE-02 Variable list has invalid value '' ",
 'text_length': 276,
 'word_count': 43,
 'topic': 'cannabis',
 'category': 'other',
 'good_for_training': 'False',
 'good_for_augmentation': 'True',
 'annotation_type': 'domain_discarded',
 'is_topic': False,
 'label': 0}

In [17]:
len(dataset)

49975

In [18]:
# Count exaples with label = 1 
label_count = Counter(dataset_orig['is_topic'])
print("Label Count orig:", label_count)

label_count = Counter(dataset_buff['is_topic'])
print("Label Count buff:", label_count)

label_count = Counter(dataset['label'])
print("Label Count combined:", label_count)

Label Count orig: Counter({False: 49535, True: 105})
Label Count buff: Counter({True: 150})
Label Count combined: Counter({0: 49714, 1: 261})


## Save to Disk

In [19]:
file_path = f"../../data/tmp/processed_dataset_{topic}_buffed"
dataset.save_to_disk(file_path)
#dataset.to_json(file_path)

Saving the dataset (1/1 shards): 100%|██████████| 49975/49975 [00:00<00:00, 56777.38 examples/s] 
