## Add URLs as Chunks

In [1]:
from tqdm import tqdm
from bson import ObjectId
import pandas as pd 
import numpy as np
from datasets import Dataset, ClassLabel, Value, Features
from datasets import concatenate_datasets

  from .autonotebook import tqdm as notebook_tqdm


## Paramaters

In [2]:
SAMPLING = "random" # "random", "stratified", "clustered", "shared_domain"
SUFFIX = "_extended" #"", "_holdout", "_extended"
MAX_CONTENT_LENGTH = 384 # 496, 192
TOPIC = "cannabis" #"energie" #"kinder" "cannabis"

## Load URLs

URLs per batch and topic.

In [3]:
df_labels = pd.read_json('../../data/raw/pages_with_labels.json', orient='records', lines=True)
df_labels.head()

Unnamed: 0,p_id,url,used_at,duration,yt_video_id,package_version,enddevice,batch,Group,start_date,...,start_intervention,start_knowledge,topic,series,annotation_type,good_for_training,good_for_augmentation,category,is_direct_topic_annotated,is_direct_topic_full
0,273726366948,mingle.respondi.de/,2023-06-13 14:12:16,393,,1210041502.0,mobile,15,Search,2023-06-13 14:12:00,...,2023-06-13 14:18:35,2023-06-14 18:09:40,kinder,,domain_discarded,False,True,other,,False
1,273726366948,mingle.respondi.de/,2023-06-13 20:09:47,2,,1210041502.0,mobile,15,Search,2023-06-13 14:12:00,...,2023-06-13 14:18:35,2023-06-14 18:09:40,kinder,,domain_discarded,False,True,other,,False
2,273746614716,mingle.respondi.de/,2023-06-14 16:19:10,492,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False
3,273746614716,mingle.respondi.de/,2023-06-14 16:28:00,1,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False
4,273746614716,mingle.respondi.de/,2023-06-14 16:36:39,13,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False


In [4]:
df_labels["is_topic"] = df_labels["is_direct_topic_full"].map({'True': True, 'False': False})

In [5]:
df_labels.columns

Index(['p_id', 'url', 'used_at', 'duration', 'yt_video_id', 'package_version',
       'enddevice', 'batch', 'Group', 'start_date', 't', 'wave', 'end_date',
       'start_intervention', 'start_knowledge', 'topic', 'series',
       'annotation_type', 'good_for_training', 'good_for_augmentation',
       'category', 'is_direct_topic_annotated', 'is_direct_topic_full',
       'is_topic'],
      dtype='object')

In [6]:
print("Number of pages: ", len(df_labels))

Number of pages:  761232


In [7]:
df_labels_topic = df_labels[df_labels.topic == TOPIC]

In [8]:
print("Number of pages of this topic: ", len(df_labels_topic))

Number of pages of this topic:  234120


**Remove Duplicates:**

In [9]:
# Removing duplicates based on 'view_url', 'batch_id', and 'topic'
print("Number of pages with labels: ", len(df_labels_topic))
df_labels_topic = df_labels_topic.drop_duplicates(subset=['url', 'batch', 'topic'], keep='first')
print("Number of pages with labels after removing duplicates: ", len(df_labels_topic))

Number of pages with labels:  234120
Number of pages with labels after removing duplicates:  88922


**Convert to Huggingface Dataset:**

In [10]:
from urllib.parse import urlparse


def extract_domain(url: str) -> str:
    """
    Extracts the domain from a given URL. Prepends 'http://' if no scheme is found to ensure correct parsing. Returns an empty string if the URL is invalid or empty.
    """
    if not url:
        return ""
    
    if not urlparse(url).scheme:
        url = "http://" + url
    
    return urlparse(url).netloc.split('/')[0]

print(extract_domain("www.example.com/path/to/resource"))  # Output: www.example.com
print(extract_domain(""))

www.example.com



In [11]:
transformed_data = {
    '_id': df_labels_topic['p_id'].apply(lambda x: 'dummy_id_' + str(x)),
    'batch_id': df_labels_topic['batch'],  
    'domain': df_labels_topic['url'].apply(extract_domain),  # Extract domain from URL
    'view_url': df_labels_topic['url'], 
    'lang': 'na', 
    'text': "",
    'text_length': 0,
    'word_count': 0,
    'topic': df_labels_topic['topic'],  
    'category': df_labels_topic['category'],
    'good_for_training': df_labels_topic['good_for_training'],
    'good_for_augmentation': df_labels_topic['good_for_augmentation'],
    'annotation_type': df_labels_topic['annotation_type'],
    'is_topic': df_labels_topic['is_topic'],
    'token_count': 0,
    'chunk_id': 0,
    #'label': df_labels_topic['label'],
}

df_urls = pd.DataFrame(transformed_data)
dataset_urls = Dataset.from_pandas(df_urls)

In [12]:
# Convert boolean labels to integers (True to 1, False to 0)
dataset_urls = dataset_urls.map(lambda example: {'label': int(example['is_topic'])})

# Define a ClassLabel feature for the converted integer labels
class_label_feature = ClassLabel(num_classes=2, names=['False', 'True'])

# Update the features of the dataset
new_features = dataset_urls.features.copy()
new_features['label'] = class_label_feature

# Cast the dataset to the new features
dataset = dataset_urls.cast(new_features)

Map:   0%|          | 0/88922 [00:00<?, ? examples/s]

Map: 100%|██████████| 88922/88922 [00:16<00:00, 5531.18 examples/s]
Casting the dataset: 100%|██████████| 88922/88922 [00:00<00:00, 1482812.51 examples/s]


In [13]:
dataset_urls

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'token_count', 'chunk_id', '__index_level_0__', 'label'],
    num_rows: 88922
})

## Load Chunkified Texts

In [14]:
from datasets import load_from_disk

In [15]:
dataset = load_from_disk(
    f"../../data_ccu/tmp/processed_dataset_{TOPIC}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}/processed_dataset_{TOPIC}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}")

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 33702
    })
    extended: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length

In [17]:
dataset["train"].features

{'_id': Value(dtype='string', id=None),
 'batch_id': Value(dtype='int64', id=None),
 'domain': Value(dtype='string', id=None),
 'view_url': Value(dtype='string', id=None),
 'lang': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'text_length': Value(dtype='int64', id=None),
 'word_count': Value(dtype='int64', id=None),
 'topic': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'good_for_training': Value(dtype='string', id=None),
 'good_for_augmentation': Value(dtype='string', id=None),
 'annotation_type': Value(dtype='string', id=None),
 'is_topic': Value(dtype='bool', id=None),
 'label': Value(dtype='int64', id=None),
 'token_count': Value(dtype='int64', id=None),
 'chunk_id': Value(dtype='int64', id=None)}

In [18]:
dataset_urls.features

{'_id': Value(dtype='string', id=None),
 'batch_id': Value(dtype='int64', id=None),
 'domain': Value(dtype='string', id=None),
 'view_url': Value(dtype='string', id=None),
 'lang': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None),
 'text_length': Value(dtype='int64', id=None),
 'word_count': Value(dtype='int64', id=None),
 'topic': Value(dtype='string', id=None),
 'category': Value(dtype='string', id=None),
 'good_for_training': Value(dtype='string', id=None),
 'good_for_augmentation': Value(dtype='string', id=None),
 'annotation_type': Value(dtype='string', id=None),
 'is_topic': Value(dtype='bool', id=None),
 'token_count': Value(dtype='int64', id=None),
 'chunk_id': Value(dtype='int64', id=None),
 '__index_level_0__': Value(dtype='int64', id=None),
 'label': Value(dtype='int64', id=None)}

In [19]:
def select_columns(dataset: Dataset, columns_to_keep: list) -> Dataset:
    """Returns a new dataset containing only the specified columns."""
    # Directly compute columns to remove and apply removal
    return dataset.remove_columns([col for col in dataset.column_names if col not in columns_to_keep])


In [20]:
columns = set(dataset["train"].features.keys())
dataset_urls = dataset_urls.select_columns(columns)

In [21]:
# Access the schema (features) of the existing dataset
schema = dataset["train"].features
dataset_urls = dataset_urls.cast(schema)

Casting the dataset:   0%|          | 0/88922 [00:00<?, ? examples/s]

Casting the dataset: 100%|██████████| 88922/88922 [00:00<00:00, 820731.64 examples/s]


In [22]:
dataset_urls

Dataset({
    features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
    num_rows: 88922
})

## Add URLs as Chunks

**Determine URLs to add:**

In [23]:
# Concatenate the view_url columns from all splits
view_urls = pd.concat([
    pd.Series(dataset['train']['view_url']),
    pd.Series(dataset['test']['view_url']),
    pd.Series(dataset['holdout']['view_url']),
    pd.Series(dataset['extended']['view_url'])
])

# Get unique view_urls
unique_view_urls = view_urls.unique()
unique_view_urls_list = unique_view_urls.tolist()

print(f"Number of unique view_urls: {len(unique_view_urls_list)}")
print("Some examples of unique view_urls:", unique_view_urls_list[:3])


Number of unique view_urls: 48147
Some examples of unique view_urls: ['www.quoka.de/wellness-gesundheit/massage/duesseldorf/sc_15_ct_119205_page_2.html', 'https://www.oberpfalzecho.de/beitrag/cannabislegalisierung-no-go-fuer-polizei-im-landkreis-new', 'https://www1.wdr.de/nachrichten/cannabis-freigabe-lauterbach-legalisierung-100.html']


In [24]:
dataset_urls = dataset_urls.filter(lambda example: example['view_url'] not in unique_view_urls_list)
print("Number of pages after filtering out pages already in the dataset: ", len(dataset_urls))

Filter: 100%|██████████| 88922/88922 [00:56<00:00, 1568.83 examples/s]

Number of pages after filtering out pages already in the dataset:  40518





**Extend Dataset:**

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 33702
    })
    extended: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length

In [26]:
dataset_urls_holdout = dataset_urls.filter(lambda example: example['good_for_training'] == "True")

dataset_urls_extended = dataset_urls.filter(lambda example: example['good_for_training'] == "False")

dataset["holdout_url"] = dataset_urls_holdout
dataset["extended_url"] = dataset_urls_extended

#dataset_combined_tmp = concatenate_datasets([dataset["train"], dataset_urls])

Filter:   0%|          | 0/40518 [00:00<?, ? examples/s]

Filter: 100%|██████████| 40518/40518 [00:01<00:00, 20485.55 examples/s]
Filter: 100%|██████████| 40518/40518 [00:02<00:00, 19592.22 examples/s]


In [27]:
dataset

DatasetDict({
    train: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 3815
    })
    test: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 507
    })
    holdout: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length', 'word_count', 'topic', 'category', 'good_for_training', 'good_for_augmentation', 'annotation_type', 'is_topic', 'label', 'token_count', 'chunk_id'],
        num_rows: 33702
    })
    extended: Dataset({
        features: ['_id', 'batch_id', 'domain', 'view_url', 'lang', 'text', 'text_length

## Save extended Dataset

In [28]:
dataset.save_to_disk(f"../../data_ccu/tmp/processed_dataset_{TOPIC}_buffed_chunkified_{SAMPLING}{SUFFIX}_{MAX_CONTENT_LENGTH}_with_urls")

Saving the dataset (1/1 shards): 100%|██████████| 3815/3815 [00:00<00:00, 300837.95 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 507/507 [00:00<00:00, 48571.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 33702/33702 [00:00<00:00, 517209.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 224737/224737 [00:00<00:00, 633894.66 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1094/1094 [00:00<00:00, 38358.58 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 39424/39424 [00:00<00:00, 57530.26 examples/s]
