## Merge Labels and Texts

In [1]:
import os
import sys

# Needed to import modules from parent directory
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
from utils.database import *
from utils.files import *
from tqdm import tqdm
from bson import ObjectId
import pandas as pd 
import numpy as np

## Load URLs

URLs per batch and topic.

In [3]:
df_labels = pd.read_json('../../data//raw/pages_with_labels.json', orient='records', lines=True)
df_labels.head()

Unnamed: 0,p_id,url,used_at,duration,yt_video_id,package_version,enddevice,batch,Group,start_date,...,start_intervention,start_knowledge,topic,series,annotation_type,good_for_training,good_for_augmentation,category,is_direct_topic_annotated,is_direct_topic_full
0,273726366948,mingle.respondi.de/,2023-06-13 14:12:16,393,,1210041502.0,mobile,15,Search,2023-06-13 14:12:00,...,2023-06-13 14:18:35,2023-06-14 18:09:40,kinder,,domain_discarded,False,True,other,,False
1,273726366948,mingle.respondi.de/,2023-06-13 20:09:47,2,,1210041502.0,mobile,15,Search,2023-06-13 14:12:00,...,2023-06-13 14:18:35,2023-06-14 18:09:40,kinder,,domain_discarded,False,True,other,,False
2,273746614716,mingle.respondi.de/,2023-06-14 16:19:10,492,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False
3,273746614716,mingle.respondi.de/,2023-06-14 16:28:00,1,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False
4,273746614716,mingle.respondi.de/,2023-06-14 16:36:39,13,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False


In [4]:
print("Number of pages: ", len(df_labels))

Number of pages:  761232


In [5]:
df_labels.columns

Index(['p_id', 'url', 'used_at', 'duration', 'yt_video_id', 'package_version',
       'enddevice', 'batch', 'Group', 'start_date', 't', 'wave', 'end_date',
       'start_intervention', 'start_knowledge', 'topic', 'series',
       'annotation_type', 'good_for_training', 'good_for_augmentation',
       'category', 'is_direct_topic_annotated', 'is_direct_topic_full'],
      dtype='object')

In [6]:
def get_freqs(dataframe, value_column):
    """Prints the frequency distribution of a specified value column, grouped by 'p_id', as a DataFrame."""
    unique_df = dataframe.drop_duplicates(subset=['p_id', value_column])
    value_counts = unique_df[value_column].value_counts().reset_index()
    value_counts.columns = [value_column, 'Frequency']
    return value_counts

In [7]:
get_freqs(df_labels, 'category')

Unnamed: 0,category,Frequency
0,other,867
1,search,731
2,news,643


In [8]:
get_freqs(df_labels, 'good_for_training')

Unnamed: 0,good_for_training,Frequency
0,False,870
1,True,690


In [9]:
get_freqs(df_labels, 'annotation_type')

Unnamed: 0,annotation_type,Frequency
0,domain_discarded,868
1,03.searches,654
2,04.urls-with-title,647
3,09.other,637
4,13.searches,294
5,06.news-wo-title,246
6,01.youtube,209
7,07.sm,205
8,10.new_news,146
9,12.surfaced_domains,142


## Remove Duplicates

In [10]:
# Removing duplicates based on 'view_url', 'batch_id', and 'topic'
print("Number of pages with labels: ", len(df_labels))
df_labels = df_labels.drop_duplicates(subset=['url', 'batch', 'topic'], keep='first')
print("Number of pages with labels after removing duplicates: ", len(df_labels))

Number of pages with labels:  761232
Number of pages with labels after removing duplicates:  288979


## Load Texts

In [11]:
from datasets import load_dataset, Features, Value, Dataset

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# File paths to load texts
file_path = f"../../data/raw/pages_with_text.json"
save_path = f"../../data/tmp/processed_dataset_all_topics_text_only"

In [13]:
# Define the schema 
features_schema = Features({
    '_id': Value('string'),
    'batch_id': Value('int64'),
    'domain': Value('string'),
    'encoding': Value('string'),
    'file_id': Value('string'),
    'lang': Value('string'),
    'text': Value('string'),
    'text_length': Value('int64'),
    'url': Value('string'),
    'word_count': Value('int64'),
    'view_url': Value('string'),
})

# Load the dataset from the JSON Lines file
dataset = load_dataset('json', data_files=file_path, split='train', features=features_schema, keep_in_memory=False)

In [14]:
# Processing the dataset in batches and applying transformations
def process_batch(batch):
    return batch # TODO: Returning the batch as is to include all attributes from the JSON objects

processed_dataset = dataset.map(process_batch, batched=True, batch_size=1000)

# Save the processed dataset to the specified folder
processed_dataset.save_to_disk(save_path)
print(f"Dataset saved to {save_path}")

Map:   0%|          | 0/3437316 [00:00<?, ? examples/s]

Map: 100%|██████████| 3437316/3437316 [00:25<00:00, 132807.96 examples/s]
Saving the dataset (43/43 shards): 100%|██████████| 3437316/3437316 [00:22<00:00, 152096.70 examples/s]

Dataset saved to ../../data/tmp/processed_dataset_all_topics_text_only





In [15]:
processed_dataset

Dataset({
    features: ['_id', 'batch_id', 'domain', 'encoding', 'file_id', 'lang', 'text', 'text_length', 'url', 'word_count', 'view_url'],
    num_rows: 3437316
})

In [16]:
processed_dataset[0]

{'_id': '648c2ad88e8cadbd29004dff',
 'batch_id': 15,
 'domain': 'microsoft.com',
 'encoding': 'utf-8',
 'file_id': '648c67b41a16a1878d01c628',
 'lang': 'en',
 'text': " Learn more: Accessibility with Microsoft Edge Your browser has been updated to the newest version. Microsoft Edge is the fast and secure browser that helps you protect your data and save time. Next Browse with confidence using Browser essentials Stay informed about your browser's performance and security with a single intuitive view that provides helpful suggestions for performance optimization and browser protection. Go to Microsoft Edge's toolbar and select the Browser essentials heartbeat icon to get started. Learn more Next Get quick answers with Bing Chat Whether you're shopping online, planning your next trip, or comparing options -- simply ask a question at the side of your screen, and have Bing do the work for you. Try now Learn more Next Translate the web to your language Microsoft Edge makes it easy to read we

## Merge Labels and Text

In [16]:
# Convert the processed dataset to a pandas DataFrame
df_texts = processed_dataset.to_pandas()
df_labels.rename(columns={'url': 'view_url', 'batch': 'batch_id'}, inplace=True)

# Merge the text and labels DataFrames
df_texts_and_labels = pd.merge(df_texts, df_labels, 
                               left_on=["batch_id", "view_url"], 
                               right_on=["batch_id", "view_url"], 
                               how='inner', suffixes=('_text', '_label'))

# Convert the 'good_for_training' column to boolean
df_texts_and_labels["is_topic"] = df_texts_and_labels["is_direct_topic_full"].map({'True': True, 'False': False})
df_texts_and_labels.columns

Index(['_id', 'batch_id', 'domain', 'encoding', 'file_id', 'lang', 'text',
       'text_length', 'url', 'word_count', 'view_url', 'p_id', 'used_at',
       'duration', 'yt_video_id', 'package_version', 'enddevice', 'Group',
       'start_date', 't', 'wave', 'end_date', 'start_intervention',
       'start_knowledge', 'topic', 'series', 'annotation_type',
       'good_for_training', 'good_for_augmentation', 'category',
       'is_direct_topic_annotated', 'is_direct_topic_full', 'is_topic'],
      dtype='object')

In [17]:
df_texts_and_labels.head()

Unnamed: 0,_id,batch_id,domain,encoding,file_id,lang,text,text_length,url,word_count,...,start_knowledge,topic,series,annotation_type,good_for_training,good_for_augmentation,category,is_direct_topic_annotated,is_direct_topic_full,is_topic
0,648c2ad88e8cadbd29004e47,15,amazon.de,UTF-8,648cbb291a16a1878d154a6b,de,Weiter ohne zu akzeptieren Wählen Sie Ihre Co...,5733,https://www.amazon.de/gp/cart/view.html?ref_=n...,734,...,2023-06-13 13:10:44,kinder,,domain_discarded,False,True,other,,False,False
1,648c2ad88e8cadbd29004e48,15,amazon.de,UTF-8,648ccf291a16a1878d19f814,de,Weiter ohne zu akzeptieren Wählen Sie Ihre Co...,5733,https://www.amazon.de/gp/cart/view.html/258-39...,734,...,2023-06-14 12:50:10,kinder,,domain_discarded,False,True,other,,False,False
2,648c2ad88e8cadbd29004e4f,15,t-online.de,utf-8,648c84911a16a1878d087f7b,de,Wetter DAX Telefonverzeichnisse Lotto Telekom ...,22912,https://www.t-online.de/logout/,2892,...,2023-06-13 21:04:03,kinder,5xxx,06.news-wo-title,True,True,news,0.0,False,False
3,648c2ad88e8cadbd29004e71,15,spiegel.de,utf-8,648c8f811a16a1878d0b124a,de,Zum Inhalt springen DER SPIEGEL Abonnement Ab...,32414,https://www.spiegel.de/,4494,...,2023-06-13 19:19:23,kinder,,domain_discarded,False,True,news,,False,False
4,648c2ad88e8cadbd29004ebd,15,amazon.de,ISO-8859-1,648cc6691a16a1878d17f9cd,de,Geben Sie die Zeichen unten ein Wir bitten um...,387,https://www.amazon.de/,58,...,2023-06-13 15:50:34,kinder,,domain_discarded,False,True,other,,False,False


In [18]:
len(df_texts_and_labels)

162111

## Remove Pages with same content across Batches

In [19]:
print("Number of pages with labels: ", len(df_texts_and_labels))

# Removing duplicates based on 'view_url', 'batch_id', and 'topic'
#df_texts_and_labels = df_texts_and_labels.drop_duplicates(subset=['view_url', 'text', 'topic'], keep='first')
df_texts_and_labels = df_texts_and_labels.drop_duplicates(subset=['view_url', 'topic'], keep='first') # Only keeping one text per page even if it changes

print("Number of pages with labels after removing duplicates: ", len(df_texts_and_labels))

Number of pages with labels:  162111
Number of pages with labels after removing duplicates:  160508


## Save Intermediate Datatset

In [20]:
# Convert the merged DataFrame back to a Dataset
dataset_texts_and_labels = Dataset.from_pandas(df_texts_and_labels)

In [21]:
save_path = f"../../data/tmp/processed_dataset_all_topics"
dataset_texts_and_labels.save_to_disk(save_path)

Saving the dataset (0/3 shards):   0%|          | 0/160508 [00:00<?, ? examples/s]

Saving the dataset (3/3 shards): 100%|██████████| 160508/160508 [00:03<00:00, 46411.78 examples/s]


In [22]:
dataset_texts_and_labels

Dataset({
    features: ['_id', 'batch_id', 'domain', 'encoding', 'file_id', 'lang', 'text', 'text_length', 'url', 'word_count', 'view_url', 'p_id', 'used_at', 'duration', 'yt_video_id', 'package_version', 'enddevice', 'Group', 'start_date', 't', 'wave', 'end_date', 'start_intervention', 'start_knowledge', 'topic', 'series', 'annotation_type', 'good_for_training', 'good_for_augmentation', 'category', 'is_direct_topic_annotated', 'is_direct_topic_full', 'is_topic', '__index_level_0__'],
    num_rows: 160508
})

## Split Dataset by Topic

In [23]:
topics = ["kinder", "energie", "cannabis"]

In [24]:
for topic in topics:
    
    print("Processing topic: ", topic)
    
    # Filter the dataset for the topic
    dataset_topic = dataset_texts_and_labels.filter(lambda example: example['topic'] == topic, num_proc=16)
    dataset_topic.save_to_disk(f"../../data/tmp/processed_dataset_{topic}")
    print("Number of examples in dataset_topic: ", len(dataset_topic))
    
    # Filter the dataset for the topic and positive labels
    dataset_topic_positive = dataset_topic.filter(lambda example: example['is_topic'], num_proc=16)
    dataset_topic_positive.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_positive")
    print("Number of examples in dataset_topic_positive: ", len(dataset_topic_positive))
    
    # Filter the dataset for the topic and negative labels
    dataset_topic_negative = dataset_topic.filter(lambda example: not example['is_topic'], num_proc=16)
    dataset_topic_negative.save_to_disk(f"../../data/tmp/processed_dataset_{topic}_negative")
    print("Number of examples in dataset_topic_negative: ", len(dataset_topic_negative))


Processing topic:  kinder


Filter (num_proc=16): 100%|██████████| 160508/160508 [01:03<00:00, 2520.07 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 58703/58703 [00:03<00:00, 16973.27 examples/s]

Number of examples in dataset_topic:  58703



Filter (num_proc=16): 100%|██████████| 58703/58703 [01:02<00:00, 942.01 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 181/181 [00:00<00:00, 5959.78 examples/s]

Number of examples in dataset_topic_positive:  181



Filter (num_proc=16): 100%|██████████| 58703/58703 [01:02<00:00, 937.50 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 58522/58522 [00:03<00:00, 17289.78 examples/s]

Number of examples in dataset_topic_negative:  58522
Processing topic:  energie



Filter (num_proc=16): 100%|██████████| 160508/160508 [01:01<00:00, 2593.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 52165/52165 [00:03<00:00, 17331.77 examples/s]

Number of examples in dataset_topic:  52165



Filter (num_proc=16): 100%|██████████| 52165/52165 [01:01<00:00, 842.35 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 190/190 [00:00<00:00, 8590.53 examples/s]

Number of examples in dataset_topic_positive:  190



Filter (num_proc=16): 100%|██████████| 52165/52165 [01:01<00:00, 842.14 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 51975/51975 [00:02<00:00, 17559.55 examples/s]

Number of examples in dataset_topic_negative:  51975
Processing topic:  cannabis



Filter (num_proc=16): 100%|██████████| 160508/160508 [01:01<00:00, 2609.94 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 49640/49640 [00:02<00:00, 17518.17 examples/s]

Number of examples in dataset_topic:  49640



Filter (num_proc=16): 100%|██████████| 49640/49640 [01:01<00:00, 806.01 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 105/105 [00:00<00:00, 7382.48 examples/s]

Number of examples in dataset_topic_positive:  105



Filter (num_proc=16): 100%|██████████| 49640/49640 [01:01<00:00, 803.87 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 49535/49535 [00:02<00:00, 17491.99 examples/s]

Number of examples in dataset_topic_negative:  49535



