## Describe Annotations

In [1]:
import os
import sys

# Put the parent directory on the module search path (only when it is not
# already listed) so that modules located there can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [2]:
from utils.files import *
from tqdm import tqdm
import pandas as pd

## Read CSV File

In [3]:
# Location of the annotated CSV export, given relative to the working directory.
path_to_annotations = "../../data/20h_experiment/5200-df-full-annotated-feb-and-may-julian-version.csv"
# Load the file through the project helper. Presumably this yields one dict per
# CSV row -- the next cell indexes the result with [0] and calls .keys() on it.
annotations = read_csv_as_dict(path_to_annotations)

In [4]:
# Quick sanity check of the loaded rows: how many there are, which columns
# they carry, and what a single record looks like.
print('Number of annotations: {}'.format(len(annotations)))
first_annotation = annotations[0]
print('Columns: {}'.format(list(first_annotation.keys())))
print('Example: {}'.format(first_annotation))

Number of annotations: 761232
Columns: ['p_id', 'url', 'used_at', 'duration', 'yt_video_id', 'package_version', 'enddevice', 'batch', 'Group', 'start_date', 't', 'wave', 'end_date', 'start_intervention', 'start_knowledge', 'topic', 'series', 'annotation_type', 'good_for_training', 'good_for_augmentation', 'category', 'is_direct_topic_annotated', 'is_direct_topic_full']
Example: {'p_id': '273726366948', 'url': 'mingle.respondi.de/', 'used_at': '2023-06-13 14:12:16', 'duration': '393', 'yt_video_id': '', 'package_version': '1210041502.0', 'enddevice': 'mobile', 'batch': '15', 'Group': 'Search', 'start_date': '2023-06-13 14:12:00', 't': '1', 'wave': '1. Kinder', 'end_date': '2023-06-14 18:11:45', 'start_intervention': '2023-06-13 14:18:35', 'start_knowledge': '2023-06-14 18:09:40', 'topic': 'kinder', 'series': '', 'annotation_type': 'domain_discarded', 'good_for_training': 'False', 'good_for_augmentation': 'True', 'category': 'other', 'is_direct_topic_annotated': '', 'is_direct_topic_full

In [5]:
# Build a DataFrame from the loaded row records. No dtype conversion is done
# here; later cells compare the label column against the string "True".
df_annotations = pd.DataFrame(data=annotations)

In [6]:
# Preview the first five rows (rich notebook display of the cell's last expression).
df_annotations.head(n=5)

Unnamed: 0,p_id,url,used_at,duration,yt_video_id,package_version,enddevice,batch,Group,start_date,...,start_intervention,start_knowledge,topic,series,annotation_type,good_for_training,good_for_augmentation,category,is_direct_topic_annotated,is_direct_topic_full
0,273726366948,mingle.respondi.de/,2023-06-13 14:12:16,393,,1210041502.0,mobile,15,Search,2023-06-13 14:12:00,...,2023-06-13 14:18:35,2023-06-14 18:09:40,kinder,,domain_discarded,False,True,other,,False
1,273726366948,mingle.respondi.de/,2023-06-13 20:09:47,2,,1210041502.0,mobile,15,Search,2023-06-13 14:12:00,...,2023-06-13 14:18:35,2023-06-14 18:09:40,kinder,,domain_discarded,False,True,other,,False
2,273746614716,mingle.respondi.de/,2023-06-14 16:19:10,492,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False
3,273746614716,mingle.respondi.de/,2023-06-14 16:28:00,1,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False
4,273746614716,mingle.respondi.de/,2023-06-14 16:36:39,13,,1210041502.0,mobile,15,Control,2023-06-14 16:19:00,...,2023-06-14 16:26:33,2023-06-15 15:34:00,kinder,,domain_discarded,False,True,other,,False


## Basic Statistics

In [7]:
# Per-column summary statistics of the annotation frame, printed as plain text.
annotation_summary = df_annotations.describe()
print(annotation_summary)

                p_id                      url              used_at duration  \
count         761232                   761232               761232   761232   
unique           872                   267332               430016     1647   
top     323733401748  navigator.gmx.net/xxxxx  2023-06-12 18:43:57        2   
freq            9462                     5347                   12   140332   

       yt_video_id package_version enddevice   batch   Group  \
count       761232          761232    761232  761232  761232   
unique        3264               9         2       3       3   
top                                  desktop      16  Search   
freq        746752          751889    682812  506276  258601   

                 start_date  ...   start_intervention      start_knowledge  \
count                761232  ...               761232               761232   
unique                 1406  ...                 1952                 1954   
top     2023-06-26 13:55:00  ...  2023-06-13 12:1

**Batches:**

In [8]:
# Number of rows per batch, most frequent batch first.
batch_counts = df_annotations['batch'].value_counts(sort=True, ascending=False)
print(batch_counts)

batch
16    506276
15    248030
17      6926
Name: count, dtype: int64


**Topics:**

In [9]:
# Number of rows per topic, most frequent topic first.
topic_counts = df_annotations['topic'].value_counts(sort=True, ascending=False)
print(topic_counts)

topic
kinder      278295
energie     248817
cannabis    234120
Name: count, dtype: int64


**Labels:**

In [10]:
# Distribution of the `is_direct_topic_full` label over all rows,
# most frequent value first.
label_counts = df_annotations['is_direct_topic_full'].value_counts(sort=True, ascending=False)
print(label_counts)

is_direct_topic_full
False    757130
True       4102
Name: count, dtype: int64


**Labels per Topic:**

In [11]:
# Label distribution within each topic: group the label column by topic,
# then count each label value inside every group.
labels_by_topic = df_annotations.groupby('topic')['is_direct_topic_full']
value_counts_per_topic = labels_by_topic.value_counts()
print(value_counts_per_topic)

topic     is_direct_topic_full
cannabis  False                   233094
          True                      1026
energie   False                   247470
          True                      1347
kinder    False                   276566
          True                      1729
Name: count, dtype: int64


**Unique URLs per Topic and Label:**

In [12]:
# Number of distinct URLs for every (topic, label) combination.
urls_by_topic_and_label = df_annotations.groupby(['topic', 'is_direct_topic_full'])['url']
unique_url_counts = urls_by_topic_and_label.nunique()
print(unique_url_counts)

topic     is_direct_topic_full
cannabis  False                    87927
          True                       309
energie   False                    93136
          True                       479
kinder    False                   103861
          True                       536
Name: url, dtype: int64


In [13]:
# Same counts as a flat table: the group keys become ordinary columns and the
# count column is named 'unique_urls_count'.
unique_url_table = unique_url_counts.reset_index(name='unique_urls_count')
print(unique_url_table)


      topic is_direct_topic_full  unique_urls_count
0  cannabis                False              87927
1  cannabis                 True                309
2   energie                False              93136
3   energie                 True                479
4    kinder                False             103861
5    kinder                 True                536


In [14]:
# Distinct URLs per (topic, label, batch).
# NOTE: this rebinds `unique_url_counts`, replacing the (topic, label) result
# computed in an earlier cell.
grouping_keys = ['topic', 'is_direct_topic_full', 'batch']
unique_url_counts = df_annotations.groupby(grouping_keys)['url'].nunique()
print(unique_url_counts)

topic     is_direct_topic_full  batch
cannabis  False                 16       85495
                                17        3110
          True                  16         276
                                17          41
energie   False                 16       93136
          True                  16         479
kinder    False                 15       92277
                                16       13615
          True                  15         477
                                16          73
Name: url, dtype: int64


**Positive Examples:**

In [15]:
# Keep only the rows labelled as directly on-topic. The label is compared
# against the *string* "True": the column appears to hold text values rather
# than booleans (the frame is built from raw CSV row records).
filtered_df = df_annotations[df_annotations['is_direct_topic_full'] == "True"]
# Up to five distinct URLs per topic. The 'url' column is selected *before*
# `apply`, so the applied function never sees the grouping column. This avoids
# the DataFrameGroupBy.apply DeprecationWarning raised by pandas >= 2.2 and
# always yields a Series indexed by (topic, original row label).
grouped_urls = filtered_df.groupby('topic')['url'].apply(
    lambda urls: urls.drop_duplicates().head(5)
)
# Drop the innermost index level (the original row labels) so the Series is
# indexed by topic only.
grouped_urls = grouped_urls.reset_index(level=-1, drop=True)

In [16]:
# Print the sampled URLs as one bulleted list per topic, each list followed
# by a blank line.
for topic, topic_urls in grouped_urls.groupby(level=0):
    print(f"Topic: {topic}")
    for url in topic_urls.tolist():
        print(f"- {url}")
    print()


Topic: cannabis
- google.com/search?q=cannabis+legalisierung&sxsrf=APwXEdf8-EAKfbipdjHncqqoDDuxeH2pnw%3A1687786323773&source=hp&ei=U5OZZPXiLKjtkdUPyfeewAU&iflsig=AOEireoAAAAAZJmhY8KH5MtrCBZA6rjWTd9k9uo5Uclz&oq=cannabis&gs_lcp=Cgdnd3Mtd2l6EAEYADIOCAAQgAQQsQMQgwEQyQMyCAgAEIAEEJIDMgsIABCABBCxAxCDATIICAAQgAQQsQMyCAgAEIAEELEDMg4ILhCDARDUAhCxAxCABDILCAAQgAQQsQMQgwEyCAgAEIAEELEDMggIABCABBCxAzIFCAAQgAQ6BwgjEOoCECc6EQguEIAEELEDEIMBEMcBENEDOgUILhCABDoLCAAQigUQsQMQgwE6CwguEIAEEMcBENEDOgQIIxAnOgcIIxCKBRAnOgsILhCKBRCxAxCDAToRCC4QgwEQxwEQsQMQ0QMQgAQ6CwguEIAEELEDEIMBOggILhCABBCxA1DMKFjSRGCKbGgBcAB4AIABpAGIAboHkgEDMy41mAEAoAEBsAEK&sclient=gws-wiz
- bundesgesundheitsministerium.de/presse/pressemitteilungen/eckpunkte-cannabis-12-04-23.html
- aussiedlerbote.de/2023/04/predstavlen-zakonoproekt-o-legalizacii-kannabisa/
- google.com/search?q=Gesetzen+zur+Cannabis-Legalisierung&oq=Gesetzen+zur+Cannabis-Legalisierung&gs_lcrp=EgZjaHJvbWUyBggAEEUYOdIBBzU4MmowajmoAgCwAgA&sourceid=chrome&ie=UTF-8
- google.com/sea

In [17]:
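# TODO: unfinished draft -- the filter below is missing its second condition
# (note that pandas boolean masks are combined with `&`, not `and`).
# Complete it or delete this cell.
#filtered_df = df_annotations[df_annotations['is_direct_topic_full'] == "True" and ]
#grouped_urls = filtered_df.groupby('topic').apply(lambda x: x['url'].drop_duplicates().head(5))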