In [1]:
import os
import sys
import re
import time
import string
import multiprocessing as mp

from tinydb import TinyDB, Query
from tinydb import where as tinydb_where
from tinydb.storages import JSONStorage
from tinydb.middlewares import CachingMiddleware

import stanza
from textacy import preprocessing as textacy_preprocessing
import ftfy
from corpus_toolkit import corpus_tools as ct
from somajo import SoMaJo
import pandas as pd

import simhash

from IPython.display import clear_output
from tqdm.notebook import tqdm

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", 200)

In [2]:
os.getcwd()

'/mnt/f/PrivacyPolicy-Projects/HbbTV/code'

In [3]:
measurement = "study"
data_dir= "../data/Measurement_" + measurement
db_path = "../data/Measurement_" + measurement + "/hbbtv_policies_database_" + measurement + ".json"

In [4]:
list_of_token_classes = {'URL',
                         'XML_entity',
                         'XML_tag',
                         'abbreviation',
                         'action_word',
                         'amount',
                         'date',
                         'email_address',
                         'emoticon',
                         'hashtag',
                         'measurement',
                         'mention',
                         'number',
                         'ordinal',
                         'regular',
                         'semester',
                         'symbol',
                         'time'}

# Dictionary Analysis

## Load Datasets

### Load dataset with extracted text

In [5]:
def load_policies(language):
    """
    Load policies in the passed language and their metadata
    """

    def load_metadata_with_FN_labels_corrected():
        df_metadata = pd.read_csv("../data/Measurement_" + measurement + "/hbbtv_" + measurement + "_metadata_FN_labels_corrected.tsv", sep="\t")
        return df_metadata

    df_metadata = load_metadata_with_FN_labels_corrected()
    df_metadata = df_metadata.loc[(df_metadata["determined_language"] == language) & (df_metadata["corrected_predicted_label"] == 1) & (df_metadata["sha1"] != "ab2e094a92d2bed3188fcb17eb8c7a0baabbc66c")]
    print(f"df_metadata_{language}: {df_metadata.shape}", flush=True)

    list_of_text_ids = df_metadata["text_id"].tolist()

    db = TinyDB(db_path, storage=CachingMiddleware(JSONStorage))
    policies_table = db.table("policies")

    list_of_policy_dicts = policies_table.search(Query().text_id.one_of(list_of_text_ids))

    df = pd.DataFrame(list_of_policy_dicts)
    df.drop_duplicates(subset=["sha1", "channel"], inplace=True)
    df.drop(["text_canola", "text_readability", "text_markdown"], axis=1, inplace=True)
    df.drop_duplicates(subset=["sha1", "channel"]).to_json("../data/Measurement_" + measurement + "/hbbtv_privacy_policies_" + measurement + "_" + language + "_deduplicated.json", orient="records", lines=True, force_ascii=False)

    print(f"Loaded {df.shape[0]} deduplicated policies for language {language}", flush=True)
    db.close()

    return df

def load_policy_multilingual(sha1):

    def load_metadata_with_FN_labels_corrected():
        df_metadata = pd.read_csv("../data/Measurement_" + measurement + "/hbbtv_" + measurement + "_metadata_FN_labels_corrected.tsv", sep="\t")
        return df_metadata

    df_metadata = load_metadata_with_FN_labels_corrected()
    df_metadata = df_metadata.loc[(df_metadata["corrected_predicted_label"] == 1) & (df_metadata["sha1"] == sha1)]
    print(f"df_metadata_multilingual: {df_metadata.shape}", flush=True)

    list_of_text_ids = df_metadata["text_id"].tolist()

    db = TinyDB(db_path, storage=CachingMiddleware(JSONStorage))
    policies_table = db.table("policies")

    list_of_policy_dicts = policies_table.search(Query().text_id.one_of(list_of_text_ids))

    df = pd.DataFrame(list_of_policy_dicts)
    df.drop_duplicates(subset=["sha1", "channel"], inplace=True)
    df.drop(["text_canola", "text_readability", "text_markdown"], axis=1, inplace=True)
    df.drop_duplicates(subset=["sha1", "channel"]).to_json("../data/Measurement_" + measurement + "/hbbtv_privacy_policies_" + measurement + "_multilingual_deduplicated.json", orient="records", lines=True, force_ascii=False)

    print(f"Loaded {df.shape[0]} deduplicated policies for multilingual and sha1 {sha1}", flush=True)
    db.close()

    return df


def text_cleaner(text):
    """
    Customized code to preprocess the extracted texts lightly: 
    Note that the preprocessing does not lead to changing the semtantic of the texts as this would lead to the extraction of incorrect sentence embeddings using BERT.
    """
    text = textacy_preprocessing.normalize.bullet_points(text)
    text = textacy_preprocessing.normalize.unicode(text)
    text = ftfy.fix_text(text) # fix unicode errors and other special characters
    text = text.replace("\n", "\n\n") # for splitting by linebreak
    text = textacy_preprocessing.normalize.hyphenated_words(text) # other hyphenated words
    text = textacy_preprocessing.normalize.whitespace(text)
    text = re.sub(" +", " ", "".join(x if x.isprintable() or x in string.whitespace else " " for x in text))
    return text


def lemmatize_text(text, language):
    nlp = stanza.Pipeline(lang=language, processors='tokenize,mwt,pos,lemma', verbose=False, logging_level="ERROR")
    text = text_cleaner(text)
    doc = nlp(text)
    lemmas = [word.lemma.lower() for t in doc.iter_tokens() for word in t.words]
    return " ".join(lemmas)

def somajo_tokenizer_multiple_texts(list_of_texts, language):
    somajo_languages = {"de": "de_CMC", "en": "en_PTB"}
    tokenizer = SoMaJo(somajo_languages[language], split_sentences=False)
    list_of_lists_of_tokenized_texts = tokenizer.tokenize_text(list_of_texts, parallel=1)
    print(list_of_lists_of_tokenized_texts)
    list_of_lists_of_tokenized_texts = [[token.text for token in tokens if token.token_class in ["regular", "abbreviation", "number_compound", "number", "URL", "amount", "data", "email_address", "hashtag", "measurement", "mention", "ordinal", "semester", "time"]] for tokens in list_of_lists_of_tokenized_texts]
    # list_of_lists_of_tokenized_texts = [[token.text for token in tokens if token.token_class not in ["symbol"]] for tokens in list_of_lists_of_tokenized_texts]
    return list_of_lists_of_tokenized_texts

In [6]:
df_de = load_policies("de")
df_en = load_policies("en")
df_multi = load_policy_multilingual("ab2e094a92d2bed3188fcb17eb8c7a0baabbc66c")

df_metadata_de: (2652, 33)
Loaded 55 deduplicated policies for language de
df_metadata_en: (3, 33)
Loaded 1 deduplicated policies for language en
df_metadata_multilingual: (1, 33)
Loaded 1 deduplicated policies for multilingual and sha1 ab2e094a92d2bed3188fcb17eb8c7a0baabbc66c


Lemmatize the policies of both languages

In [7]:
df_en["text_lemmatized"] = df_en.apply(lambda x: lemmatize_text(x["text"], "en"), axis=1)

In [8]:
df_en.head()

Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,text,text_lemmatized
0,20283,73529,7_1525_2917_0_1073_28221_1,http://itv-tp.ard.de/backend/public/api/v1/portal/configs.json/privacy_text,ard.de,utf-8,study,4263,application/json; charset=utf-8,200,34.120.66.190,ARD-TEST-1,2023-09-18T01:19:51.275Z,3,False,True,True,True,True,False,False,False,False,f26130e58d4684b357e869667ae48d26fb5bab24,16829530362413899507,"HbbTV ARD Test Portal\n\n\nThe protection of personal data which is collected, processed and used by HbbTV applications is an important concern for the ARD. This document tells you which data the ...","hbbtv ard test portal \n the protection of personal datum which be collect , process and use by hbbtv application be a important concern for the ard . this document tell you which datum the ard te..."


In [9]:
df_de["text_lemmatized"] = df_de.apply(lambda x: lemmatize_text(x["text"], "de"), axis=1)

In [10]:
df_de.head()

Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,text,text_lemmatized
0,10,45022,7_1547_3051_0_1115_13141_1,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2072,text/plain,200,52.174.244.81,AstroTV HD,2023-11-02T14:03:58.445Z,1,False,True,True,True,False,True,False,False,False,30b89b7f86e2bb5271350e4822b3854218032087,17821622590391302043,I. Name und Anschrift des Verantwortlichen\nDer Verantwortliche im Sinne der Datenschutz-Grundverordnung und anderer nationaler Datenschutzgesetze der Mitgliedsstaaten sowie sonstiger datenschutzr...,i. name und anschrift der verantwortlich der verantwortliche in der sinn der datenschutz - grundverordnung und anderer national datenschutzgesetze der mitgliedsstaat sowie sonstig datenschutzrecht...
1,37,48950,7_1411_1574_0_33_661_133,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2079,text/plain,200,52.174.244.81,AstroTV,2023-09-16T20:14:43.209Z,3,False,True,True,True,False,True,False,False,False,a29243ef94d0c3ff76d4dd7b06ad2d978c532cc5,18398083342023604123,I. Name und Anschrift des Verantwortlichen\nDer Verantwortliche im Sinne der Datenschutz-Grundverordnung und anderer nationaler Datenschutzgesetze der Mitgliedsstaaten sowie sonstiger datenschutzr...,i. name und anschrift der verantwortlich der verantwortliche in der sinn der datenschutz - grundverordnung und anderer national datenschutzgesetze der mitgliedsstaat sowie sonstig datenschutzrecht...
2,228,91483,7_1415_1868_0_7_61_133,http://nickelodeon-at.gbucket.at/index.html?nnat-sd-sat,gbucket.at,utf-8,study,20518,application/vnd.hbbtv.xhtml+xml,200,18.155.145.96,Nick/Comedy Central Austria,2023-09-20T14:16:56.671Z,3,True,True,True,False,False,True,False,False,True,6bfe65e1409e274da745de7c286acf4654806cd3,11972179217896911647,E-Mail: oba@at.goldbach.com\nDatenschutzerklärung\nFür die Messung des Nutzungsverhaltens für den Verein Arbeitsgemeinschaft Teletest (kurz AGTT; Details siehe agtt.at/hbb-Messung) wird Ihre Einwi...,e - mail : oooon der datenschutzerklärung für der messung der nutzungsverhaltens für der verein arbeitsgemeinschaft teletest ( kurz agtt ; detail sehen agtt.at/hbb - messung ) werden ihr einwillig...
3,274,69891,7_1411_1570_0_33_32_133,https://hbbtvapp.sonnenklar.tv/sat/index.html,sonnenklar.tv,utf-8,study,5771,application/vnd.hbbtv.xhtml+xml,200,51.77.80.89,Sonnenklar TV,2023-09-17T20:00:22.149Z,3,True,False,False,False,True,False,False,False,False,79c8fdae91cecaac7509db09eb757d178600bc7d,6146571371789340086,sonnenklar.TV 1 Aktuelle Angebote 2 Unsere Topseller 3 Sendung verpasst 4 Reisevideos 5 Favoriten 6 Bestellnummer 7 Reisebüros Ausblenden Info Home\nLi...,sonnenklar.tv 1 aktuell angebot 2 unser topseller 3 sendung verpassen 4 reisevideos 5 favorit 6 bestellnummer 7 reisebüros ausblenden info home live bitte produkt wählen bitte sendetag wählenbitte...
5,301,61151,7_21_770_0_7_71_133,http://tlc-at.gbucket.at/index.html?tlcaustria-sd-sat,gbucket.at,utf-8,study,27427,application/vnd.hbbtv.xhtml+xml,200,54.230.206.96,TLC Austria,2023-10-15T20:04:57.615Z,5,True,True,True,False,False,True,False,False,True,38b3ed78689f9d5cf3ef45b9aff7f5100ff8ad9d,9436366756555465501,E-Mail: oba@at.goldbach.com\nDatenschutzerklärung\nFür die Messung des Nutzungsverhaltens für den Verein Arbeitsgemeinschaft Teletest (kurz AGTT; Details siehe agtt.at/hbb-Messung) wird Ihre Einwi...,e - mail : oooon der datenschutzerklärung für der messung der nutzungsverhaltens für der verein arbeitsgemeinschaft teletest ( kurz agtt ; detail sehen agtt.at/hbb - messung ) werden ihr einwillig...


In [11]:
df_multi["text_lemmatized"] = df_multi.apply(lambda x: lemmatize_text(x["text"], "de"), axis=1)
# df_multi["text_lemmatized_en"] = df_multi.apply(lambda x: lemmatize_text(x["text"], "en"), axis=1) throws error

# Concordance Analysis of "HbbTV"

In [12]:
list_of_hbbtv_keywords = ["HbbTV", "TV", "Fernsehen", "Fernseher"]
list_of_lemmatized_hbbtv_keywords = [lemmatize_text(x, "de") for x in list_of_hbbtv_keywords]
print(f"list_of_lemmatized_hbbtv_keywords: {list_of_hbbtv_keywords}", flush=True)
print(df_de.loc[df_de["text_lemmatized"].str.contains("|".join(list_of_lemmatized_hbbtv_keywords), flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
df_de.loc[df_de["text_lemmatized"].str.contains("|".join(list_of_lemmatized_hbbtv_keywords), flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].head()

list_of_lemmatized_hbbtv_keywords: ['HbbTV', 'TV', 'Fernsehen', 'Fernseher']
(54, 27)


Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,text,text_lemmatized
0,10,45022,7_1547_3051_0_1115_13141_1,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2072,text/plain,200,52.174.244.81,AstroTV HD,2023-11-02T14:03:58.445Z,1,False,True,True,True,False,True,False,False,False,30b89b7f86e2bb5271350e4822b3854218032087,17821622590391302043,I. Name und Anschrift des Verantwortlichen\nDer Verantwortliche im Sinne der Datenschutz-Grundverordnung und anderer nationaler Datenschutzgesetze der Mitgliedsstaaten sowie sonstiger datenschutzr...,i. name und anschrift der verantwortlich der verantwortliche in der sinn der datenschutz - grundverordnung und anderer national datenschutzgesetze der mitgliedsstaat sowie sonstig datenschutzrecht...
1,37,48950,7_1411_1574_0_33_661_133,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2079,text/plain,200,52.174.244.81,AstroTV,2023-09-16T20:14:43.209Z,3,False,True,True,True,False,True,False,False,False,a29243ef94d0c3ff76d4dd7b06ad2d978c532cc5,18398083342023604123,I. Name und Anschrift des Verantwortlichen\nDer Verantwortliche im Sinne der Datenschutz-Grundverordnung und anderer nationaler Datenschutzgesetze der Mitgliedsstaaten sowie sonstiger datenschutzr...,i. name und anschrift der verantwortlich der verantwortliche in der sinn der datenschutz - grundverordnung und anderer national datenschutzgesetze der mitgliedsstaat sowie sonstig datenschutzrecht...
2,228,91483,7_1415_1868_0_7_61_133,http://nickelodeon-at.gbucket.at/index.html?nnat-sd-sat,gbucket.at,utf-8,study,20518,application/vnd.hbbtv.xhtml+xml,200,18.155.145.96,Nick/Comedy Central Austria,2023-09-20T14:16:56.671Z,3,True,True,True,False,False,True,False,False,True,6bfe65e1409e274da745de7c286acf4654806cd3,11972179217896911647,E-Mail: oba@at.goldbach.com\nDatenschutzerklärung\nFür die Messung des Nutzungsverhaltens für den Verein Arbeitsgemeinschaft Teletest (kurz AGTT; Details siehe agtt.at/hbb-Messung) wird Ihre Einwi...,e - mail : oooon der datenschutzerklärung für der messung der nutzungsverhaltens für der verein arbeitsgemeinschaft teletest ( kurz agtt ; detail sehen agtt.at/hbb - messung ) werden ihr einwillig...
3,274,69891,7_1411_1570_0_33_32_133,https://hbbtvapp.sonnenklar.tv/sat/index.html,sonnenklar.tv,utf-8,study,5771,application/vnd.hbbtv.xhtml+xml,200,51.77.80.89,Sonnenklar TV,2023-09-17T20:00:22.149Z,3,True,False,False,False,True,False,False,False,False,79c8fdae91cecaac7509db09eb757d178600bc7d,6146571371789340086,sonnenklar.TV 1 Aktuelle Angebote 2 Unsere Topseller 3 Sendung verpasst 4 Reisevideos 5 Favoriten 6 Bestellnummer 7 Reisebüros Ausblenden Info Home\nLi...,sonnenklar.tv 1 aktuell angebot 2 unser topseller 3 sendung verpassen 4 reisevideos 5 favorit 6 bestellnummer 7 reisebüros ausblenden info home live bitte produkt wählen bitte sendetag wählenbitte...
5,301,61151,7_21_770_0_7_71_133,http://tlc-at.gbucket.at/index.html?tlcaustria-sd-sat,gbucket.at,utf-8,study,27427,application/vnd.hbbtv.xhtml+xml,200,54.230.206.96,TLC Austria,2023-10-15T20:04:57.615Z,5,True,True,True,False,False,True,False,False,True,38b3ed78689f9d5cf3ef45b9aff7f5100ff8ad9d,9436366756555465501,E-Mail: oba@at.goldbach.com\nDatenschutzerklärung\nFür die Messung des Nutzungsverhaltens für den Verein Arbeitsgemeinschaft Teletest (kurz AGTT; Details siehe agtt.at/hbb-Messung) wird Ihre Einwi...,e - mail : oooon der datenschutzerklärung für der messung der nutzungsverhaltens für der verein arbeitsgemeinschaft teletest ( kurz agtt ; detail sehen agtt.at/hbb - messung ) werden ihr einwillig...


In [13]:
print(df_de.loc[df_de["text_lemmatized"].str.contains("hbbtv", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
list_of_policies_containing_hbbtv = df_de.loc[df_de["text_lemmatized"].str.contains("hbbtv", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)]["text_lemmatized"].to_list()
for result in list_of_policies_containing_hbbtv:
    # print(re.findall(r'(?:^|\S+\s+\S+)hbbtv(?:\s*\S+\s+\S+|$)', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))
    print(re.findall(r'.{0,20}hbbtv.{0,20}', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))

(40, 27)
[' zu der nutzung der hbbtv - angebot wenn sie ', ' bereitstellung der hbbtv - angebot über der ', 'ein der nutzung der hbbtv - angebot nicht meh', ' bereitstellung der hbbtv - angebot werden au', 'erung und start der hbbtv - angebot zugriffsd']
[' zu der nutzung der hbbtv - angebot wenn sie ', ' bereitstellung der hbbtv - angebot über der ', 'ein der nutzung der hbbtv - angebot nicht meh', ' bereitstellung der hbbtv - angebot werden au', 'erung und start der hbbtv - angebot zugriffsd']
['mit der nutzung von hbbtv für der datenschutz', 'g basierend auf der hbbtv - standard ein beli', 'ät . wir nutzen der hbbtv - technologie , um ', '" , um werbung über hbbtv ausstrahlen zu könn', 'r alt sein . dieser hbbtv - service werden pf']
[' zu der nutzung der hbbtv - angebot wenn sie ', ' bereitstellung der hbbtv - angebot über der ', 'ein der nutzung der hbbtv - angebot nicht meh', ' bereitstellung der hbbtv - angebot werden au', 'erung und start der hbbtv - angebot zugriffsd']
['mit d

In [21]:
print(df_de.loc[df_de["text_lemmatized"].str.contains("blau taste", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
list_of_policies_containing_hbbtv = df_de.loc[df_de["text_lemmatized"].str.contains("blau taste", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)]["text_lemmatized"].to_list()
for result in list_of_policies_containing_hbbtv:
    # print(re.findall(r'(?:^|\S+\s+\S+)hbbtv(?:\s*\S+\s+\S+|$)', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))
    print(re.findall(r'.{0,100}blau taste.{0,100}', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))

(12, 27)
['. der datenschutzcenter erreichen sie innerhalb unser hbbtv - app jederzeit durch druck|drücken der blau taste auf ihr fernbedienung . anweit hbbtv - app sein von dieser einstellung nicht erfassen . bitten nutz', '. der datenschutzcenter erreichen sie innerhalb unser hbbtv - app jederzeit durch druck|drücken der blau taste auf ihr fernbedienung . anweit hbbtv - app sein von dieser einstellung nicht erfassen . bitten nutz']
[' wahlmöglichkeit und präferenz betreffend cookies und werbung ) vornehmen werden , der sie über der blau taste auf der fernbedienung ihr smart tv - gerät erreichen . der warner bros. discovery - unternehmensfam', ' vorliebe|vorlieben betreffend cookie und werbung ) aufrufen . sie erreichen dieser centre über der blau taste auf der fernbedienung ihr smart-tv - gerät . insbesondere haben sie auch der recht , der verarbeitu', 'ahlmöglichkeit und präferenz betreffend cookie und werbung ) . sie erreichen dieser centre über der blau taste auf der fernbedienung

In [22]:
print(df_de.loc[df_de["text_lemmatized"].str.contains("blaue", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
list_of_policies_containing_hbbtv = df_de.loc[df_de["text_lemmatized"].str.contains("blaue", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)]["text_lemmatized"].to_list()
for result in list_of_policies_containing_hbbtv:
    # print(re.findall(r'(?:^|\S+\s+\S+)hbbtv(?:\s*\S+\s+\S+|$)', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))
    print(re.findall(r'.{0,100}blaue.{0,100}', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))

(7, 27)
['nsatz von cookie zu werbezwecken nicht wünschen , können sie der tracking an dieser stelle über der blaue farbtaste ihr fernbedienung deaktivieren . der aktuell status können sie in der unten angezeigt men']
['nsatz von cookie zu werbezwecken nicht wünschen , können sie der tracking an dieser stelle über der blaue farbtaste ihr fernbedienung deaktivieren . der aktuell status können sie in der unten angezeigt men']
['nsatz von cookie zu werbezwecken nicht wünschen , können sie der tracking an dieser stelle über der blaue farbtaste ihr fernbedienung deaktivieren . der aktuell status können sie in der unten angezeigt men']
['nsatz von cookie zu werbezwecken nicht wünschen , können sie der tracking an dieser stelle über der blaue farbtaste ihr fernbedienung deaktivieren . der aktuell status können sie in der unten angezeigt men']
['nsatz von cookie zu werbezwecken nicht wünschen , können sie der tracking an dieser stelle über der blaue farbtaste ihr fernbedienung deaktivieren . d

In [24]:
print(df_de.loc[df_de["text_lemmatized"].str.contains("datenschutzcenter", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
list_of_policies_containing_hbbtv = df_de.loc[df_de["text_lemmatized"].str.contains("datenschutzcenter", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)]["text_lemmatized"].to_list()
for result in list_of_policies_containing_hbbtv:
    # print(re.findall(r'(?:^|\S+\s+\S+)hbbtv(?:\s*\S+\s+\S+|$)', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))
    print(re.findall(r'.{0,100}datenschutzcenter.{0,100}', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))

(8, 27)
['eben . \\n\\n1.\\teinstellungs - möglichkeit \\nsie können unter der menüpunkt " " einstellung " in der datenschutzcenter ihr präferenz zu der datenverarbeitung treffen . für ihr einstellung werden ein cookie setzen , der', ' der entsprechend status speichern . der datenschutzcenter erreichen sie innerhalb unser hbbtv - app jederzeit durch druck|drücken der blau taste auf ihr fern', 'ungen zu analyse , empfehlung & optimierung können sie unter der menüpunkt " " einstellung " in der datenschutzcenter vornehmen.\\n\\nc ) \\tnutzung der skalierbaren zentral messverfahrens \\nin dieser angebot werden der ', 'stellungen zu der interessenbezogen werbung können sie unter der menüpunkt " " einstellung " in der datenschutzcenter vornehmen . \\n\\n3.\\tverarbeitung von datum , der sie wir über kontaktformulare oder per email zu de', 'tatt .\\n\\neinstellungen zu der connector-id können sie unter der menüpunkt " " einstellung " in der datenschutzcenter vornehmen . \\n\\nstand : juni 202

In [14]:
list_of_tokenized_policies_containing_hbbtv = somajo_tokenizer_multiple_texts(list_of_policies_containing_hbbtv, "de")

<map object at 0x7f5866742080>


In [15]:
conc_results1 = ct.concord(list_of_tokenized_policies_containing_hbbtv,["hbbtv"],nhits = 215)
for x in conc_results1:
	print(x)

Search returned 210 hits.
 Returning all 210 hits
[['192732', 'jennifer', 'reichelt', 'iii', 'allgemein', 'hinweis', 'zu', 'der', 'nutzung', 'der'], 'hbbtv', ['-', 'angebot', 'wenn', 'sie', 'ihr', 'fernsehgerät', 'an', 'der', 'internet', 'anschließen']]
[['ein', 'verbindung', 'zu', 'unser', 'webserver', 'herstellen', 'zu', 'der', 'bereitstellung', 'der'], 'hbbtv', ['-', 'angebot', 'über', 'der', 'internet', 'sein', 'es', 'technisch', 'unerlässlich', 'ihr']]
[['der', 'internetfunktionalität', 'der', 'gerät', 'ausschalten', 'dann', 'sein', 'der', 'nutzung', 'der'], 'hbbtv', ['-', 'angebot', 'nicht', 'mehr', 'möglich', 'wie', 'sie', 'ein', 'solcher', 'schritt']]
[['in', 'der', 'bedienungsanleitung', 'ihr', 'empfänger', 'iv', 'für', 'der', 'bereitstellung', 'der'], 'hbbtv', ['-', 'angebot', 'werden', 'auf', 'folgend', 'dienstleistung', 'zurückgreifen', 'signalisierung', 'und', 'start']]
[['angebot', 'werden', 'auf', 'folgend', 'dienstleistung', 'zurückgreifen', 'signalisierung', 'und', 'st

In [16]:
collocates = ct.collocator(list_of_tokenized_policies_containing_hbbtv,"hbbtv",stat = "LogDice")
ct.head(collocates, hits = 25)

app	12.077802151603633
version	11.338934520193053
websites	11.179821037584812
angebote\	11.179821037584812
anweit	11.179821037584812
\n•\tsystembzw	11.179821037584812
browserinformationen	11.179821037584812
smarttv	11.09310940439148
fernbedienung	11.011315313227835
benötigen	10.977632186971546
zugang	10.89147554322183
erreichen	10.881058927276493
angebot	10.604632480861289
zusammen	10.569547448334468
service	10.566013397920166
innerhalb	10.53647562672882
standard	10.527512228537256
tv	10.467059711627126
-	10.451858752290379
jederzeit	10.313499472816781
nutzung	10.263728176003879
bereitstellung	10.259759273800933
durch	10.208412393800712
um	10.100824369519486
hamburg	10.07518749639422


In [17]:
print(df_en.loc[df_en["text_lemmatized"].str.contains("hbbtv", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
list_of_policies_containing_hbbtv = df_en.loc[df_en["text_lemmatized"].str.contains("hbbtv", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)]["text_lemmatized"].to_list()
for result in list_of_policies_containing_hbbtv:
    # print(re.findall(r'(?:^|\S+\s+\S+)hbbtv(?:\s*\S+\s+\S+|$)', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))
    print(re.findall(r'.{0,20}hbbtv.{0,20}', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))

(1, 27)
['hbbtv ard test portal \\n ', ' process and use by hbbtv application be a im', 'complaint \\n \\n the hbbtv ard test portal be ']


In [18]:
print(df_multi.loc[df_multi["text_lemmatized"].str.contains("hbbtv", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)].shape)
list_of_policies_containing_hbbtv = df_multi.loc[df_multi["text_lemmatized"].str.contains("hbbtv", flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE)]["text_lemmatized"].to_list()
for result in list_of_policies_containing_hbbtv:
    # print(re.findall(r'(?:^|\S+\s+\S+)hbbtv(?:\s*\S+\s+\S+|$)', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))
    print(re.findall(r'.{0,20}hbbtv.{0,20}', result, flags=re.IGNORECASE | re.DOTALL | re.UNICODE | re.MULTILINE))

(1, 27)
[" ) cmpdomain ='pro7.hbbtv ';else if( prod = =", " cmpdomain ='pro7at.hbbtv ';else if( pid = ='", ") cmpdomain = 'rtl2.hbbtv ';else if( pid = ='", "main = 'couchplaytv.hbbtv ';else if( pid = ='"]


## Detect GDPR Keywords

In [19]:
def check_keyword(text, keyword):
  keyword_count = text.count(keyword)
  return keyword_count

In [20]:
list_of_keyphrases_de = ["Verantwortliche",
                         "Datenschutzbeauftragte",
                         "Zweck",
                         "Rechtsgrundlage",
                         "Verarbeitung",
                         "berechtigte Interessen",
                         "Empfänger",
                         "Drittland",
                         "Dauer",
                         "Auskunft",
                         "Berichtigung",
                         "Löschung",
                         "Einschränkung",
                         "Widerspruchsrecht",
                         "Datenübertragbarkeit",
                         "Einwilligung widerrufen",
                         "Beschwerde",
                         "Aufsichtsbehörde",
                         "Vertrag",
                         "gesetzlich vorgeschrieben",
                         "vertraglich vorgeschrieben",
                         "Folgen",
                         "automatisierte Entscheidungsfindung",
                         "Profiling",
                         "Weiterverarbeitung",
                         "Einwilligung",
                         "Erfüllung eines Vertrags",
                         "rechtliche Verpflichtung",
                         "lebenswichtiges Interesse",
                         "öffentliches Interesse",
                         "öffentliche Gewalt",
                         "Behörde"]
list_of_keyphrases_en = ["controller",
                         "data protection officer",
                         "purpose",
                         "legal basis",
                         "processing",
                         "legitimate interests",
                         "recipients",
                         "third country",
                         "period",
                         "access",
                         "rectification",
                         "erasure",
                         "restriction",
                         "object",
                         "data portability",
                         "withdraw consent",
                         "complaint",
                         "supervisory authority",
                         "contract",
                         "statutory requirement",
                         "contractual requirement",
                         "consequences",
                         "automated decisionmaking",
                         "profiling",
                         "further processing",
                         "consent",
                         "performance of a contract",
                         "legal obligation",
                         "vital interest",
                         "public interest",
                         "official authority",
                         "public authority"
                        ]
assert len(list_of_keyphrases_de) == len(list_of_keyphrases_en)
list_of_keyphrases_de = [lemmatize_text(x, "de").lower() for x in list_of_keyphrases_de]
list_of_keyphrases_en = [lemmatize_text(x, "en").lower() for x in list_of_keyphrases_en]
print(f"list_of_keyphrases_de: {list_of_keyphrases_de}")
print(f"list_of_keyphrases_en: {list_of_keyphrases_en}")

list_of_keyphrases_de: ['verantwortliche', 'datenschutzbeauftragte', 'zweck', 'rechtsgrundlage', 'verarbeitung', 'berechtigen interesse', 'empfänger', 'drittland', 'dauer', 'auskunft', 'berichtigung', 'löschung', 'einschränkung', 'widerspruchsrecht', 'datenübertragbarkeit', 'einwilligung widerrufen', 'beschwerde', 'aufsichtsbehörde', 'vertrag', 'gesetzlich vorschreiben', 'vertraglich vorschreiben', 'folge', 'automatisiert entscheidungsfindung', 'profiling', 'weiterverarbeitung', 'einwilligung', 'erfüllung ein vertrag', 'rechtlich verpflichtung', 'lebenswichtig interesse', 'öffentlich interesse', 'öffentlich gewalt', 'behörde']
list_of_keyphrases_en: ['controller', 'datum protection officer', 'purpose', 'legal basis', 'processing', 'legitimate interest', 'recipient', 'third country', 'period', 'access', 'rectification', 'erasure', 'restriction', 'object', 'datum portability', 'withdraw consent', 'complaint', 'supervisory authority', 'contract', 'statutory requirement', 'contractual requ

In [21]:
for keyword in list_of_keyphrases_de:
  df_de[keyword] = df_de.apply(lambda row: check_keyword(row["text_lemmatized"], keyword), axis=1)

In [22]:
for keyword in list_of_keyphrases_en:
  df_en[keyword] = df_en.apply(lambda row: check_keyword(row["text_lemmatized"], keyword), axis=1)

In [23]:
for keyword in list_of_keyphrases_de:
  df_multi[keyword] = df_multi.apply(lambda row: check_keyword(row["text_lemmatized"], keyword), axis=1)
for keyword in list_of_keyphrases_en:
  df_multi[keyword] = df_multi.apply(lambda row: check_keyword(row["text"], keyword), axis=1) #lemmatized not possible due to error. see above.

In [24]:
for keyword in list_of_keyphrases_en:
  df_multi[keyword] = df_multi.apply(lambda row: check_keyword(row["text"], keyword), axis=1)

In [25]:
df_de.drop(["text"], axis=1).to_csv("../results/policies_evaluations_de_gdpr_phrases.csv", sep=";", encoding="utf-8", index=False)
df_de.head()

Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,text,text_lemmatized,verantwortliche,datenschutzbeauftragte,zweck,rechtsgrundlage,verarbeitung,berechtigen interesse,empfänger,drittland,dauer,auskunft,berichtigung,löschung,einschränkung,widerspruchsrecht,datenübertragbarkeit,einwilligung widerrufen,beschwerde,aufsichtsbehörde,vertrag,gesetzlich vorschreiben,vertraglich vorschreiben,folge,automatisiert entscheidungsfindung,profiling,weiterverarbeitung,einwilligung,erfüllung ein vertrag,rechtlich verpflichtung,lebenswichtig interesse,öffentlich interesse,öffentlich gewalt,behörde
0,10,45022,7_1547_3051_0_1115_13141_1,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2072,text/plain,200,52.174.244.81,AstroTV HD,2023-11-02T14:03:58.445Z,1,False,True,True,True,False,True,False,False,False,30b89b7f86e2bb5271350e4822b3854218032087,17821622590391302043,I. Name und Anschrift des Verantwortlichen\nDer Verantwortliche im Sinne der Datenschutz-Grundverordnung und anderer nationaler Datenschutzgesetze der Mitgliedsstaaten sowie sonstiger datenschutzr...,i. name und anschrift der verantwortlich der verantwortliche in der sinn der datenschutz - grundverordnung und anderer national datenschutzgesetze der mitgliedsstaat sowie sonstig datenschutzrecht...,1,0,2,1,10,0,2,0,0,1,1,2,1,0,1,0,1,3,3,0,0,1,0,0,0,3,0,0,0,0,0,3
1,37,48950,7_1411_1574_0_33_661_133,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2079,text/plain,200,52.174.244.81,AstroTV,2023-09-16T20:14:43.209Z,3,False,True,True,True,False,True,False,False,False,a29243ef94d0c3ff76d4dd7b06ad2d978c532cc5,18398083342023604123,I. Name und Anschrift des Verantwortlichen\nDer Verantwortliche im Sinne der Datenschutz-Grundverordnung und anderer nationaler Datenschutzgesetze der Mitgliedsstaaten sowie sonstiger datenschutzr...,i. name und anschrift der verantwortlich der verantwortliche in der sinn der datenschutz - grundverordnung und anderer national datenschutzgesetze der mitgliedsstaat sowie sonstig datenschutzrecht...,1,0,2,1,10,0,2,0,0,1,1,2,1,0,1,0,1,3,3,0,0,1,0,0,0,3,0,0,0,0,0,3
2,228,91483,7_1415_1868_0_7_61_133,http://nickelodeon-at.gbucket.at/index.html?nnat-sd-sat,gbucket.at,utf-8,study,20518,application/vnd.hbbtv.xhtml+xml,200,18.155.145.96,Nick/Comedy Central Austria,2023-09-20T14:16:56.671Z,3,True,True,True,False,False,True,False,False,True,6bfe65e1409e274da745de7c286acf4654806cd3,11972179217896911647,E-Mail: oba@at.goldbach.com\nDatenschutzerklärung\nFür die Messung des Nutzungsverhaltens für den Verein Arbeitsgemeinschaft Teletest (kurz AGTT; Details siehe agtt.at/hbb-Messung) wird Ihre Einwi...,e - mail : oooon der datenschutzerklärung für der messung der nutzungsverhaltens für der verein arbeitsgemeinschaft teletest ( kurz agtt ; detail sehen agtt.at/hbb - messung ) werden ihr einwillig...,1,0,0,0,6,0,3,0,1,1,2,3,2,0,0,0,1,0,0,0,0,1,0,0,0,5,0,0,0,0,0,1
3,274,69891,7_1411_1570_0_33_32_133,https://hbbtvapp.sonnenklar.tv/sat/index.html,sonnenklar.tv,utf-8,study,5771,application/vnd.hbbtv.xhtml+xml,200,51.77.80.89,Sonnenklar TV,2023-09-17T20:00:22.149Z,3,True,False,False,False,True,False,False,False,False,79c8fdae91cecaac7509db09eb757d178600bc7d,6146571371789340086,sonnenklar.TV 1 Aktuelle Angebote 2 Unsere Topseller 3 Sendung verpasst 4 Reisevideos 5 Favoriten 6 Bestellnummer 7 Reisebüros Ausblenden Info Home\nLi...,sonnenklar.tv 1 aktuell angebot 2 unser topseller 3 sendung verpassen 4 reisevideos 5 favorit 6 bestellnummer 7 reisebüros ausblenden info home live bitte produkt wählen bitte sendetag wählenbitte...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
5,301,61151,7_21_770_0_7_71_133,http://tlc-at.gbucket.at/index.html?tlcaustria-sd-sat,gbucket.at,utf-8,study,27427,application/vnd.hbbtv.xhtml+xml,200,54.230.206.96,TLC Austria,2023-10-15T20:04:57.615Z,5,True,True,True,False,False,True,False,False,True,38b3ed78689f9d5cf3ef45b9aff7f5100ff8ad9d,9436366756555465501,E-Mail: oba@at.goldbach.com\nDatenschutzerklärung\nFür die Messung des Nutzungsverhaltens für den Verein Arbeitsgemeinschaft Teletest (kurz AGTT; Details siehe agtt.at/hbb-Messung) wird Ihre Einwi...,e - mail : oooon der datenschutzerklärung für der messung der nutzungsverhaltens für der verein arbeitsgemeinschaft teletest ( kurz agtt ; detail sehen agtt.at/hbb - messung ) werden ihr einwillig...,2,0,2,0,7,0,3,0,2,1,2,3,2,0,0,0,2,0,0,0,0,2,0,0,0,12,0,0,0,0,0,1


In [26]:
df_en.drop(["text"], axis=1).to_csv("../results/policies_evaluations_en_gdpr_phrases.csv", sep=";", encoding="utf-8", index=False)
df_en.head()

Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,text,text_lemmatized,controller,datum protection officer,purpose,legal basis,processing,legitimate interest,recipient,third country,period,access,rectification,erasure,restriction,object,datum portability,withdraw consent,complaint,supervisory authority,contract,statutory requirement,contractual requirement,consequence,automated decisionmaking,profiling,further processing,consent,performance of a contract,legal obligation,vital interest,public interest,official authority,public authority
0,20283,73529,7_1525_2917_0_1073_28221_1,http://itv-tp.ard.de/backend/public/api/v1/portal/configs.json/privacy_text,ard.de,utf-8,study,4263,application/json; charset=utf-8,200,34.120.66.190,ARD-TEST-1,2023-09-18T01:19:51.275Z,3,False,True,True,True,True,False,False,False,False,f26130e58d4684b357e869667ae48d26fb5bab24,16829530362413899507,"HbbTV ARD Test Portal\n\n\nThe protection of personal data which is collected, processed and used by HbbTV applications is an important concern for the ARD. This document tells you which data the ...","hbbtv ard test portal \n the protection of personal datum which be collect , process and use by hbbtv application be a important concern for the ard . this document tell you which datum the ard te...",0,0,1,0,1,0,0,0,0,8,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
df_multi.drop(["text"], axis=1).to_csv("../results/policies_evaluations_multilingual_gdpr_phrases.csv", sep=";", encoding="utf-8", index=False)
df_multi.head()

Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,text,text_lemmatized,verantwortliche,datenschutzbeauftragte,zweck,rechtsgrundlage,verarbeitung,berechtigen interesse,empfänger,drittland,dauer,auskunft,berichtigung,löschung,einschränkung,widerspruchsrecht,datenübertragbarkeit,einwilligung widerrufen,beschwerde,aufsichtsbehörde,vertrag,gesetzlich vorschreiben,vertraglich vorschreiben,folge,automatisiert entscheidungsfindung,profiling,weiterverarbeitung,einwilligung,erfüllung ein vertrag,rechtlich verpflichtung,lebenswichtig interesse,öffentlich interesse,öffentlich gewalt,behörde,controller,datum protection officer,purpose,legal basis,processing,legitimate interest,recipient,third country,period,access,rectification,erasure,restriction,object,datum portability,withdraw consent,complaint,supervisory authority,contract,statutory requirement,contractual requirement,consequence,automated decisionmaking,further processing,consent,performance of a contract,legal obligation,vital interest,public interest,official authority,public authority
0,45324,54955,7_1543_3183_0_1107_17509_1,http://pro7.gofresh.tv/pro7/js/sibbo-cmp-couchplay-samsung-tv-en-config-tv_v31.js,gofresh.tv,utf-8,study,15506,application/x-javascript; charset=utf-8,200,195.24.107.225,kabel eins Doku,2023-09-17T03:42:58.001Z,3,True,False,False,False,False,True,False,False,False,ab2e094a92d2bed3188fcb17eb8c7a0baabbc66c,6337248614974323047,"document.addEventListener(""DOMContentLoaded"",function() {if(typeof Promise=='undefined') {ErrorReports.log('sibbo_config.js',7,'Promise not set: '+typeof SibboCMP);} else if(typeof SibboCMP!=='und...","document.addeventlistener( "" domcontentloaded "" , function ( ) { if( typeof promise = = 'undefined ' ) { errorreports.log ( ' sibbo_config.js ',7,'promise not set : ' +typeof sibbocmp ) ; } else i...",1,2,8,0,8,0,2,0,1,4,2,4,0,0,0,0,3,4,2,0,0,6,0,0,0,6,1,0,0,0,0,4,5,0,24,1,28,5,2,0,3,5,1,0,6,3,0,0,4,0,11,0,0,0,0,0,29,0,0,0,1,0,0


In [28]:
df_de[list_of_keyphrases_de].describe()

Unnamed: 0,verantwortliche,datenschutzbeauftragte,zweck,rechtsgrundlage,verarbeitung,berechtigen interesse,empfänger,drittland,dauer,auskunft,berichtigung,löschung,einschränkung,widerspruchsrecht,datenübertragbarkeit,einwilligung widerrufen,beschwerde,aufsichtsbehörde,vertrag,gesetzlich vorschreiben,vertraglich vorschreiben,folge,automatisiert entscheidungsfindung,profiling,weiterverarbeitung,einwilligung,erfüllung ein vertrag,rechtlich verpflichtung,lebenswichtig interesse,öffentlich interesse,öffentlich gewalt,behörde
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,2.436364,2.145455,7.763636,3.418182,18.8,0.181818,1.218182,0.036364,3.345455,1.363636,1.090909,2.327273,1.036364,0.454545,0.163636,0.0,1.163636,0.527273,11.072727,0.072727,0.0,12.781818,0.054545,0.018182,0.0,11.618182,0.0,0.018182,0.018182,0.0,0.0,1.309091
std,4.054095,3.571586,12.6432,5.573386,29.35315,0.389249,1.272078,0.188919,4.097055,1.994099,0.866511,2.687795,0.942095,1.06837,0.373355,0.0,1.014043,0.997303,24.992484,0.262082,0.0,22.080401,0.229184,0.13484,0.0,16.532225,0.0,0.13484,0.13484,0.0,0.0,1.317584
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,2.0,0.0,7.0,0.0,1.0,0.0,2.0,1.0,1.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,2.0,4.0,9.0,4.0,12.0,0.0,2.0,0.0,4.0,1.0,2.0,3.0,2.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,11.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,3.0
max,12.0,10.0,37.0,16.0,88.0,1.0,3.0,1.0,12.0,6.0,3.0,8.0,3.0,3.0,1.0,0.0,3.0,3.0,71.0,1.0,0.0,65.0,1.0,1.0,0.0,50.0,0.0,1.0,1.0,0.0,0.0,4.0


In [29]:
df_en[list_of_keyphrases_en].describe()

Unnamed: 0,controller,datum protection officer,purpose,legal basis,processing,legitimate interest,recipient,third country,period,access,rectification,erasure,restriction,object,datum portability,withdraw consent,complaint,supervisory authority,contract,statutory requirement,contractual requirement,consequence,automated decisionmaking,profiling,further processing,consent,performance of a contract,legal obligation,vital interest,public interest,official authority,public authority
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
print('Coverage')
print('\n')
print(df_de[df_de[list_of_keyphrases_de] > 0.0].count()[list_of_keyphrases_de])
print('\n')
print('Occurences')
print('\n')
print(df_de[df_de[list_of_keyphrases_de] > 0.0].sum()[list_of_keyphrases_de])

Coverage


verantwortliche                       34
datenschutzbeauftragte                19
zweck                                 37
rechtsgrundlage                       26
verarbeitung                          44
berechtigen interesse                 10
empfänger                             28
drittland                              2
dauer                                 42
auskunft                              34
berichtigung                          38
löschung                              33
einschränkung                         33
widerspruchsrecht                      9
datenübertragbarkeit                   9
einwilligung widerrufen                0
beschwerde                            36
aufsichtsbehörde                      13
vertrag                               23
gesetzlich vorschreiben                4
vertraglich vorschreiben               0
folge                                 50
automatisiert entscheidungsfindung     3
profiling                              1
weite

In [31]:
print('Coverage')
print('\n')
print(df_en[df_en[list_of_keyphrases_en] > 0.0].count()[list_of_keyphrases_en])
print('\n')
print('Occurences')
print('\n')
print(df_en[df_en[list_of_keyphrases_en] > 0.0].sum()[list_of_keyphrases_en])

Coverage


controller                   0
datum protection officer     0
purpose                      1
legal basis                  0
processing                   1
legitimate interest          0
recipient                    0
third country                0
period                       0
access                       1
rectification                0
erasure                      0
restriction                  0
object                       0
datum portability            0
withdraw consent             0
complaint                    1
supervisory authority        0
contract                     0
statutory requirement        0
contractual requirement      0
consequence                  0
automated decisionmaking     0
profiling                    0
further processing           0
consent                      0
performance of a contract    0
legal obligation             0
vital interest               0
public interest              0
official authority           0
public authority            

In [32]:
print('Coverage')
print('\n')
print(df_multi[df_multi[list_of_keyphrases_de] > 0.0].count()[list_of_keyphrases_de])
print('\n')
print('Occurences')
print('\n')
print(df_multi[df_multi[list_of_keyphrases_de] > 0.0].sum()[list_of_keyphrases_de])

Coverage


verantwortliche                       1
datenschutzbeauftragte                1
zweck                                 1
rechtsgrundlage                       0
verarbeitung                          1
berechtigen interesse                 0
empfänger                             1
drittland                             0
dauer                                 1
auskunft                              1
berichtigung                          1
löschung                              1
einschränkung                         0
widerspruchsrecht                     0
datenübertragbarkeit                  0
einwilligung widerrufen               0
beschwerde                            1
aufsichtsbehörde                      1
vertrag                               1
gesetzlich vorschreiben               0
vertraglich vorschreiben              0
folge                                 1
automatisiert entscheidungsfindung    0
profiling                             0
weiterverarbeitung           

### German

In [33]:
language = "de"
measurement = "study"
df_de_original = pd.read_csv("../results/policies_evaluations_" + language + "_" + measurement + ".csv", sep=";", encoding="utf-8")
print(df_de_original.shape)
df_de_corrected_labels = pd.read_csv("../results/policies_evaluations_" + language + "_" + measurement + "_FN_labels_corrected.csv", sep=";", encoding="utf-8")
print(df_de_corrected_labels.shape)
df_de = pd.concat([df_de_original, df_de_corrected_labels], ignore_index=True)
df_de.to_csv("../results/policies_evaluations_" + language + "_" + measurement + "_merged.csv", sep=";", encoding="utf-8", index=False)
df_de.shape

(2479, 49)
(173, 49)


(2652, 49)

In [34]:
print(df_de.drop_duplicates(subset=["sha1", "channel"]).shape)
df_de.drop_duplicates(subset=["sha1", "channel"], inplace=True)
df_de.head()

(55, 49)


Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,First_party_collection_or_use,Third_party_collection_or_use,Information_type,Purpose,Collection_process,Legal_basis_for_collection,Third_party_entity,Purpose_Essential_service_or_feature,Purpose_Advertising_or_marketing,Purpose_Analytics_or_research,Purpose_Service_operation_and_security,Purpose_Legal_requirement,Information_type_Financial,Collection_Process_Shared_by_first_party_with_a_third_party,Information_type_Contact_information,Information_type_Location,Information_type_Demographic_data,Information_type_User_online_activities,Information_type_IP_address_and_device_IDs,Information_type_Cookies_and_tracking_elements,Information_type_Computer_information,Information_type_Generic_personal_information,Collection_Process_Collected_on_first_party_website_app,Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party
0,10,45022,7_1547_3051_0_1115_13141_1,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2072,text/plain,200,52.174.244.81,AstroTV HD,2023-11-02T14:03:58.445Z,1,False,True,True,True,False,True,False,False,False,30b89b7f86e2bb5271350e4822b3854218032087,17821622590391302043,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
1,37,48950,7_1411_1574_0_33_661_133,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2079,text/plain,200,52.174.244.81,AstroTV,2023-09-16T20:14:43.209Z,3,False,True,True,True,False,True,False,False,False,a29243ef94d0c3ff76d4dd7b06ad2d978c532cc5,18398083342023604123,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
2,313,49632,7_28_890_0_1115_13141_1,http://hbbtv.bmt-technik.de/portal/astrotv/privacy_astrotv.txt,bmt-technik.de,utf-8,study,2079,text/plain,200,52.174.244.81,AstroTV HD,2023-09-16T21:10:40.418Z,3,False,True,True,True,False,True,False,False,False,a29243ef94d0c3ff76d4dd7b06ad2d978c532cc5,18398083342023604123,6,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0
4,1741,38949,7_22_782_0_1089_12030_1,http://p-cdn.superrtl.de/ct/,superrtl.de,utf-8,study,10013,text/html,200,2.21.228.121,TOGGO plus,2023-09-25T21:40:28.880Z,4,True,True,True,False,True,False,False,False,True,0651f08b73724ab23bf72728c4ba465e5ae43607,5741653445313610255,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1743,10698,7_23_791_0_1091_28815_1,http://p-cdn.superrtl.de/ct/?cc=at,superrtl.de,utf-8,study,10061,text/html,200,184.24.77.144,SUPER RTL A,2023-10-31T19:36:13.386Z,1,True,True,True,False,True,False,False,False,True,0651f08b73724ab23bf72728c4ba465e5ae43607,5741653445313610255,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
data_practice_columns = {"First_party_collection_or_use",
                            "Third_party_collection_or_use",
                        	"Information_type",
                            "Purpose",
                            "Collection_process",
                            "Legal_basis_for_collection",
                        	"Third_party_entity",
                            "Purpose_Essential_service_or_feature",
                        	"Purpose_Advertising_or_marketing",
                            "Purpose_Analytics_or_research",
                            "Purpose_Service_operation_and_security",
                            "Purpose_Legal_requirement",
                            "Information_type_Financial",
                            "Collection_Process_Shared_by_first_party_with_a_third_party",
                            "Information_type_Contact_information",
                            "Information_type_Location",
                            "Information_type_Demographic_data",
                            "Information_type_User_online_activities",
                            "Information_type_IP_address_and_device_IDs",
                            "Information_type_Cookies_and_tracking_elements",
                            "Information_type_Computer_information",
                            "Information_type_Generic_personal_information",
                            "Collection_Process_Collected_on_first_party_website_app",
                            "Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party"
                        }

In [36]:
df_de[data_practice_columns].describe()

Unnamed: 0,Third_party_collection_or_use,Information_type_Cookies_and_tracking_elements,Collection_Process_Shared_by_first_party_with_a_third_party,Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party,Information_type_Computer_information,Purpose_Advertising_or_marketing,Information_type_Generic_personal_information,Information_type_IP_address_and_device_IDs,Purpose_Essential_service_or_feature,Information_type_Contact_information,Collection_Process_Collected_on_first_party_website_app,Purpose_Service_operation_and_security,Information_type,Purpose_Analytics_or_research,Third_party_entity,Information_type_User_online_activities,Information_type_Demographic_data,Collection_process,Information_type_Financial,First_party_collection_or_use,Purpose_Legal_requirement,Information_type_Location,Legal_basis_for_collection,Purpose
count,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0,55.0
mean,1.636364,0.436364,0.018182,0.072727,0.381818,0.6,0.345455,1.363636,0.127273,0.072727,0.018182,0.036364,3.254545,1.163636,0.745455,0.218182,0.0,1.2,0.0,6.781818,0.018182,0.345455,1.4,4.454545
std,2.548519,1.032143,0.13484,0.262082,0.706869,0.934919,0.725672,2.414184,0.33635,0.262082,0.13484,0.26968,3.104466,1.330046,1.897544,0.497807,0.0,1.98513,0.0,6.7924,0.13484,0.798568,2.909754,5.62013
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,2.0
75%,2.0,1.0,0.0,0.0,1.0,1.0,0.5,1.0,0.0,0.0,0.0,0.0,4.5,1.0,1.0,0.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,5.0
max,14.0,7.0,1.0,1.0,4.0,4.0,4.0,8.0,1.0,1.0,1.0,2.0,18.0,7.0,13.0,2.0,0.0,10.0,0.0,34.0,1.0,3.0,10.0,20.0


**Count of values greater than 0 per category**

In [37]:
print('Coverage')
print('\n')
print(df_de[df_de[data_practice_columns] > 0].count()[data_practice_columns])
print('\n')
print('Occurences')
print('\n')
print(df_de[df_de[data_practice_columns] > 0].sum()[data_practice_columns])

Coverage


Third_party_collection_or_use                                              28
Information_type_Cookies_and_tracking_elements                             17
Collection_Process_Shared_by_first_party_with_a_third_party                 1
Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party     4
Information_type_Computer_information                                      17
Purpose_Advertising_or_marketing                                           22
Information_type_Generic_personal_information                              14
Information_type_IP_address_and_device_IDs                                 21
Purpose_Essential_service_or_feature                                        7
Information_type_Contact_information                                        4
Collection_Process_Collected_on_first_party_website_app                     1
Purpose_Service_operation_and_security                                      1
Information_type                                     

# English

In [38]:
language = "en"
measurement = "study"
df_en = pd.read_csv("../results/policies_evaluations_" + language + "_" + measurement + ".csv", sep=";", encoding="utf-8")
df_en.shape

(3, 48)

In [39]:
data_practice_columns = {"First_party_collection_or_use",
                            "Third_party_collection_or_use",
                        	"Information_type",
                            "Purpose",
                            "Collection_process",
                            "Legal_basis_for_collection",
                        	"Third_party_entity",
                            "Purpose_Essential_service_or_feature",
                        	"Purpose_Advertising_or_marketing",
                            "Purpose_Analytics_or_research",
                            "Purpose_Service_operation_and_security",
                            "Purpose_Legal_requirement",
                            "Information_type_Financial",
                            "Collection_Process_Shared_by_first_party_with_a_third_party",
                            "Information_type_Contact_information",
                            "Information_type_Location",
                            "Information_type_Demographic_data",
                            "Information_type_User_online_activities",
                            "Information_type_IP_address_and_device_IDs",
                            "Information_type_Cookies_and_tracking_elements",
                            "Information_type_Generic_personal_information",
                            "Collection_Process_Collected_on_first_party_website_app",
                            "Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party"
                        }

In [40]:
print(df_en.drop_duplicates(subset=["sha1", "channel"]).shape)
df_en.drop_duplicates(subset=["sha1", "channel"], inplace=True)
df_en.head()

(1, 48)


Unnamed: 0,text_id,request_id,channel_id,url,policy_domain,html_encoding,crawl,size,type,status,ip_address,channel,time_stamp,scan_profile,cb_candidate,cb_candidate2,pp_candidate,pp_candidate_strict,is_first_party,is_third_party,is_iptv,legal_candidate,contact_candidate,sha1,simhash,First_party_collection_or_use,Third_party_collection_or_use,Information_type,Purpose,Collection_process,Legal_basis_for_collection,Third_party_entity,Purpose_Advertising_or_marketing,Purpose_Analytics_or_research,Purpose_Service_operation_and_security,Information_type_Contact_information,Information_type_Location,Information_type_Demographic_data,Information_type_Generic_personal_information,Purpose_Essential_service_or_feature,Purpose_Legal_requirement,Collection_Process_Collected_on_first_party_website_app,Collection_Process_Shared_by_first_party_with_a_third_party,Information_type_Financial,Information_type_User_online_activities,Information_type_IP_address_and_device_IDs,Information_type_Cookies_and_tracking_elements,Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party
0,20283,73529,7_1525_2917_0_1073_28221_1,http://itv-tp.ard.de/backend/public/api/v1/portal/configs.json/privacy_text,ard.de,utf-8,study,4263,application/json; charset=utf-8,200,34.120.66.190,ARD-TEST-1,2023-09-18T01:19:51.275Z,3,False,True,True,True,True,False,False,False,False,f26130e58d4684b357e869667ae48d26fb5bab24,16829530362413899507,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0


In [41]:
df_en[data_practice_columns].describe()

Unnamed: 0,Third_party_collection_or_use,Information_type_Cookies_and_tracking_elements,Collection_Process_Shared_by_first_party_with_a_third_party,Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party,Purpose_Advertising_or_marketing,Information_type_Generic_personal_information,Information_type_IP_address_and_device_IDs,Purpose_Essential_service_or_feature,Information_type_Contact_information,Collection_Process_Collected_on_first_party_website_app,Purpose_Service_operation_and_security,Information_type,Purpose_Analytics_or_research,Third_party_entity,Information_type_User_online_activities,Information_type_Demographic_data,Collection_process,Information_type_Financial,First_party_collection_or_use,Purpose_Legal_requirement,Information_type_Location,Legal_basis_for_collection,Purpose
count,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
std,,,,,,,,,,,,,,,,,,,,,,,
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
print('Coverage')
print('\n')
print(df_en[df_en[data_practice_columns] > 0].count()[data_practice_columns])
print('\n')
print('Occurences')
print('\n')
print(df_en[df_en[data_practice_columns] > 0].sum()[data_practice_columns])

Coverage


Third_party_collection_or_use                                              0
Information_type_Cookies_and_tracking_elements                             0
Collection_Process_Shared_by_first_party_with_a_third_party                0
Legal_basis_for_collection_Legitimate_interests_of_first_or_third_party    0
Purpose_Advertising_or_marketing                                           0
Information_type_Generic_personal_information                              0
Information_type_IP_address_and_device_IDs                                 1
Purpose_Essential_service_or_feature                                       0
Information_type_Contact_information                                       0
Collection_Process_Collected_on_first_party_website_app                    1
Purpose_Service_operation_and_security                                     0
Information_type                                                           1
Purpose_Analytics_or_research                                    