In [1]:
import pandas as pd
import os
import numpy as np
import utils
import re

In [2]:
# Lookup from numeric category code (1-9) to its German category label.
categories_key = dict(
    enumerate(
        [
            "Gewerkschaftszugehörigkeit",
            "rassische und ethnische Herkunft",
            "Gesundheitsdaten",
            "Sexualleben/sexuelle Orientierung",
            "religiöse oder weltanschauliche Überzeugungen",
            "Kinder",
            "Finanzstatus",
            "persönliche Schwächen",
            "politische Meinungen",
        ],
        start=1,
    )
)

In [3]:
# Load the hand-filtered list of sensitive EU segments from CSV.
df_hand_filtered = pd.read_csv("sensitive_eu_segments_hand_filtered.csv")

# Join hand-filtered segments with original dataset

In [4]:
# Load the original Xandr segment list and give its columns short snake_case names.
df_orig = pd.read_csv("xandr_segments.csv").rename(
    columns={
        "Segment Name": "segment_name",
        "Data Provider Name": "provider_name",
        "Data Provider ID": "provider_id",
        "Segment ID": "id",
    }
)

# set index for both datasets; required for join
df_orig = df_orig.set_index("id")

# df_hand_filtered is created in a different cell, so it survives a re-run of
# this one. Without the guard a second execution raises KeyError because "id"
# has already been moved from the columns into the index.
if df_hand_filtered.index.name != "id":
    df_hand_filtered = df_hand_filtered.set_index("id")

In [5]:
# reorganize dataframe: columns to keep, in output order
columns = [
    'category', 'country', 'segment_name', 'hit',
    'provider_name', 'provider_id', 'name_processed',
]

# Inner join on the shared index (only ids present in both frames survive),
# then rename, sort and restrict to the selected columns in a single chain.
df_full = (
    pd.concat([df_orig, df_hand_filtered], axis=1, join="inner")
    .rename(columns={"name": "name_processed", "main_category": "category", "id": "segment_id"})
    .sort_values(by=["category", "country", "name_processed"])
)[columns]



# Filter contextual segments

Since contextual advertising is far less intrusive, we decided to remove the segments of some of the better-known providers of contextual advertising. This list is not exhaustive, and some contextual segments will most certainly remain in the dataset.

In [6]:
# Pattern built from the contextual-provider names; presumably an alternation of
# the three names -- see utils.list_to_regex for the exact form.
contextual_names = utils.list_to_regex(["grapeshot", "peer39", "emetriq"])


def filter_contextual(row):
    """Return True when a row looks like a contextual-advertising segment.

    A row qualifies when `contextual_names` is found anywhere in its
    "provider_name", or matches at the start of its "name_processed".

    NOTE(review): the provider name is checked with re.search (any position)
    while the processed name is checked with re.match (start only) -- confirm
    that this asymmetry is intentional.
    """
    if re.search(contextual_names, row["provider_name"]):
        return True
    return re.match(contextual_names, row["name_processed"]) is not None


old_len = len(df_full)
# Boolean mask of the rows to keep (everything that is NOT contextual).
keep_mask = df_full.apply(lambda row: not filter_contextual(row), axis=1)
df_full = df_full[keep_mask]
print(f"dataset size before: {old_len}, after: {len(df_full)}")

dataset size before: 2045, after: 1900


# Category frequency

In [7]:
# Number of segments in df_full per category, most frequent first.
df_full["category"].value_counts()

category
Finanzstatus                                     940
Kinder                                           793
politische Meinungen                              84
Gesundheitsdaten                                  54
religiöse oder weltanschauliche Überzeugungen     17
persönliche Schwächen                             11
Sexualleben/sexuelle Orientierung                  1
Name: count, dtype: int64

# Country frequency

In [8]:
# Number of segments in df_full per country, most frequent first.
df_full["country"].value_counts()

country
netherlands    376
germany        331
spain          317
france         290
sweden         183
italy          135
denmark        133
finland        104
greece          10
austria          8
belgium          5
portugal         4
croatia          2
poland           1
slovakia         1
Name: count, dtype: int64

# Data provider frequency

In [9]:
# Number of segments in df_full per data provider, most frequent first.
df_full.value_counts("provider_name")

provider_name
Audiences by Oracle (BlueKai, Datalogix, AddThis)    438
Eyeota                                               324
GroupM NL - GH 6924                                  235
ZeoTap                                               221
Adsquare (Data Provider)                             202
Lotame                                               124
Greenhouse Group B.V.                                 78
digitalAudience                                       70
Nielsen Marketing Cloud                               64
KBM Group                                             58
Audiens S.R.L.                                        19
Semasio GmbH (Data Provider)                          19
The Weather Channel, LLC                               9
The ADEX GmbH                                          8
LiveRamp Data Store                                    7
Tru Optik (CTV)                                        7
Weborama SA (Data Provider)                            6
Datmean (Data Pro

# Sort segments by country, broker and category

This was useful for some minor analyses

In [10]:
# Write one CSV per country into `dirname_country`.
dirname_country = "eu_segments_by_country"
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists; otherwise a fresh checkout fails on the first write.
os.makedirs(dirname_country, exist_ok=True)

df_grouped_country = df_full.groupby("country")

# Iterating a GroupBy yields (group key, sub-frame) pairs directly.
for country, country_segments in df_grouped_country:
    country_segments.to_csv(os.path.join(dirname_country, f"{country}.csv"))

In [11]:
# Write one CSV per data provider; providers with few segments are pooled
# into a single "other.csv".
dirname_broker = "eu_segments_by_broker"
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists; otherwise a fresh checkout fails on the first write.
os.makedirs(dirname_broker, exist_ok=True)

# Providers with fewer segments than this go into "other.csv".
len_threshold = 10

df_grouped_broker = df_full.groupby("provider_name")

# Collect the small groups and concatenate them once after the loop instead
# of growing a DataFrame with pd.concat on every iteration.
small_groups = []

for provider, provider_segments in df_grouped_broker:
    if len(provider_segments) < len_threshold:
        small_groups.append(provider_segments)
    else:
        # File-name friendly provider name: commas and periods removed,
        # spaces replaced by underscores.
        file_stem = provider.replace(",", "").replace(".", "").replace(" ", "_")
        provider_segments[["provider_name", "provider_id", "segment_name", "country"]].to_csv(
            os.path.join(dirname_broker, f"{file_stem}.csv")
        )

if small_groups:
    df_other = pd.concat(small_groups)
else:
    # No provider fell below the threshold: still emit a header-only file.
    df_other = pd.DataFrame(columns=df_full.columns)

df_other.index = df_other.index.set_names("id")
df_other.to_csv(os.path.join(dirname_broker, "other.csv"))

In [12]:
# Write one CSV per category into `dirname_category`.
# The names say "category" rather than re-using the "country" variables, so
# this cell no longer overwrites the per-country directory name and grouping.
dirname_category = "eu_segments_by_category"
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists; otherwise a fresh checkout fails on the first write.
os.makedirs(dirname_category, exist_ok=True)

df_grouped_category = df_full.groupby("category")

for category_name, category_segments in df_grouped_category:
    # Category labels contain spaces and "/", neither of which is wanted in a
    # file name; both are replaced by underscores.
    filename = f"{re.sub(r'[ /]', '_', category_name)}.csv"
    print(filename)
    category_segments.to_csv(os.path.join(dirname_category, filename))

Finanzstatus.csv
Gesundheitsdaten.csv
Kinder.csv
Sexualleben_sexuelle_Orientierung.csv
persönliche_Schwächen.csv
politische_Meinungen.csv
religiöse_oder_weltanschauliche_Überzeugungen.csv


# Create csv files to be read by DataWrapper

In [13]:
# Country table indexed by "name", with a "hits" column initialised to NaN
# that is filled in per category later on.
eu_countries = (
    pd.read_json("eu_countries.json")
    .set_index("name")
    .assign(hits=np.nan)
)

In [14]:
# Build one DataWrapper CSV per category: a row per country with the number
# of hits and, where available, highlighted segment paths.
highlights = pd.read_json("filet.json")

datawrapper_dir = "datawrapper_files"
# DataFrame.to_csv does not create missing directories, so make sure the
# target exists; otherwise a fresh checkout fails on the first write.
os.makedirs(datawrapper_dir, exist_ok=True)

for category, data in df_full.groupby("category"):
    # Fresh per-category copy of the country table ("name" becomes a column again).
    category_df = eu_countries[["hits", "name_de"]].reset_index()

    for countryname, count in data["country"].value_counts().items():
        is_country = category_df["name"] == countryname
        category_df.loc[is_country, "hits"] = count

        is_highlight = (highlights["category"] == category) & (highlights["country"] == countryname)
        for highlight_idx, highlight in enumerate(highlights[is_highlight]["filet"].values):
            # Each "filet" entry is joined with " > "; presumably a list of
            # path components -- TODO confirm against filet.json.
            category_df.loc[is_country, f"highlight_{highlight_idx}"] = " > ".join(highlight)

    # Keep the loop variable intact and use a separate file-name safe label.
    category_label = re.sub(r'[/ ]', r"_", category)

    category_df.loc[category_df["name"] == "czechia", "name"] = "czech republic"
    # NOTE(review): the last row of the country table is dropped unconditionally;
    # presumably it is not a real country entry -- confirm against eu_countries.json.
    category_df = category_df.drop(len(category_df) - 1).set_index("name", drop=True)

    n_hits = category_df["hits"].sum()
    n_countries = len(category_df[category_df["hits"] > 0])
    print(f"{category_label}: {int(n_hits)} hits in {n_countries} countries.")
    category_df.to_csv(os.path.join(datawrapper_dir, f"{category_label}.csv"))


Finanzstatus: 940 hits in 9 countries.
Gesundheitsdaten: 54 hits in 8 countries.
Kinder: 793 hits in 15 countries.
Sexualleben_sexuelle_Orientierung: 1 hits in 1 countries.
persönliche_Schwächen: 11 hits in 5 countries.
politische_Meinungen: 84 hits in 7 countries.
religiöse_oder_weltanschauliche_Überzeugungen: 17 hits in 6 countries.
