In [12]:
import csv
import json
import numpy as np
import pandas as pd
import re
import utils
from collections import OrderedDict, Counter


In [13]:
# Segment table used by every later cell; those cells read its "name",
# "name_list" and "provider_name" columns.
df_orig = pd.read_json("xandr_segments_itemized.json") # load data

In [14]:
# Lookup table with one row per country; the columns used below are
# "country", "ethnicity" and "code".
eu_countries = pd.read_csv("eu_countries.csv")

# Country codes, kept as a set for exact membership tests.
eu_codes = set(eu_countries["code"])

# Regex alternation over every country name and ethnicity label, taken row by
# row (country, ethnicity, country, ethnicity, ...).
eu_name_terms = eu_countries[["country", "ethnicity"]].values.ravel()
eu_names_re = "|".join(eu_name_terms)

In [15]:
# Count how often each individual name_list entry occurs across all segments
# and write the counts to tags.json, most frequent first.
all_tags = np.concatenate(df_orig["name_list"].values)
tag_count = Counter(all_tags)
tags_by_frequency = OrderedDict(tag_count.most_common())
with open("tags.json", "w") as f:
    json.dump(tags_by_frequency, f, indent=4)

In [16]:
# Substrings that mark a segment as travel-related. They are matched anywhere
# in the segment name, so "visit" also matches "visitor", for example.
travel_word_list = [
    "travel",
    "departure",
    "destination",
    "tourism",
    "tourist",
    "vacation",
    "holiday",
    "voyage",
    "expedia",
    "visit",
]
travel_word_re = "|".join(travel_word_list)

# Keep only the segments whose name contains none of the travel words.
is_travel_free = df_orig.apply(
    lambda segment: re.search(travel_word_re, segment["name"]) is None,
    axis=1,
)
filtered_travel_words = df_orig[is_travel_free]

In [16]:
us_re = "united states"


def filter_us_names(row):
    """Return True when a segment row looks US-related.

    A row matches when ``row["name"]`` contains the text "united states", or
    when any entry of ``row["name_list"]`` is exactly "usa", starts with
    "us " or ends with " us". A bare "us" entry on its own does not match.
    """
    if re.search(us_re, row["name"]):
        return True
    return any(
        item == "usa" or item.startswith("us ") or item.endswith(" us")
        for item in row["name_list"]
    )

# Row-wise mask of travel-free segments that filter_us_names accepts.
us_row_mask = filtered_travel_words.apply(filter_us_names, axis=1)
filtered_us = filtered_travel_words[us_row_mask]

In [17]:
# Write the US-matching, travel-free segments to disk.
filtered_us.to_csv("filtered_us.csv")

In [17]:
def filter_eu_names(row):
    """Return True when a segment row refers to one of the EU entries.

    A row matches when ``eu_names_re`` (country names and ethnicity labels)
    is found anywhere in ``row["name"]``, or when at least one entry of
    ``row["name_list"]`` is exactly equal to a code in ``eu_codes``.
    """
    if re.search(eu_names_re, row["name"]):
        return True
    # Codes must equal a whole name_list entry; substrings do not count.
    return not eu_codes.isdisjoint(row["name_list"])


# Row-wise mask of travel-free segments that filter_eu_names accepts.
eu_row_mask = filtered_travel_words.apply(filter_eu_names, axis=1)
filtered_eu = filtered_travel_words[eu_row_mask]

In [7]:
# Write the EU-matching, travel-free segments to disk.
filtered_eu.to_csv("filtered_eu.csv")

In [8]:
# Dump only the names of the EU-matching segments as a flat JSON list.
eu_segment_names = list(filtered_eu["name"].array)
with open("filtered_eu_segnames.json", "w") as f:
    json.dump(eu_segment_names, f)

In [9]:
# Dump the names of all segments (from the unfiltered table) whose name
# contains the literal text "international_eu".
has_international_eu = df_orig.apply(
    lambda segment: "international_eu" in segment["name"],
    axis=1,
)
international_eu_names = list(df_orig[has_international_eu]["name"].array)
with open("foo.json", "w") as f:
    json.dump(international_eu_names, f)

In [10]:
# Keywords for sensitive categories. Each entry is used twice: as a key of the
# per-keyword result dicts and as one alternative of spicy_word_re, so every
# entry must be its own list element.
#
# Fix: 'islam' was missing its trailing comma, so Python's implicit string
# concatenation merged it with the next entry into the single keyword
# 'islamjew'. Neither 'islam' nor 'jew' was then searched for.
#
# Entries are matched as plain substrings of the segment name, so short stems
# also hit longer words (e.g. 'jew' inside 'jewelry').
# NOTE: regex alternatives are tried left to right, so 'lgbt' always wins over
# 'lgbtq' and the 'lgbtq' bucket stays empty; such segments land under 'lgbt'.
spicy_words = [
    # sexuality
    'sex',
    'promiscu',
    'porn',
    'erotic',
    'lgbt',
    'lgbtq',
    'lesbian',
    'gay',
    'bisexual',
    'transgender',
    'queer',

    # financial
    'poor',
    'poverty',
    'struggling',
    'gambling',
    'betting',
    'credit level',
    'credit score',

    # ethnicity
    'ethnic',
    'multicultural',
    'arab',

    # religion
    'religio',
    'faith',
    'christian',
    'muslim',
    'islam',
    'jew',
    'judais',

    # health
    'health',
    'psycho',
    'psychiat',
    'autis',
    'depress',
    'weight',
    'obese',
    'obesity',
    'mental',

    # labor union
    'unionized',
    'labor union',
    'trade union',

    # political opinions
    'politic',
    'military',
    'police',
    'policy',
    'crime',
    'migrants',
    'criminal',
    'ukraine',
    'russia',
    'covid',
    'corona',
]

# Single alternation pattern over all keywords, in list order.
spicy_word_re = "|".join(spicy_words)


In [11]:
# keyword -> provider -> list of EU-matching segment names containing the keyword
spicy_dict = {word: {} for word in spicy_words}
for _, segment in filtered_eu.iterrows():
    segment_name = segment["name"]
    provider_name = segment["provider_name"]
    # findall can report the same keyword several times for one name; the set
    # ensures each segment is recorded once per keyword.
    for keyword in set(re.findall(spicy_word_re, segment_name)):
        spicy_dict[keyword].setdefault(provider_name, []).append(segment_name)



In [12]:

# Put each provider's segment names in sorted order before writing the result.
for providers in spicy_dict.values():
    for segment_names in providers.values():
        segment_names.sort()

with open("spicy_words_eu.json", "w") as f:
    json.dump(spicy_dict, f, indent=4)

['geographic > ip based > country > europe > northern europe > united kingdom > england > west sussex (bluekai)', 'geographic > ip based > country > europe > northern europe > united kingdom > england > essex (bluekai)', 'geographic > ip based > country > europe > northern europe > united kingdom > england > east sussex (bluekai)']
['zeotap > spain > interest > lgbt']
['international_eu - france credit level - poor (lotame)', 'international_eu - germany credit level - poor (lotame)', 'international_eu - italy credit level - poor (lotame)', 'international_eu - spain credit level - poor (lotame)', 'international_eu - united kingdom credit level - poor (lotame)']
['household income > a: poorer\xa0households - lowest 20% (digiseg is a real-time data provider = no loads displayed but full reach in spain, netherland, sweden, norway, denmark, hungary (es nl se no dk hu) incl. ios safari ctv audio video mobile and display)']
['branded data > nordic data resources > sweden > mosaic > (f) strugg

In [13]:
# Same keyword -> provider -> segment-name grouping as for the EU subset, but
# over every row of the unfiltered table.
spicy_dict_global = dict((word, {}) for word in spicy_words)
for _, segment in df_orig.iterrows():
    segment_name, provider_name = segment["name"], segment["provider_name"]
    # Deduplicate so a keyword that occurs twice in one name adds it only once.
    matched_keywords = set(re.findall(spicy_word_re, segment_name))
    for keyword in matched_keywords:
        by_provider = spicy_dict_global[keyword]
        by_provider.setdefault(provider_name, []).append(segment_name)

# Put each provider's segment names in sorted order before writing the result.
for by_provider in spicy_dict_global.values():
    for segment_names in by_provider.values():
        segment_names.sort()

with open("spicy_words_global.json", "w") as f:
    json.dump(spicy_dict_global, f, indent=4)

In [21]:
# Parallel arrays: position i in each one comes from the same eu_countries row.
countries, codes, ethnicities = (
    eu_countries[column].values for column in ("country", "code", "ethnicity")
)

In [23]:
# country -> names of travel-free segments that mention that country
eu_counter = {country_name: [] for country_name in eu_countries["country"]}

for _, segment in filtered_travel_words.iterrows():
    segment_name = segment["name"]
    segment_tags = segment["name_list"]
    for country, ethnicity, code in zip(countries, ethnicities, codes):
        # Country name and ethnicity are substring matches on the full name;
        # the code must equal a whole name_list entry.
        if country in segment_name or ethnicity in segment_name or code in segment_tags:
            eu_counter[country].append(segment_name)

In [27]:
# Number of matching segments per country, printed from most to fewest.
counts = OrderedDict(
    (country, len(segment_names)) for country, segment_names in eu_counter.items()
)
ranked_counts = sorted(counts.items(), key=lambda item: item[1], reverse=True)
for country, n_segments in ranked_counts:
    print(country, n_segments)


france 9088
spain 8721
germany 6380
portugal 4640
italy 3515
netherlands 3395
europe 2627
sweden 1694
denmark 1126
austria 851
malta 842
finland 556
greece 513
poland 463
croatia 419
belgium 274
ireland 163
romania 152
hungary 146
czechia 82
estonia 81
slovakia 66
bulgaria 53
slovenia 47
lithuania 35
latvia 33
cyprus 19
luxembourg 18
