In [29]:
import csv
import json
import numpy as np
import pandas as pd
import re
import utils
from collections import OrderedDict, Counter
import ast
import swifter

In [3]:
# Read the segment table from xandr_segments_itemized.json. Other cells in this
# notebook access its "name", "name_list" and "provider_name" columns.
df_orig = pd.read_json("xandr_segments_itemized.json") # load the raw segment data

In [25]:
# Load the EU country lookup table.
eu_countries = pd.read_csv("eu_countries.csv")

# Placeholder columns, initialised to NaN; this cell does not fill them in.
for placeholder_col in ("names_re", "code_set"):
    eu_countries[placeholder_col] = np.nan

# "codes" and "strings" are stored as text in the CSV; parse each cell back into
# the Python literal it represents (presumably a list of strings).
for list_col in ("codes", "strings"):
    eu_countries[list_col] = eu_countries[list_col].map(ast.literal_eval)

# Flatten the per-country sequences into one regex alternation over all country
# strings and one set containing every country code.
all_country_strings = np.concatenate(eu_countries["strings"].tolist())
all_country_codes = np.concatenate(eu_countries["codes"].tolist())
eu_names_re = "|".join(all_country_strings)
eu_codes = set(all_country_codes)

In [26]:
# Count how often each entry of the "name_list" column occurs over all segments,
# then write the counts to tags.json ordered from most to least frequent.
all_tags = np.concatenate(df_orig["name_list"].values)
tag_count = Counter(all_tags)
with open("tags.json", "w") as f:
    tags_by_frequency = OrderedDict(tag_count.most_common())
    json.dump(tags_by_frequency, f, indent=4)

In [27]:
# Keywords that mark a segment name as travel-related.
travel_word_list = [
    "travel",
    "departure",
    "destination",
    "tourism",
    "tourist",
    "vacation",
    "holiday",
    "voyage",
    "expedia",
]
travel_word_re = "|".join(travel_word_list)

# Despite its name, filtered_travel_words keeps only the rows whose "name"
# contains NONE of the travel keywords (the travel segments are dropped).
has_no_travel_word = df_orig.swifter.apply(
    lambda row: re.search(travel_word_re, row["name"]) is None, axis=1
)
filtered_travel_words = df_orig[has_no_travel_word]

In [11]:
# "usa" must not be embedded in a longer lowercase word: as a bare alternative it
# also matched unrelated names such as "usage", "thousand" or "jerusalem".
us_re = r"united states|unitedstates|(?<![a-z])usa(?![a-z])"


def filter_us_names(row):
    """Return True if a segment row looks US-related.

    A row matches when its "name" contains "united states" or "unitedstates",
    contains "usa" that is not surrounded by other lowercase letters, or when any
    entry of its "name_list" equals "usa", starts with "us " or ends with " us".

    Args:
        row: mapping-like row providing a string under "name" and an iterable of
            strings under "name_list".

    Returns:
        bool: whether the row is treated as a US segment.
    """
    if re.search(us_re, row["name"]):
        return True
    # Generator instead of a list so the scan stops at the first matching entry.
    return any(
        el == "usa" or el.startswith("us ") or el.endswith(" us")
        for el in row["name_list"]
    )

# From the segments left after the travel-keyword filter, keep those flagged by
# filter_us_names and write them to filtered_us.csv.
is_us_segment = filtered_travel_words.swifter.apply(filter_us_names, axis=1)
filtered_us = filtered_travel_words[is_us_segment]
filtered_us.to_csv("filtered_us.csv")

In [30]:
def filter_eu_names(row):
    """Tell whether a segment row refers to an EU country.

    True when the eu_names_re pattern is found anywhere in row["name"], or when at
    least one entry of row["name_list"] is exactly equal to a code in eu_codes.
    """
    if re.search(eu_names_re, row["name"]):
        return True
    shared_codes = eu_codes.intersection(row["name_list"])
    return len(shared_codes) > 0


# Row-wise EU check over the segments left after the travel-keyword filter.
is_eu_segment = filtered_travel_words.swifter.apply(filter_eu_names, axis=1)
filtered_eu = filtered_travel_words[is_eu_segment]

Pandas Apply:   0%|          | 0/622147 [00:00<?, ?it/s]

In [36]:
# Write the filtered_eu frame to filtered_eu.csv.
filtered_eu.to_csv("filtered_eu.csv")

In [8]:
# Export only the "name" values of the EU-related segments as a JSON array.
with open("filtered_eu_segnames.json", "w") as f:
    eu_segment_names = list(filtered_eu["name"].array)
    json.dump(eu_segment_names, f)

In [9]:
# Dump the names of all segments in the unfiltered data whose name contains the
# substring "international_eu".
with open("foo.json", "w") as f:
    mentions_international_eu = df_orig.apply(
        lambda row: "international_eu" in row["name"], axis=1
    )
    international_eu_names = list(df_orig[mentions_international_eu]["name"].array)
    json.dump(international_eu_names, f)

In [35]:
# Keyword fragments (matched as substrings) that flag a segment name as touching
# a sensitive category. Every entry needs its own trailing comma: adjacent string
# literals are silently concatenated, which previously fused 'islam' and 'jew'
# into the single, never-matching keyword 'islamjew'.
#
# NOTE(review): spicy_word_re tries the alternatives in list order, so a keyword
# that starts with an earlier keyword (e.g. 'lgbtq' after 'lgbt') is never
# reported on its own; such names are filed under the shorter keyword.
spicy_words = ['sex',  # sexuality
               'promiscu',
               'porn',
               'erotic',
               'lgbt',
               'lgbtq',
               'lesbian',
               'gay',
               'bisexual',
               'transgender',
               'queer',

               'poor',  # financial
               'poverty',
               'struggling',
               'gambling',
               'betting',
               'credit level',
               'credit score',

               'ethnic',  # ethnicity
               'multicultural',
               'arab',

               'religio',  # religion
               'faith',
               'christian',
               'muslim',
               'islam',
               'jew',
               'judais',

               'health',  # health
               'psycho',
               'psychiat',
               'autis',
               'depress',
               'weight',
               'obese',
               'obesity',
               'mental',

               'unionized',  # labor union
               'labor union',
               'trade union',

               'politic',  # political opinions
               'military',
               'police',
               'policy',
               'crime',
               'migrants',
               'criminal',
               'ukraine',
               'russia',
               'covid',
               'corona',
               ]
# One compiled alternation over all keywords.
spicy_word_re = re.compile("|".join(spicy_words))


In [33]:
# Group the EU-related segment names by matched sensitive keyword and then by
# provider: spicy_dict[keyword][provider] -> list of segment names.
spicy_dict = {word: {} for word in spicy_words}
for segname, provider in zip(filtered_eu["name"], filtered_eu["provider_name"]):
    # The set ensures a keyword is recorded once per segment even if it occurs
    # several times in the name.
    for tag in set(spicy_word_re.findall(segname)):
        spicy_dict[tag].setdefault(provider, []).append(segname)



In [34]:

# Sort every provider's list of segment names before exporting.
for providers in spicy_dict.values():
    for provider in list(providers):
        providers[provider] = sorted(providers[provider])

# Write the keyword -> provider -> names mapping for the EU segments.
with open("spicy_words_eu.json", "w") as f:
    json.dump(spicy_dict, f, indent=4)

In [13]:
# Same grouping as for the EU segments, but over the complete unfiltered data:
# spicy_dict_global[keyword][provider] -> list of segment names.
spicy_dict_global = {word: {} for word in spicy_words}
for segname, provider in zip(df_orig["name"], df_orig["provider_name"]):
    # The set ensures a keyword is recorded once per segment even if it occurs
    # several times in the name.
    for tag in set(spicy_word_re.findall(segname)):
        spicy_dict_global[tag].setdefault(provider, []).append(segname)

# Sort every provider's list of segment names before exporting.
for providers in spicy_dict_global.values():
    for provider in list(providers):
        providers[provider] = sorted(providers[provider])

with open("spicy_words_global.json", "w") as f:
    json.dump(spicy_dict_global, f, indent=4)

In [14]:
# Pull three columns of the country table out as arrays that share row order,
# so they can be walked in parallel.
countries, codes, ethnicities = (
    eu_countries[column].values for column in ("country", "code", "ethnicity")
)

In [18]:
# For every country in the lookup table, collect the names of the non-travel
# segments attributed to it.
eu_counter = {country: [] for country in eu_countries["country"]}

segment_rows = zip(filtered_travel_words["name"], filtered_travel_words["name_list"])
for segname, name_list in segment_rows:
    for country, ethnicity, code in zip(countries, ethnicities, codes):
        # Match on the country string or the ethnicity string occurring in the
        # name, or on the code being an exact entry of the name list.
        if country in segname or ethnicity in segname or code in name_list:
            eu_counter[country].append(segname)
            # A segment is attributed to the first matching country only.
            break

In [19]:
# Number of attributed segments per country, printed from most to fewest.
counts = OrderedDict((country, len(names)) for country, names in eu_counter.items())
for country in sorted(counts, key=counts.get, reverse=True):
    print(country, counts[country])


france 8901
spain 8657
germany 6380
italy 3425
netherlands 3304
sweden 1661
portugal 1423
denmark 1126
europe 988
malta 837
austria 810
finland 553
greece 511
poland 454
croatia 413
belgium 274
ireland 163
romania 147
hungary 122
czechia 82
estonia 81
slovakia 63
bulgaria 53
slovenia 47
lithuania 35
latvia 33
cyprus 19
luxembourg 18


In [20]:
# Total number of segments attributed to any country in the table.
sum(counts.values())

40580