In [1]:
import csv
import json
import numpy as np
import pandas as pd
import re
import utils
from collections import OrderedDict, Counter
import ast
import swifter

In [2]:
df_itemized = pd.read_json("xandr_segments_itemized.json") # load data

In [3]:
eu_countries = pd.read_csv("eu_countries.csv")

eu_countries["names_re"] = np.nan
eu_countries["code_set"] = np.nan

eu_countries["codes"] = eu_countries["codes"].apply(ast.literal_eval)
eu_countries["strings"] = eu_countries["strings"].apply(ast.literal_eval)
eu_countries["identifiers"] = eu_countries["identifiers"].apply(ast.literal_eval)
eu_names_re = "|".join(np.concatenate(eu_countries[["strings", "identifiers"]].values.flatten()))
eu_codes = set(np.concatenate(eu_countries["codes"].values.flatten()))

type: unterminated string literal (detected at line 1) (<unknown>, line 1)

In [None]:
tag_count = Counter(np.concatenate(df_itemized["name_list"].values))
with open("tags.json", "w") as f:
    json.dump(OrderedDict(tag_count.most_common()), f, indent=4)

In [4]:
travel_word_list = ["travel", "departure", "destination",
                    "tourism", "tourist", "vacation", "holiday", "voyage", "expedia"]

eu_false_positives = ["(furniture|nail) polish",  # ethnicity FPs
                      "irish (whiskey|cream)",
                      "speak(er|ing)",
                      # "language",
                      # "hispanic",
                      "tour de france",
                      "greek joghurt",

                      # Country code FPs
                      "accuen",  # does market research and very little location-specifics. Thus many FPs
                      "xaxisus",  # us source
                      "xaxisca",  # canadian source
                      "xaxisapc",  # whatever the fuck it is, it's not useful
                      "tailtarget",  # mostly latAm focussed -> numerous es/pt FPs
                      "\A\d{20}",
                      ]


false_positive_re = "|".join(travel_word_list + eu_false_positives)
filtered_travel_words = df_itemized[df_itemized.swifter.apply(
    lambda x: not re.search(false_positive_re, x["name"]), axis=1)]

Pandas Apply:   0%|          | 0/648930 [00:00<?, ?it/s]

In [6]:
us_re = r"united[ -–_]?states|usa"

def filter_us_names(row):
    # does the full country name occur anywhere or does a countrycode match an item exactly?
    return bool(re.search(us_re, row["name"]) or any([el == "usa" or el.startswith("us ") or el.endswith(" us") for el in row["name_list"]]))

filtered_us = filtered_travel_words[filtered_travel_words.swifter.apply(filter_us_names, axis=1)]
filtered_us.to_csv("filtered_us.csv")

Pandas Apply:   0%|          | 0/622147 [00:00<?, ?it/s]

In [5]:
def filter_eu_names(row):
    # does the full country name occur anywhere or does a countrycode match an item exactly?
    return bool(re.search(eu_names_re, row["name"]) or len(eu_codes.intersection(row["name_list"])))

def filter_eu_codes(row):
    # does the full country name occur anywhere or does a countrycode match an item exactly?
    return len(eu_codes.intersection(row["name_list"])) > 0

eu_ethnicities_re = "|".join([x[1] for x in eu_countries["strings"].values])


def filter_eu_ethnicities(row):
    # does the full country name occur anywhere or does a countrycode match an item exactly?
    return bool(re.search(eu_ethnicities_re, row["name"]))


filtered_eu = filtered_travel_words[filtered_travel_words.swifter.apply(filter_eu_names, axis=1)]

Pandas Apply:   0%|          | 0/609261 [00:00<?, ?it/s]

In [6]:
filtered_eu.to_csv("filtered_eu.csv")

In [44]:
filtered_travel_words[filtered_travel_words.swifter.apply(filter_eu_codes, axis=1)].to_csv("filtered_eu_codes.csv")
filtered_travel_words[filtered_travel_words.swifter.apply(filter_eu_ethnicities, axis=1)].to_csv("filtered_eu_ethnicities.csv")

Pandas Apply:   0%|          | 0/622147 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/622147 [00:00<?, ?it/s]

In [45]:
with open("filtered_eu_segnames.json", "w") as f:
    json.dump(list(filtered_eu["name"].array), f)

In [33]:
spicy_dict = {word:{} for word in spicy_words}
for index, row in filtered_eu.iterrows():
    segname = row["name"]
    provider = row["provider_name"]
    res = re.findall(spicy_word_re, segname)
    if res:
        res = set(res)
        for tag in res:
            if provider in spicy_dict[tag]:
                spicy_dict[tag][provider].append(segname)
            else:
                spicy_dict[tag][provider] = [segname]



In [34]:

for k,v in spicy_dict.items():
    for k2, v2 in v.items():
        v[k2] = sorted(v2)


with open("spicy_words_eu.json", "w") as f:
    json.dump(spicy_dict, f, indent=4)

In [13]:
spicy_dict_global = {word:{} for word in spicy_words}
for index, row in df_itemized.iterrows():
    segname = row["name"]
    provider = row["provider_name"]
    res = re.findall(spicy_word_re, segname)
    if res:
        res = set(res)
        for tag in res:
            if provider in spicy_dict_global[tag]:
                spicy_dict_global[tag][provider].append(segname)
            else:
                spicy_dict_global[tag][provider] = [segname]


for k,v in spicy_dict_global.items():
    for k2, v2 in v.items():
        v[k2] = sorted(v2)
    # spicy_dict[k] = sorted(v)


with open("spicy_words_global.json", "w") as f:
    json.dump(spicy_dict_global, f, indent=4)

In [7]:
countries = eu_countries["name"].values
strings = eu_countries["strings"].values
codes = eu_countries["codes"].values
identifiers = eu_countries["identifiers"].values

In [8]:
eu_counter = {country: [] for country in countries}

country_re = ["|".join(strings[i]+identifiers[i]) for i in range(len(strings))]

for index, row in filtered_travel_words.iterrows():
    for i, country in enumerate(countries):
        segname = row["name"]
        
        if re.search(country_re[i], segname) or codes[i] in row["name_list"]:
            eu_counter[country].append(segname)


In [9]:
counts = OrderedDict([(k, len(v)) for k,v in eu_counter.items()])
for a, b in sorted(counts.items(), key= lambda x: x[1], reverse=True):
    print(a, b)


germany 4849
spain 3528
france 3371
italy 2288
denmark 1953
sweden 1398
europe 1037
poland 544
finland 507
austria 477
belgium 455
romania 406
netherlands 374
hungary 268
portugal 211
greece 155
ireland 117
czechia 98
croatia 28
malta 25
bulgaria 18
slovakia 18
estonia 15
cyprus 15
slovenia 14
lithuania 13
latvia 12
luxembourg 11


In [10]:
sum([i[1] for i in counts.items()])

22205