In [22]:
import csv
import json
import numpy as np
import pandas as pd
import re
import utils
from collections import OrderedDict, Counter
import ast
import swifter

In [23]:
df_itemized = pd.read_json("xandr_segments_itemized.json") # load data

In [24]:
eu_countries = pd.read_csv("eu_countries.csv")


eu_countries["codes"] = eu_countries["codes"].apply(ast.literal_eval)
eu_countries["codes_set"] = eu_countries["codes"].apply(lambda x: set(x))
eu_countries["strings"] = eu_countries["strings"].apply(ast.literal_eval)
eu_countries["identifiers"] = eu_countries["identifiers"].apply(ast.literal_eval)
eu_countries["names_re"] = eu_countries.swifter.apply(lambda row: utils.list_to_regex(np.concatenate(row[["strings", "identifiers"]].values.flatten())), axis=1)

eu_names_re = utils.list_to_regex(np.concatenate(eu_countries[["strings", "identifiers"]].values.flatten()))
eu_codes = set(np.concatenate(eu_countries["codes"].values.flatten()))
eu_countries

Pandas Apply:   0%|          | 0/28 [00:00<?, ?it/s]

Unnamed: 0.1,Unnamed: 0,name,strings,codes,identifiers,codes_set,names_re
0,0,belgium,"[belgium, belgian, belgie]","[be, bel]","[xaxisbe, xaxisbel]","{bel, be}",re.compile('belgium|belgian|belgie|xaxisbe|xax...
1,1,bulgaria,"[bulgaria, bulgarian, bulgarija]","[bg, bgr]",[],"{bg, bgr}",re.compile('bulgaria|bulgarian|bulgarija')
2,2,czechia,"[czechia, czech, cesko, ceska]","[cz, cze]",[xaxiscz],"{cz, cze}",re.compile('czechia|czech|cesko|ceska|xaxiscz')
3,3,denmark,"[denmark, danish, danmark]","[dk, dnk]","[xaxisdk, dk ndr]","{dnk, dk}",re.compile('denmark|danish|danmark|xaxisdk|dk[...
4,4,germany,"[germany, german, deutschland]","[de, deu, ger]","[xaxisde, de experian, de kantar]","{de, ger, deu}",re.compile('germany|german|deutschland|xaxisde...
5,5,estonia,"[estonia, estonian, eesti]","[ee, est]",[],"{ee, est}",re.compile('estonia|estonian|eesti')
6,6,ireland,"[ireland, irish]","[ie, irl]",[],"{irl, ie}",re.compile('ireland|irish')
7,7,greece,"[greece, greek]","[el, grc]",[],"{grc, el}",re.compile('greece|greek')
8,8,spain,"[spain, spanish]","[es, esp]","[xaxises, es experian, experian sp]","{es, esp}",re.compile('spain|spanish|xaxises|es[\\ \\-–_\...
9,9,france,"[france, french, francaise]","[fr, fra]","[xaxisfr, fr experian, experian fr, fr kantar]","{fra, fr}",re.compile('france|french|francaise|xaxisfr|fr...


In [25]:
tag_count = Counter(np.concatenate(df_itemized["name_list"].values))
with open("tags.json", "w") as f:
    json.dump(OrderedDict(tag_count.most_common()), f, indent=4)

In [26]:
travel_word_list = ["travel", "departure", "destination",
                    "tourism", "tourist", "vacation", "holiday", "voyage", "expedia"]

eu_false_positives = ["(furniture|nail) polish",  # ethnicity FPs
                      "irish (whiskey|cream)",
                      "speak(er|ing)",
                      # "language",
                      # "hispanic",
                      "tour de france",
                      "greek joghurt",

                      # Country code FPs
                      "accuen",  # does market research and very little location-specifics. Thus many FPs
                      "xaxisus",  # us source
                      "xaxisca",  # canadian source
                      "xaxisapc",  # whatever the fuck it is, it's not useful
                      "tailtarget",  # mostly latAm focussed -> numerous es/pt FPs
                      ]


false_positive_re = utils.list_to_regex(travel_word_list + eu_false_positives)
filtered_travel_words = df_itemized[df_itemized.swifter.apply(
    lambda x: not re.search(false_positive_re, x["name"]), axis=1)]

Pandas Apply:   0%|          | 0/648930 [00:00<?, ?it/s]

In [28]:
def filter_eu_names(row):
    # does the full country name occur anywhere or does a countrycode match an item exactly?
    return bool(re.search(eu_names_re, row["name"]) or len(eu_codes.intersection(row["name_list"])))


names_re_list = eu_countries["names_re"].values
codes_set_list = eu_countries["codes_set"].values
country_names = eu_countries["name"].values

def identify_eu_names(row_dataset):
    name = row_dataset["name"]
    country_hits = []
    for i, names_re in enumerate(names_re_list):
         
        if re.search(names_re, name) or len(codes_set_list[i].intersection(row_dataset["name_list"])):
            country_hits.append(country_names[i])
    
    row_dataset["countries"] = country_hits
    return row_dataset


filtered_travel_words = filtered_travel_words.swifter.apply(identify_eu_names, axis=1)
filtered_eu = filtered_travel_words[filtered_travel_words.swifter.apply(lambda x: len(x["countries"])>0, axis=1)]

Pandas Apply:   0%|          | 0/613906 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [29]:
filtered_eu = filtered_travel_words[filtered_travel_words.swifter.apply(lambda x: len(x["countries"])>0, axis=1)]

Pandas Apply:   0%|          | 0/613906 [00:00<?, ?it/s]

In [30]:
filtered_eu.to_csv("filtered_eu.csv")

In [44]:


def filter_eu_codes(row):
    return len(eu_codes.intersection(row["name_list"])) > 0

eu_ethnicities_re = utils.list_to_regex([x[1] for x in eu_countries["strings"].values])

def filter_eu_ethnicities(row):
    return bool(re.search(eu_ethnicities_re, row["name"]))


filtered_travel_words[filtered_travel_words.swifter.apply(filter_eu_codes, axis=1)].to_csv("filtered_eu_codes.csv")
filtered_travel_words[filtered_travel_words.swifter.apply(filter_eu_ethnicities, axis=1)].to_csv("filtered_eu_ethnicities.csv")

Pandas Apply:   0%|          | 0/622147 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/622147 [00:00<?, ?it/s]

In [45]:
with open("filtered_eu_segnames.json", "w") as f:
    json.dump(list(filtered_eu["name"].array), f)

In [7]:
countries = eu_countries["name"].values
strings = eu_countries["strings"].values
codes = eu_countries["codes"].values
identifiers = eu_countries["identifiers"].values

In [8]:
eu_counter = {country: [] for country in countries}

country_re = [utils.list_to_regex(strings[i]+identifiers[i]) for i in range(len(strings))]

for index, row_dataset in filtered_travel_words.iterrows():
    for i, country in enumerate(countries):
        segname = row_dataset["name"]
        
        if re.search(country_re[i], segname) or codes[i] in row_dataset["name_list"]:
            eu_counter[country].append(segname)


In [9]:
counts = OrderedDict([(k, len(v)) for k,v in eu_counter.items()])
for a, b in sorted(counts.items(), key= lambda x: x[1], reverse=True):
    print(a, b)


germany 4849
spain 3528
france 3371
italy 2288
denmark 1953
sweden 1398
europe 1037
poland 544
finland 507
austria 477
belgium 455
romania 406
netherlands 374
hungary 268
portugal 211
greece 155
ireland 117
czechia 98
croatia 28
malta 25
bulgaria 18
slovakia 18
estonia 15
cyprus 15
slovenia 14
lithuania 13
latvia 12
luxembourg 11


In [10]:
sum([i[1] for i in counts.items()])

22205