In [1]:
import csv
import json
import numpy as np
import pandas as pd
import re
import utils
from collections import OrderedDict, Counter


In [3]:
# Load the itemized segment export into a DataFrame; later cells read its
# "name" and "name_list" columns.
df_orig = pd.read_json(path_or_buf="xandr_segments_itemized.json")

In [4]:
# Scrape the Eurostat country-code glossary table as a raw 2-D array.
country_table = utils.scrape_table(
    "https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Country_codes"
).values

# Columns are consumed two at a time (0-1, 2-3, 4-5, 6-7) and stacked into a
# single two-column array; column 0 is then treated as the country name and
# column 1 as its code.
# NOTE(review): the last row of the final column pair is dropped — presumably
# an empty padding cell in the scraped table; confirm against the live page.
column_pairs = [country_table[:, start:start + 2] for start in (0, 2, 4)]
column_pairs.append(country_table[:-1, 6:8])
eu_countries = np.vstack(column_pairs)

# Names: trim whitespace and lowercase.
eu_countries[:, 0] = [name.strip().lower() for name in eu_countries[:, 0]]
# Codes: keep characters 1-2, i.e. strip the surrounding parentheses.
# NOTE(review): codes keep whatever case the table uses, while the "eu" entry
# appended below is lowercase — confirm this matches the case of the tags.
eu_countries[:, 1] = [code[1:3] for code in eu_countries[:, 1]]
# Add a catch-all entry for Europe as a whole.
eu_countries = np.vstack([eu_countries, ["europe", "eu"]])

eu_countries_set = {name for name in eu_countries[:, 0]}

In [6]:
# Count how often each tag occurs across every segment's "name_list" and
# write the counts to tags.json, most frequent first.
all_tags = np.concatenate(df_orig["name_list"].values)
tag_count = Counter(all_tags)
tags_by_frequency = OrderedDict(tag_count.most_common())
with open("tags.json", "w") as tags_file:
    json.dump(tags_by_frequency, tags_file, indent=4)

In [7]:
# Words that mark a segment as travel-related; segments containing any of
# them are dropped below.
travel_word_list = [
    "travel",
    "departure",
    "destination",
    "tourism",
    "tourist",
    "vacation",
    "holiday",
    "voyage",
    "expedia",
    "visit",
]
travel_word_re = "|".join(travel_word_list)


def _lacks_travel_word(row):
    """Return True when none of the travel words occurs in the row's "name"."""
    return re.search(travel_word_re, row["name"]) is None


# Despite the variable name, this frame holds the rows WITHOUT travel words.
filtered_travel_words = df_orig[df_orig.apply(_lacks_travel_word, axis=1)]

In [8]:
# Alternation pattern over every country name (column 0 of eu_countries).
# NOTE(review): names are joined as raw regex fragments (no re.escape), so any
# regex metacharacter in a scraped name is interpreted as pattern syntax —
# confirm the scraped names are plain text.
eu_names_re = "|".join(name for name in eu_countries[:, 0])
# Unique country codes (column 1 of eu_countries).
eu_codes = {code for code in eu_countries[:, 1]}

In [9]:
def filter_eu_names(row):
    """Tell whether a segment row refers to one of the listed countries.

    A row qualifies when a country name from ``eu_names_re`` occurs anywhere
    in ``row["name"]``, or when at least one item of ``row["name_list"]`` is
    exactly equal to a code in ``eu_codes``.

    Args:
        row: mapping-like object exposing ``"name"`` (a string) and
            ``"name_list"`` (an iterable of hashable items).

    Returns:
        bool: True if either check succeeds, otherwise False.
    """
    # Substring / regex hit on the full country name.
    if re.search(eu_names_re, row["name"]):
        return True
    # Otherwise require an exact item match against a country code.
    return not eu_codes.isdisjoint(row["name_list"])


# Keep only the rows of filtered_travel_words for which filter_eu_names
# returns True.
eu_row_mask = filtered_travel_words.apply(filter_eu_names, axis=1)
filtered_eu = filtered_travel_words[eu_row_mask]

In [62]:
# Persist the EU-filtered segments; the DataFrame index is written as the
# first column (pandas default, spelled out here).
filtered_eu.to_csv("filtered_eu.csv", index=True)

In [13]:
# Write just the segment names of the EU-filtered rows as a JSON array.
eu_segment_names = filtered_eu["name"].tolist()
with open("filtered_eu_segnames.json", "w") as names_file:
    json.dump(eu_segment_names, names_file)

In [55]:
# Dump the names of all original segments containing "international_eu".
# NOTE(review): "foo.json" looks like a scratch/debug output — confirm it is
# still needed.
international_eu_names = [
    name for name in df_orig["name"].array if "international_eu" in name
]
with open("foo.json", "w") as scratch_file:
    json.dump(international_eu_names, scratch_file)

In [10]:
# Sensitive-category keywords (mostly word stems) searched for in segment
# names. The list order is preserved because it determines the key order of
# any dict built from it.
spicy_words = ['sex',  # sexuality
               'promiscu',
               'erotic',
               'lgbt',
               'lgbtq',
               'lesbian',
               'gay',
               'bisexual',
               'transgender',
               'queer',

               'poor',  # financial
               'poverty',
               'struggling',
               'gambling',
               'betting',
               'credit level',
               'credit score',

               'ethnic',  # ethnicity
               'multicultural',

               'religio',  # religion
               'faith',
               'christian',
               'muslim',
               'jew',
               'judais',

               'health',  # health
               'psycho',
               'psychiat',
               'autis',
               'depress',
               'weight',
               'obese',
               'obesity',
               'mental',

               'unionized',  # labor union
               'labor union',
               'trade union',

               'politic',  # political opinions
               'military',
               'police',
               'policy',
               'crime',
               'migrants',
               'criminal',
               ]
# Regex alternation takes the FIRST alternative that matches at a position,
# not the longest one. Joining the words in list order therefore let 'lgbt'
# shadow 'lgbtq', so 'lgbtq' could never be reported by re.findall. Building
# the pattern longest-first lets the more specific word win. Each word is
# escaped so every match is exactly one of the entries in spicy_words.
spicy_word_re = "|".join(
    re.escape(word) for word in sorted(spicy_words, key=len, reverse=True)
)


In [27]:
# Group the EU segment names by the sensitive word(s) found in them.
# Every word starts with an empty bucket, so words without a hit still appear
# in the result.
spicy_dict = {word: list() for word in spicy_words}

for segname in filtered_eu["name"]:
    # re.findall yields non-overlapping matches, so each stretch of text is
    # attributed to a single word; set() collapses repeated hits so a segment
    # is listed at most once per word.
    for tag in set(re.findall(spicy_word_re, segname)):
        spicy_dict[tag].append(segname)

In [28]:
# Save the word -> matching segment names mapping as JSON.
with open("spicy_words_eu.json", "w") as spicy_file:
    json.dump(spicy_dict, spicy_file)