In [2]:
import csv
import json
import numpy as np
import pandas as pd
import re
from utils import *

In [3]:
# Cap on how many rows of the segment export are loaded for this exploration.
N_ROWS = 50000

# nrows stops parsing after N_ROWS data rows instead of reading the whole file
# and then discarding everything past the first N_ROWS.
df_orig = pd.read_csv("xandr_segments.csv", sep=",", nrows=N_ROWS)

In [4]:
# Copy of the raw frame whose "Segment Name" column is lower-cased.
df_lower = df_orig.copy()
# Vectorized string method instead of a row-wise apply; missing names stay
# missing rather than raising AttributeError on a float NaN.
df_lower["Segment Name"] = df_orig["Segment Name"].str.lower()

In [5]:
# Distinct values of the two provider columns of the raw frame.
provider_ids, provider_names = (
    df_orig[column].unique()
    for column in ("Data Provider ID", "Data Provider Name")
)

provider_names

array(['Nielsen Marketing Cloud', 'Datonics', 'AdAdvisor by Neustar',
       'Lotame', 'Peer39', 'KBM Group', 'Grapeshot', 'DoubleVerify',
       'Cross Pixel Media',
       'Audiences by Oracle (BlueKai, Datalogix, AddThis)'], dtype=object)

In [6]:
# Peek at the first five raw segment names.
df_orig["Segment Name"].head(5)

0    Consumer Targets - Interests - Auto and Other ...
1    Consumer Targets - Interests - Auto and Other ...
2    Consumer Targets - Custom Characteristics - So...
3    Consumer Targets - Custom Characteristics (Exe...
4    Bombora B2B Intent Signals - Human Resources -...
Name: Segment Name, dtype: object

In [7]:

# Candidate separators, tried in this order: the first one that occurs more
# than once in a name is the one used to split it.
separator_hierarchy = [">", "::", "-", "—"]


def itemize_segment_name(name: str):
    """Split a segment name into its lower-cased, whitespace-trimmed parts.

    The name is stripped, one pair of enclosing double quotes is removed, and
    the name is split on the first separator from ``separator_hierarchy`` that
    occurs at least twice.

    Args:
        name: Raw segment name. Non-string values (e.g. the float NaN pandas
            uses for a missing cell) are accepted and treated as unsplittable.

    Returns:
        A list of the parts, or ``None`` when no separator occurs at least
        twice (this includes empty and whitespace-only names).
    """
    # A missing CSV cell arrives as NaN (a float), which has no .strip().
    if not isinstance(name, str):
        return None

    name = name.strip()

    # Sometimes segment names are enquoted. The length check keeps an empty
    # name from raising IndexError on name[0].
    if len(name) >= 2 and name[0] == name[-1] == '"':
        name = name[1:-1]

    # NOTE(review): a separator that occurs only once does not trigger a split,
    # so a name such as 'A > B' is returned as None — confirm this is intended.
    for sep in separator_hierarchy:
        if name.count(sep) > 1:
            return [item.strip().lower() for item in name.split(sep)]

    return None

In [8]:
# Raw cell values of the scraped country-code table.
eu_countries = scrape_table(
    "https://ec.europa.eu/eurostat/statistics-explained/index.php?title=Glossary:Country_codes").values

# The table is read as four side-by-side two-column groups, stacked here into
# one two-column array. The last row of the fourth group is dropped
# (presumably a filler cell — confirm against the page).
eu_countries = np.vstack([
    eu_countries[:, 0:2],
    eu_countries[:, 2:4],
    eu_countries[:, 4:6],
    eu_countries[:-1, 6:8],
])

# Column 0: trimmed, lower-cased text (presumably the country name).
eu_countries[:, 0] = [label.strip().lower() for label in eu_countries[:, 0]]
# Column 1: keep characters 1-2 only (presumably strips the parentheses
# around a two-letter code — confirm).
eu_countries[:, 1] = [code[1:3] for code in eu_countries[:, 1]]

In [10]:
# Only the first 10,000 rows are itemized in this cell.
df_head = df_orig.head(10000)

# Itemize every name once; the result is a list of parts or None.
itemized = df_head["Segment Name"].map(itemize_segment_name)
# Truthiness test: None (and any empty list) counts as "no usable separator".
has_items = itemized.map(bool).astype(bool)

# Build both frames with boolean masks instead of appending row by row inside
# iterrows(); this also keeps the original column dtypes.
df_split = df_head[has_items].copy()
df_split["Segment Name"] = itemized[has_items]
df_split = df_split.reset_index(drop=True)

df_no_sep = df_head[~has_items].reset_index(drop=True)

df_split



Unnamed: 0,Data Provider Name,Data Provider ID,Segment ID,Segment Name
0,Nielsen Marketing Cloud,39,4015114,"[consumer targets, interests, auto and other v..."
1,Nielsen Marketing Cloud,39,4015099,"[consumer targets, interests, auto and other v..."
2,Nielsen Marketing Cloud,39,4015077,"[consumer targets, custom characteristics, sof..."
3,Nielsen Marketing Cloud,39,5174108,"[bombora b2b intent signals, human resources, ..."
4,Nielsen Marketing Cloud,39,1792609,"[b2b targets, seniority, support (exelate)]"
...,...,...,...,...
6119,Lotame,422,8059113,"[mobilewalla, philippines, christmas / boxing ..."
6120,Lotame,422,8059147,"[mobilewalla, philippines, retail, cpg buyers]"
6121,Lotame,422,8058968,"[mobilewalla, philippines, department store va..."
6122,Lotame,422,8059111,"[mobilewalla, philippines, fireworks shoppers]"


In [11]:
# Names for which no separator was found.
df_no_sep.loc[:, "Segment Name"].array

<PandasArray>
[                        'Consumer Targets - Custom Characteristics (Exelate)',
                                           'Test Segment 1 - Client (Exelate)',
                                           'Test Segment 2 - Client (Exelate)',
                                           'Test Segment 3 - Client (Exelate)',
                          'Finance & Money > Credit & Credit Cards (Datonics)',
                                       'Finance & Money > Mortgage (Datonics)',
                                               'Neustar AdAdvisor Element 001',
                                               'Neustar AdAdvisor Element 002',
                                               'Neustar AdAdvisor Element 003',
                                               'Neustar AdAdvisor Element 004',
 ...
                                       'Infogroup - Product Purchases > Tools',
                                      'Infogroup - Product Purchases > Travel',
                     

In [12]:


# Two empty frames with the same columns as df_orig.
# NOTE(review): presumably meant to collect travel-related and country-related
# segments — confirm; nothing in this cell fills them.
df_travelling = pd.DataFrame(columns=df_orig.columns)
df_country = pd.DataFrame(columns=df_orig.columns)


# NOTE(review): unfinished loop — it only reads each row's "Segment Name" into
# `segments` and does nothing with it.
for index, data in df_split.iterrows():
    segments = data["Segment Name"]
    
    

In [22]:

# Keywords that mark a segment as travel-related; matched as plain substrings
# of the lower-cased segment names.
taboo_set = {"travel", "departure", "destination", "tourism", "vacation"}

# One alternation pattern over all keywords; re.escape keeps each keyword a
# literal substring match.
taboo_pattern = "|".join(re.escape(word) for word in taboo_set)
# Vectorized match instead of a row-wise apply; na=False treats a missing name
# as "contains no keyword".
is_travel = df_lower["Segment Name"].str.contains(taboo_pattern, regex=True, na=False)

# Despite its name, this frame holds the rows WITHOUT any travel keyword.
filtered_travel_words = df_lower[~is_travel]

In [18]:
# Names that contain at least one travel keyword. Vectorized substring match
# (re.escape keeps keywords literal), with a single .loc selection instead of
# chained indexing.
travel_pattern = "|".join(re.escape(word) for word in taboo_set)
df_lower.loc[
    df_lower["Segment Name"].str.contains(travel_pattern, regex=True, na=False),
    "Segment Name",
].array

<PandasArray>
[                             'consumer targets - custom characteristics - premium interest - household - travel (exelate)',
                                           'b2b targets - industry - transportation and travel - travel agencies (exelate)',
                                            'b2b targets - industry - transportation and travel - ground transit (exelate)',
                                               'b2b targets - industry - transportation and travel - warehousing (exelate)',
                     'tech targets - from whotoo - industry - transportation and travel - trucking and logistics (exelate)',
                                        'b2b targets - industry - transportation and travel - air transportation (exelate)',
                                     'tech targets - from whotoo - industry - transportation and travel - marine (exelate)',
                             'tech targets - from whotoo - industry - transportation and travel - ground transi

In [20]:
# First column of the country table (the lower-cased names).
eu_names = eu_countries[..., 0]
eu_names

array(['belgium', 'bulgaria', 'czechia', 'denmark', 'germany', 'estonia',
       'ireland', 'greece', 'spain', 'france', 'croatia', 'italy',
       'cyprus', 'latvia', 'lithuania', 'luxembourg', 'hungary', 'malta',
       'netherlands', 'austria', 'poland', 'portugal', 'romania',
       'slovenia', 'slovakia', 'finland', 'sweden'], dtype=object)

In [23]:
# An empty country list would produce an empty pattern, which matches every
# row — fail loudly instead.
assert len(eu_names) > 0, "eu_names is empty; the country table was not loaded"

# Keep the non-travel segments whose name contains an EU country name.
# NOTE(review): this is a plain substring match, so brand names such as
# 'poland spring' or 'new belgium brewery' are kept as well.
eu_pattern = "|".join(re.escape(country) for country in eu_names)
filtered_eu = filtered_travel_words[
    filtered_travel_words["Segment Name"].str.contains(eu_pattern, regex=True, na=False)
]

In [25]:
# Names of the segments that matched an EU country name.
filtered_eu.loc[:, "Segment Name"].array

<PandasArray>
[                                            'grocery > sparkling water > sparkling waters-brand use most-poland spring',
  'neustar adadvisor > alcohol (restricted to best practices) > beer & ale > purchase beers & ales: new belgium brewery',
                                                                                               'tour de france (lotame)',
                                                                         'international_eu - austria all users (lotame)',
                                                                     'international_eu - austria art & culture (lotame)',
                                                                       'international_eu - austria automobiles (lotame)',
                                                                          'international_eu - austria business (lotame)',
                                                          'international_eu - austria computers and technology (lotame)',
          