In [11]:
import csv
import json
import numpy as np
import pandas as pd
import re
import utils
from collections import Counter
import swifter

In [18]:
# Load the raw segment export, shorten the column names, and add two placeholder
# columns (initially NaN):
#   name_list   - itemized segment name
#   data_source - data source given in brackets at the end of a segment name
df_orig = (
    pd.read_csv("xandr_segments.csv", sep=",")
    .rename(
        columns={
            "Segment Name": "name",
            "Data Provider Name": "provider_name",
            "Data Provider ID": "provider_id",
            "Segment ID": "id",
        }
    )
    .assign(name_list=np.nan, data_source=np.nan)
)

In [19]:
# Distinct provider ids and provider names that occur in the data.
provider_ids, provider_names = (
    df_orig[column].unique() for column in ("provider_id", "provider_name")
)
# Display the 50 most frequent provider names as (provider_name, row count) pairs.
Counter(df_orig["provider_name"]).most_common(50)

In [21]:
# Clean every segment name with utils.clean_segment_name. The helper only receives the
# "name" value, so it is applied to that column directly; a row-wise apply (axis=1)
# would build a full row object for each record just to read one field.
df_orig["name"] = df_orig["name"].swifter.apply(utils.clean_segment_name)

In [22]:
# Extract the data source from the end of a segment name (written in parentheses if present).
# A data source is 3 to 25 word characters or spaces, enclosed in parentheses, at the end of the name.
data_source_re = re.compile(r"\([\w ]{3,25}\)$")


def extract_data_source(row):
    """Split a trailing "(data source)" off a segment name.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed in by ``apply(..., axis=1)``)
        Must hold a string under "name" and provide ``.copy()``.

    Returns
    -------
    ``row`` itself when the name does not end with a parenthesised data source.
    Otherwise a copy of ``row`` in which "data_source" holds the text between the
    parentheses and "name" is cut off right before the opening parenthesis
    (any whitespace in front of the parenthesis is kept).
    """
    name = row["name"]
    data_source_match = data_source_re.search(name)  # look for a data source at the end of the name
    if data_source_match is None:
        return row

    # Work on a copy: functions handed to DataFrame.apply should not mutate the
    # object they receive.
    updated = row.copy()
    # enter the data source into the corresponding column (without brackets)
    updated["data_source"] = data_source_match.group(0)[1:-1]
    # crop the existing segment name
    updated["name"] = name[: data_source_match.start(0)]
    return updated

# Run extract_data_source on every row (axis=1) and replace df_orig with the result.
df_orig = df_orig.swifter.apply(extract_data_source, axis=1)


# Segments by a few brokers are multilingual, which messes up all kinds of upcoming operations.
# All of them follow the same "(en) ... (es) ... (pt)" layout, so the name is simply cut off
# at the point where that multilingual part starts.
pruning_re = re.compile(r"\(en\).+\(es\).+\(pt\)")


def prune_segname(row):
    """Cut the multilingual part off a segment name.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed in by ``apply(..., axis=1)``)
        Must hold a string under "name" and provide ``.copy()``.

    Returns
    -------
    ``row`` itself when the name contains no "(en) ... (es) ... (pt)" sequence.
    Otherwise a copy of ``row`` whose "name" keeps only the text in front of the
    "(en)" that starts the matched sequence.
    """
    name = row["name"]
    match = pruning_re.search(name)
    if match is None:
        return row

    # Work on a copy: functions handed to DataFrame.apply should not mutate the
    # object they receive.
    pruned = row.copy()
    pruned["name"] = name[: match.start(0)]
    return pruned


# Run prune_segname on every row (axis=1) and replace df_orig with the result.
df_orig = df_orig.swifter.apply(prune_segname, axis=1)


Pandas Apply:   0%|          | 0/651463 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/651463 [00:00<?, ?it/s]

In [23]:
# Itemize all segment names: utils.itemize_segment_name is called with each "name" value
# and its result is stored in "name_list". The helper only receives the name, so it is
# applied to that column directly instead of row by row (axis=1).
df_orig["name_list"] = df_orig["name"].swifter.apply(utils.itemize_segment_name)

Pandas Apply:   0%|          | 0/651463 [00:00<?, ?it/s]

In [24]:
# Pattern used to flag segments as useless; it is stored as an attribute on the imported
# utils module. Matching is case-sensitive (no flags are passed). Alternatives:
#   \[null\]            - the literal text "[null]"
#   \Atest              - the name starts with "test"
#   [^a-z]+test\Z       - the name ends with "test", preceded by at least one character outside a-z
#   " test "            - "test" with a space on either side
#   automation\d{5,10}  - "automation" directly followed by at least five digits
utils.useless_segments_re = re.compile(r"\[null\]|\Atest|[^a-z]+test\Z| test |automation\d{5,10}")


In [25]:
# Boolean mask: True for every segment whose name matches utils.useless_segments_re.
# Only the "name" value is inspected, so the check runs on that column directly
# instead of on whole rows (axis=1).
useless_segment_indices = df_orig["name"].swifter.apply(
    lambda name: bool(utils.useless_segments_re.search(name))
)

# Write the flagged segments to their own file, renumbered from 0.
df_useless_segments = df_orig[useless_segment_indices].reset_index(drop=True)
df_useless_segments.to_csv("useless_segments.csv")

# All remaining segments form the filtered data set (`~` negates the mask); they keep
# their original index and are exported as CSV and JSON.
df_filtered = df_orig[~useless_segment_indices]
df_filtered.to_csv('xandr_segments_itemized.csv')
df_filtered.to_json('xandr_segments_itemized.json')

Pandas Apply:   0%|          | 0/651463 [00:00<?, ?it/s]

In [26]:
# Segments whose "name_list" has fewer than two entries are collected as failed
# itemizations, renumbered from 0, and written to their own file. Only "name_list" is
# inspected, so the length check runs on that column instead of on whole rows (axis=1).
df_failed_segmentation = df_filtered[
    df_filtered["name_list"].swifter.apply(lambda items: len(items) < 2)
].reset_index(drop=True)
df_failed_segmentation.to_csv('xandr_segments_failed_itemization.csv')

Pandas Apply:   0%|          | 0/649110 [00:00<?, ?it/s]