In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
InteractiveShell.ast_node_interactivity = "all"

# 🧹Cleaning cleaning functions

The company name cleaning function I've been working with explodes to 30GB in memory. It seriously shouldn't. Worth a refactor.

In [4]:
from src import locations as loc
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.link.splink_linker import SplinkLinker
from src.config import link_pipeline, stopwords
from src.features.clean_complex import *
from src.features.clean_basic import *

import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

from dotenv import load_dotenv, find_dotenv
from pathlib import Path
import os
import duckdb
import pandas as pd

dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

## Setup

Grab some data.

In [3]:
star = Star(
    schema = os.getenv("SCHEMA"),
    table = os.getenv("STAR_TABLE")
)
probabilities = Probabilities(
    schema = os.getenv("SCHEMA"),
    table = os.getenv("PROBABILITIES_TABLE"),
    star = star
)
clusters = Clusters(
    schema = os.getenv("SCHEMA"),
    table = os.getenv("CLUSTERS_TABLE"),
    star = star
)
# cl_x_exp=SplinkLinker.load(
#     path=Path(loc.DATA_SUBDIR['raw'], 'ch_x_exp.pickle')
# )

In [4]:
# cl_x_exp = SplinkLinker(
#     dataset = Dataset(
#         star_id=54717,
#         star=star
#     ), 
#     probabilities=probabilities, 
#     clusters=clusters, 
#     n=2
# )
# cl_x_exp.get_data(
#     cluster_select={
#         '"companieshouse"."companies"': [
#             "company_name as company_name",
#             "postcode as postcode"
#         ]
#     },
#     dim_select=[
#         "id",
#         "company_name",
#         "postcode"
#     ]
# )
# cl_x_exp.save(path=Path(loc.DATA_SUBDIR['raw'], 'ch_x_exp.pickle'))

In [5]:
# df = cl_x_exp.dim_raw.sample(int(1e4))
df = Dataset(
    selector=1970,
    star=star
).read_dim(sample=0.05)

  meta = MetaData(self.connectable, schema=schema)


In [4]:
# df = cl_x_exp.dim_raw.sample(int(1e4))
df_lrg = Dataset(
    selector=1970,
    star=star
).read_dim()

  meta = MetaData(self.connectable, schema=schema)


In [6]:
df.sample(3)

Unnamed: 0,id,company_name,company_number,care_of,po_box,address_line_1,address_line_2,post_town,county,country,...,previous_name_7,previous_name_8_change_date,previous_name_8,previous_name_9_change_date,previous_name_9,previous_name_10_change_date,previous_name_10,conf_statement_next_due_date,conf_statement_last_made_up_date,publish_date
565,14509438,BROMPTON LODGE CARE LTD,14509438,,,132 BROMPTON LANE,,ROCHESTER,,ENGLAND,...,,,,,,,,11/12/2023,,2023-09-01
1896,08643687,LONGFORTH FARM MANAGEMENT COMPANY LIMITED,08643687,,,QUEENSWAY HOUSE,11 QUEENSWAY,NEW MILTON,HAMPSHIRE,ENGLAND,...,,,,,,,,05/08/2024,22/07/2023,2023-09-01
1061,NI691803,FAIRBURN FITNESS LTD,NI691803,,,"26 LINENHALL STREET, 1ST FLOOR",LINENHALL EXCHANGE,BELFAST,,NORTHERN IRELAND,...,,,,,,,,25/10/2023,,2023-09-01


## Unit test

Scratch for making one.

In [54]:
import ast

def load_test_data(path):
    dirty = pd.read_csv(
        Path(path, "dirty.csv"), 
        converters={"list": ast.literal_eval}
    )
    clean = pd.read_csv(
        Path(path, "clean.csv"), 
        converters={"list": ast.literal_eval}
    )
    dirty.columns = ["col"]
    clean.columns = ["col"]

    return dirty, clean

array_except_partial = partial(array_except, terms_to_remove=["ltd", "plc"])

dirty, clean = load_test_data(
    Path(loc.PROJECT_DIR, "test", "features", "expand_abbreviations")
)

In [66]:
def expand_abbreviations(input_column, replacements):
    """
    Expand abbreviations passed as a dictionary where the keys are matches
    and the values are what to replace them with.

    Matches only when term is surrounded by regex word boundaries.
    
    Arguments: 
        input_column: the name of the column to clean
        replacements: a dictionary where keys are matches and values are
        what the replace them with
        
    Returns: string to insert into SQL query
    """
    replace_stack = ""
    for i, (match, replacement) in enumerate(replacements.items()):
        if i == 0:        
            replace_stack = rf"""
                regexp_replace(
                    lower({input_column}),
                    '\b({match})\b',
                    '{replacement}',
                    'g'
                )
            """
        else:
            replace_stack = rf"""
                regexp_replace(
                    {replace_stack},
                    '\b({match})\b',
                    '{replacement}',
                    'g'
                )
            """
    
    return replace_stack

In [69]:
expand_abbreviations_partial = partial(
    expand_abbreviations,
    replacements = {
        "co": "company",
        "ltd": "limited",
        "baz": "bazinga"
    }
)

In [70]:
duckdb.sql(rf"""
    select
         {expand_abbreviations_partial("col")} as col
    from
        dirty
""")

┌─────────────────────────┐
│           col           │
│         varchar         │
├─────────────────────────┤
│ foo company             │
│ bar company inc         │
│ bazinga company company │
│ quxco                   │
│ quux limited company    │
│ ltdcorge                │
└─────────────────────────┘

In [60]:
duckdb.sql(rf"""
    select
        regexp_replace(
            lower("col"),
            '(co\s|co$)',
            'company ',
            'g'
        ) as col
    from
        dirty
""")

┌──────────────────────┐
│         col          │
│       varchar        │
├──────────────────────┤
│ foo company          │
│ bar company inc      │
│ baz company company  │
│ quxcompany           │
│ quux ltd company     │
│ ltdcorge             │
└──────────────────────┘

## Pipeline

Testing how we can make stuff using the duckdb factory, and therefore unit testing only the basic versions of functions.

In [5]:
df_lrg = duckdb.sql("""
    select
        company_name, 
        [company_name[:10], company_name[10:]] as secondary_names,
        company_number
    from
        df_lrg
""").df()

In [6]:
df_lrg.sample(5)

Unnamed: 0,company_name,secondary_names,company_number
3597888,P AND A HODGES LIMITED,"[P AND A HO, ODGES LIMITED]",7133996
1448594,DRIP N DRY LTD,"[DRIP N DRY, Y LTD]",13345001
750611,BRAMWELL BROWN LIMITED,"[BRAMWELL B, BROWN LIMITED]",8504514
4780078,THE GREEN ROOM BOUTIQUE LIMITED,"[THE GREEN , ROOM BOUTIQUE LIMITED]",13658823
1988230,GMTK MANAGEMENT LTD,"[GMTK MANAG, GEMENT LTD]",9662611


In [7]:
%%time
clean_comp_names(df_lrg, "company_name", "secondary_names")

CPU times: user 13min 43s, sys: 8.26 s, total: 13min 51s
Wall time: 6min 24s


Unnamed: 0,company_name,company_number,secondary_names
0,goberub,13404790,"[goberub l, None]"
1,nspired investments,SC606050,"[nspired i, investments]"
2,nvertd designs,09152972,"[nvertd de, esigns]"
3,yozo fass,02714021,"[yozo fass, s]"
4,bora 2,13220580,"[bora 2, 2]"
...,...,...,...
5393601,zeenu,14458541,"[zeenu limi, ited]"
5393602,zeeshan shafqat 799,14816987,"[zeeshan sh, hafqat 799]"
5393603,zeestar,12600587,"[zeestar li, imited]"
5393604,zeezo,14364849,"[zeezo limi, ited]"


In [32]:
unnest = duckdb.sql(f"""
select
    row_number() over () as nest_id,
    *
    replace (unnest(secondary_names) as secondary_names)
from
    df2;
""").df()
unnest.head(5)

Unnamed: 0,nest_id,company_name,secondary_names,company_number
0,1,5 DAY BLINDS LIMITED,5 DAY BLIN,8294716
1,1,5 DAY BLINDS LIMITED,NDS LIMITED,8294716
2,2,5 DE PARYS LTD,5 DE PARYS,8046339
3,2,5 DE PARYS LTD,S LTD,8046339
4,3,5 DE VERE GARDENS LTD,5 DE VERE,13930524


In [33]:
processed = clean_primary(unnest, "company_name")
processed.head(5)

Unnamed: 0,nest_id,company_name,secondary_names,company_number
0,1,5 day blinds,5 DAY BLIN,8294716
1,1,5 day blinds,NDS LIMITED,8294716
2,2,5 de parys,5 DE PARYS,8046339
3,2,5 de parys,S LTD,8046339
4,3,5 de vere gardens,5 DE VERE,13930524


In [60]:
", ".join([f"any_value({col})" for col in processed.columns if col != 'nest_id'])

'any_value(company_name), any_value(secondary_names), any_value(company_number)'

In [61]:
duckdb.sql(f"""
select
    any_value(company_name), 
    any_value(company_number),
    list(secondary_names) as secondary_names
from
    unnest
group by nest_id;
""")

┌──────────────────────────────────────┬───────────────────────────┬───────────────────────────────────────────────────┐
│       any_value(company_name)        │ any_value(company_number) │                  secondary_names                  │
│               varchar                │          varchar          │                     varchar[]                     │
├──────────────────────────────────────┼───────────────────────────┼───────────────────────────────────────────────────┤
│ 5 DAY BLINDS LIMITED                 │ 08294716                  │ [5 DAY BLIN, NDS LIMITED]                         │
│ 5 DE PARYS LTD                       │ 08046339                  │ [5 DE PARYS, S LTD]                               │
│ 5 DE VERE GARDENS LTD                │ 13930524                  │ [5 DE VERE ,  GARDENS LTD]                        │
│ 5 DE VERE GARDENS MANAGEMENT COMPA…  │ 02490721                  │ [5 DE VERE ,  GARDENS MANAGEMENT COMPANY LIMITED] │
│ 5 DEEP LIMITED                

In [37]:
renest = duckdb.sql(f"""
select
    *
    replace (list(secondary_names) as secondary_names)
from
    unnest
group by nest_id;
""").df()
renest.head(5)

ParserException: Parser Error: syntax error at or near "replace"
LINE 4:     replace (list(...
            ^

In [18]:
from functools import partial

remove_stopwords = partial(array_except, terms_to_remove=stopwords)

clean_primary = duckdb_cleaning_factory(
    [
        clean_company_name,
        remove_stopwords,
        list_join_to_string,
    ]
)
clean_secondary = unnest_renest(clean_primary)

In [19]:
clean_primary(df, "company_name").sample(3)

Unnamed: 0,id,company_name,company_number,care_of,po_box,address_line_1,address_line_2,post_town,county,country,...,previous_name_7,previous_name_8_change_date,previous_name_8,previous_name_9_change_date,previous_name_9,previous_name_10_change_date,previous_name_10,conf_statement_next_due_date,conf_statement_last_made_up_date,publish_date
1132,7661954,freddy foxtrots vintage emporium,7661954,,,18 TOP END,RENHOLD,BEDFORD,,ENGLAND,...,,,,,,,,22/06/2024,08/06/2023,2023-09-01
23,15023369,aaa gas engineers extensions,15023369,,,20 WENLOCK ROAD,,LONDON,,ENGLAND,...,,,,,,,,06/08/2024,,2023-09-01
1496,14512479,i security services i sec,14512479,,,46 HOUGHTON PLACE,,BRADFORD,WEST YORKSHIRE,UNITED KINGDOM,...,,,,,,,,12/12/2023,,2023-09-01


In [23]:
clean_secondary(df2, "secondary_names").sample(3)

Unnamed: 0,company_name,secondary_names,company_number
2783,SUNCH CONSULTING LTD,"[sunch cons, sulting]",14731169
1648,CUSTOM HOUSE FLATS MANAGEMENT COMPANY (ST IVES...,"[custom hou, use flats management st ives]",2547194
647,CORNELIUS CAPITAL LIMITED,"[cornelius, capital]",14645653


## Experiments

What does this function actually do?

* Standard clean of company name, returns tokens in an array
* Standard clean of an array of company's second names -- this as array of arrays, presumably
* Removes stopwords from the cleaned names
    * By joining in the stopwords to EVERY ROW
* Adds lists of terms removed etc (with pandas functions)

I think we can make it way more efficient by overwriting columns, keeping it in duckdb, and ditching columns that aren't needed in prod.

In [6]:
sec_df = duckdb.sql("""
    select
        *,
        [company_name, company_name] as secondary_names
    from
        df;
""").df()

In [10]:
def array_except(input_col_name, terms_to_remove):
    return rf"""
    array_filter(
        {input_col_name},
        x -> not array_contains({terms_to_remove}, x)
    )
    """

In [13]:
def array_except(input_col_name, terms_to_remove):
    return rf"""
    array_filter(
        {input_col_name},
        x -> not array_contains({terms_to_remove}, x)
    )
    """

def clean_comp_names(
    df, primary_col: str, secondary_col: str = None, stopwords: str = stopwords
):

    clean_and_stopwords_primary_sql = f"""
        select
            *
            replace (
                {list_join_to_string(
                    array_except(
                        clean_company_name(primary_col), 
                        stopwords
                    )
                )}
                as {primary_col}
            )
        from
            df;
    """
    
    if secondary_col is not None:
        unnest_sql = f"""
            select
                *
                replace (unnest({secondary_col}) as {secondary_col})
            from
                df;
        """
        clean_and_stopwords_secondary_sql = f"""
            select
                *
                replace (
                    {list_join_to_string(
                        array_except(
                            clean_company_name(secondary_col), 
                            stopwords
                        )
                    )}
                    as {secondary_col}
                )
            from
                df;
        """
        renest_sql = f"""
            select
                *
                replace (list({secondary_col}) as {secondary_col})
            from
                df
            group by all;
        """
        to_run = [
            unnest_sql, 
            clean_and_stopwords_secondary_sql,
            renest_sql,
            clean_and_stopwords_primary_sql
        ]
    else:
        to_run = [
            clean_and_stopwords_primary_sql
        ]

    for sql in to_run:
        df = duckdb.sql(sql).df()

    return df

In [15]:
clean_comp_names(
    df,
    primary_col="company_name",
    secondary_col=None,
    stopwords=stopwords
)

Unnamed: 0,id,company_name,postcode
0,1523028,gemini trading nottm,NG16 3SU
1,1029738,exel technology,NG18 5FU
2,898745,dominic schuster,GU21 2LX
3,2656450,ocompany tools,L33 7TW
4,3274294,montagne jeunesse,SA12 7AX
...,...,...,...
9995,1509290,radha supplies,SN4 0AW
9996,2423214,relay floor systems,WS13 6PY
9997,2011906,echo brand communications,BH21 7UH
9998,2857066,poclain hydraulics,PE8 4HN


In [16]:
clean_comp_names(
    sec_df,
    primary_col="company_name",
    secondary_col="secondary_names",
    stopwords=stopwords
)

Unnamed: 0,id,company_name,postcode,secondary_names
0,898745,dominic schuster,GU21 2LX,"[dominic schuster, dominic schuster]"
1,135087,partners design consultants,EC1M 6BM,"[partners design consultants, partners design ..."
2,133562,alliance wine,KA15 1LN,"[alliance wine, alliance wine]"
3,1770810,51parcel,E3 3QR,"[51parcel, 51parcel]"
4,2142513,babble cloud,EC3A 5AR,"[babble cloud, babble cloud]"
...,...,...,...,...
9995,2773496,am digital,WN6 9RD,"[am digital, am digital]"
9996,1970987,lewis antony richardcharles,NG18 4TW,"[lewis antony richardcharles, lewis antony ric..."
9997,983787,metocean telematics,PO15 7AB,"[metocean telematics, metocean telematics]"
9998,2689722,zakas dimitrios,AB10 1ZP,"[zakas dimitrios, zakas dimitrios]"


In [18]:
cl_df = clean_comp_names(
    cl_x_exp.cluster_raw,
    primary_col="company_name",
    secondary_col=None,
    stopwords=stopwords
)

In [8]:
unnest = duckdb.sql(f"""
    select
        *
        replace (unnest(secondary_names) as secondary_names)
    from
        sec_df;
""").df()
unnest.head(5)

Unnamed: 0,id,company_name,postcode,secondary_names
0,1523028,GEMINI TRADING (NOTTM) LIMITED,NG16 3SU,GEMINI TRADING (NOTTM) LIMITED
1,1523028,GEMINI TRADING (NOTTM) LIMITED,NG16 3SU,GEMINI TRADING (NOTTM) LIMITED
2,1029738,EXEL TECHNOLOGY GROUP LTD,NG18 5FU,EXEL TECHNOLOGY GROUP LTD
3,1029738,EXEL TECHNOLOGY GROUP LTD,NG18 5FU,EXEL TECHNOLOGY GROUP LTD
4,898745,DOMINIC SCHUSTER LIMITED,GU21 2LX,DOMINIC SCHUSTER LIMITED


In [11]:
clean_and_stopwords_secondary = duckdb.sql(f"""
    select
        *
        replace (
            {list_join_to_string(
                array_except(
                    clean_company_name('secondary_names'), 
                    stopwords
                )
            )}
            as secondary_names
        )
    from
        unnest;
""").df()
clean_and_stopwords_secondary.head(5)

Unnamed: 0,id,company_name,postcode,secondary_names
0,1523028,GEMINI TRADING (NOTTM) LIMITED,NG16 3SU,gemini trading nottm
1,1523028,GEMINI TRADING (NOTTM) LIMITED,NG16 3SU,gemini trading nottm
2,1029738,EXEL TECHNOLOGY GROUP LTD,NG18 5FU,exel technology
3,1029738,EXEL TECHNOLOGY GROUP LTD,NG18 5FU,exel technology
4,898745,DOMINIC SCHUSTER LIMITED,GU21 2LX,dominic schuster


In [12]:
renest = duckdb.sql(f"""
    select
        *
        replace (list(secondary_names) as secondary_names)
    from
        clean_and_stopwords_secondary
    group by all;
""").df()
renest.head(5)

Unnamed: 0,id,company_name,postcode,secondary_names
0,3274294,MONTAGNE JEUNESSE INTERNATIONAL LIMITED,SA12 7AX,"[montagne jeunesse, montagne jeunesse]"
1,2405408,ARMASHIELD LIMITED,PO7 7XJ,"[armashield, armashield]"
2,83891,MARINE AND CHARTER SOLUTIONS LLP,LL53 7AH,"[marine charter solutions, marine charter solu..."
3,1981031,TROUBADOR PUBLISHING LTD,LE8 0RX,"[troubador publishing, troubador publishing]"
4,2477306,J HEEBINK (MANCHESTER) LIMITED,M16 0RJ,"[j heebink manchester, j heebink manchester]"


In [9]:
primary_col = "company_name"
clean_primary_sql = f"""
    select
        *
        replace ({clean_company_name(primary_col)} as {primary_col})
    from
        to_process;
"""
stopwords_primary_sql = f"""
    select
        *
        replace (
            {list_join_to_string(
                array_except(
                    primary_col, 
                    stopwords
                )
            )}
            as {primary_col}
        )
    from
        to_process;
"""
to_do = [clean_primary_sql, stopwords_primary_sql]

In [None]:
to_process = df
for i in to_do:
    to_process = duckdb.sql(i)

In [17]:
x1 = duckdb.sql(f"""
    select
        *,
        {clean_company_name("company_name")} as name_clean,
        {array_except("name_clean", stopwords)} as name_without_stopwords,
        {list_join_to_string("name_without_stopwords")} as name_out
    from
        df;
""")

In [20]:
duckdb.sql(f"""
    select
        * 
        exclude(name_clean, name_without_stopwords, name_out)
        replace(name_out as company_name)
    from
        x1;
""")

┌─────────┬─────────────────────────────┬──────────┐
│   id    │        company_name         │ postcode │
│  int64  │           varchar           │ varchar  │
├─────────┼─────────────────────────────┼──────────┤
│  258194 │ shanti hospitality          │ SW1P 2PN │
│ 2090119 │ mywebtonet webhosting       │ PO18 8EN │
│ 1568046 │ medina spares               │ BB7 1QD  │
│ 2983001 │ astronova                   │ SL6 3RT  │
│  459540 │ progressive motorsport      │ NN13 7ES │
│ 3108194 │ soltechsupply               │ CV31 1LW │
│ 2572987 │ cwt commodity logistics     │ RM18 7EB │
│ 3362460 │ western air ducts           │ BA11 2FD │
│ 2461809 │ anglo italian enterprises   │ W1G 8NP  │
│ 1551069 │ meir australia              │ EC4V 4BE │
│    ·    │       ·                     │    ·     │
│    ·    │       ·                     │    ·     │
│    ·    │       ·                     │    ·     │
│ 3429276 │ towerbrook capital partners │ SW1Y 4AH │
│ 2782615 │ transportify                │ IP2 

In [39]:
sec_df2 = duckdb.sql(f"""
    select
        *
        replace (unnest(secondary_names) as secondary_names)
    from
        sec_df;
""")

In [40]:
sec_df3 = duckdb.sql(f"""
    select
        *
        replace ({clean_company_name("secondary_names")} as secondary_names)
    from
        sec_df2;
""")

In [43]:
sec_df4 = duckdb.sql(f"""
    select
        *
        replace (
            {
                list_join_to_string(
                    array_except("secondary_names", stopwords)
                )
            }
            as secondary_names
        )
    from
        sec_df3;
""")

In [50]:
sec_df5 = duckdb.sql(f"""
    select
        *
        replace (list(secondary_names) as secondary_names)
    from
        sec_df4
    group by all;
""")

In [5]:
def array_except(input_col_name, terms_to_remove):
    return rf"""
    array_filter(
        {input_col_name},
        x -> not array_contains({terms_to_remove}, x)
    )
    """

In [25]:
df2 = duckdb.sql(f"""
    select
        *
        replace ({clean_company_name("company_name")} as company_name)
    from
        df;
""")

In [11]:
df3 = duckdb.sql(f"""
    select
        *
        replace (
            {
                list_join_to_string(
                    array_except("company_name", stopwords)
                )
            }
            as company_name
        )
    from
        df2;
""")

In [9]:
df3

┌─────────┬──────────────────────────────────────────────┬──────────┐
│   id    │                 company_name                 │ postcode │
│  int64  │                   varchar                    │ varchar  │
├─────────┼──────────────────────────────────────────────┼──────────┤
│ 2720694 │ lloyd julian                                 │ NR6 7GA  │
│  647217 │ niels larsen                                 │ WF5 0HP  │
│  505204 │ churchill fire                               │ EC2A 3QR │
│  618395 │ buzz pinky                                   │ PO9 2NA  │
│ 3361781 │ t f tull                                     │ WD18 8RH │
│  650314 │ vct                                          │ GU24 8HU │
│ 2310276 │ showerdrape std                              │ M17 1DB  │
│  249534 │ maquet                                       │ NE35 9PZ │
│ 2321202 │ fiera capital iom                            │ IM1 1EU  │
│ 2893212 │ nature s buddy                               │ SW17 0QF │
│    ·    │       · 

In [None]:
sql_clean_company_name = f"""
    select
        {clean_company_name(primary_col)} as company_name_arr,
        {
            f"{clean_company_name(secondary_col)} as secondary_names_arr, "
            if secondary_col
            else ""
        }
        *
    from df
"""
names_cleaned = duckdb.sql(sql_clean_company_name) 

In [None]:
"function": clean_comp_names,
"arguments": {
    "primary_col": "company_name",
    "secondary_col": None,
    "stopwords": stopwords,
},

In [None]:
"function": clean_comp_names,
"arguments": {
    "primary_col": "company_name",
    "secondary_col": None,
    "stopwords": stopwords,
},

In [None]:
def clean_comp_names(
    df, primary_col: str, secondary_col: str = None, stopwords: str = stopwords
):
    """
    Lower case, remove punctuation & tokenise the primary company name into an array.
    Extract tokens into: 'unusual' and 'stopwords'. Dedupe. Sort alphabetically.
    Untokenise the unusual words back to a string.

    Args:
        df: a dataframe
        primary_col: a column containing the company's main name
        secondary_col: a column containing an array of the company's
            secondary names
        stopwords: a list of stopwords to use for this clean
    Returns:
        dataframe: company number, 'unusual' tokens', most common 3 tokens,
            most common 4 to 6 tokens, list of previous names of company, postcode.
    """

    # TODO: Refactor the silly nested f-strings

    # CLEAN and TOKENISE
    # To a new dataframe
    sql_clean_company_name = f"""
    select
        {clean_company_name(primary_col)} as company_name_arr,
        {
            f"{clean_company_name(secondary_col)} as secondary_names_arr, "
            if secondary_col
            else ""
        }
        *
    from df
    """
    names_cleaned = duckdb.sql(sql_clean_company_name)  # noqa:F841

    # Define STOPWORDS
    # And join them in
    stopword_tokens = pd.DataFrame({"token_array": [stopwords]})  # noqa:F841
    sql_companies_arr_with_top = """
    select
        *,
        (select * from stopword_tokens) as stopwords
    from names_cleaned
    """
    with_common_terms = duckdb.sql(sql_companies_arr_with_top)  # noqa:F841

    # EXTRACT the UNUSUAL and STOPWORD tokens
    # We want the weird stuff from company names
    # TODO: leave name_unusual_tokens (and secondary...) as array & remove split() below
    def secondary_name_unusual_tokens():
        # DuckDB needs a refactor, sorry
        return list_join_to_string(array_except("secondary_names_arr", "stopwords"))

    def cat_names_tokens_stopwords(primary_arr, secondary_arr, stopwords):
        # DuckDB needs a refactor, sorry
        # return array_intersect("secondary_names_arr", "stopwords")
        primary = rf"{array_intersect(primary_arr, stopwords)}"
        secondary = rf"{array_intersect(primary_arr, stopwords)}"

        if secondary_arr:
            return rf"""
                array_cat(
                    {primary},
                    {secondary}
                )
            """
        else:
            return rf"{primary}"

    sql_manipulate_arrays = f"""
    select
        *,
        {
            list_join_to_string(
                array_except("company_name_arr", "stopwords")
            )
        }
            as name_unusual_tokens,
        {
            (
                f"{secondary_name_unusual_tokens()} "
                "as secondary_name_unusual_tokens"
            )
            if secondary_col
            else ""
        }
        {
            cat_names_tokens_stopwords(
                "company_name_arr",
                "secondary_names_arr",
                stopwords
            )
        } as names_tokens_stopwords
    from with_common_terms
    """
    clean = duckdb.sql(sql_manipulate_arrays)

    clean_df = clean.df()

    # DEDUPE names_tokens_stopwords
    clean_df["name_unusual_tokens"] = clean_df.name_unusual_tokens.apply(
        lambda x: " ".join(sorted(set(x.split()))) if pd.notnull(x) else x
    )
    if secondary_col:
        clean_df[
            "secondary_name_unusual_tokens"
        ] = clean_df.secondary_name_unusual_tokens.apply(
            lambda x: " ".join(sorted(set(x.split()))) if pd.notnull(x) else x
        )

    clean_df["names_tokens_stopwords"] = clean_df.names_tokens_stopwords.apply(
        lambda x: " ".join(set(x))
    )

    # Get HEAD and TAIL characters
    # For blocking rules
    clean_df["name_unusual_tokens_first5"] = clean_df.name_unusual_tokens.str[:5]
    clean_df["name_unusual_tokens_last5"] = clean_df.name_unusual_tokens.str[-5:]

    return clean_df