In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
InteractiveShell.ast_node_interactivity = "all"

# 🧹Cleaning cleaning functions

The company name cleaning function I've been working with explodes to 30GB in memory. It seriously shouldn't. Worth a refactor.

In [2]:
from src import locations as loc
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.link.splink_linker import SplinkLinker
from src.config import link_pipeline, stopwords
from src.features.clean_complex import clean_comp_names
from src.features.clean_basic import clean_company_name, list_join_to_string

import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

from dotenv import load_dotenv, find_dotenv
from pathlib import Path
import os
import duckdb

dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/jovyan/company-matching/notebooks, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/jovyan/company-matching/notebooks, universal_newlines=False, shell=None, istream=None)


True

## Setup

Grab some data.

In [3]:
star = Star(
    schema = os.getenv("SCHEMA"),
    table = os.getenv("STAR_TABLE")
)
probabilities = Probabilities(
    schema = os.getenv("SCHEMA"),
    table = os.getenv("PROBABILITIES_TABLE"),
    star = star
)
clusters = Clusters(
    schema = os.getenv("SCHEMA"),
    table = os.getenv("CLUSTERS_TABLE"),
    star = star
)
cl_x_exp=SplinkLinker.load(
    path=Path(loc.DATA_SUBDIR['raw'], 'ch_x_exp.pickle')
)

In [None]:
cl_x_exp = SplinkLinker(
    dataset = Dataset(
        star_id=54717,
        star=star
    ), 
    probabilities=probabilities, 
    clusters=clusters, 
    n=2
)
cl_x_exp.get_data(
    cluster_select={
        '"companieshouse"."companies"': [
            "company_name as company_name",
            "postcode as postcode"
        ]
    },
    dim_select=[
        "id",
        "company_name",
        "postcode"
    ]
)
cl_x_exp.save(path=Path(loc.DATA_SUBDIR['raw'], 'ch_x_exp.pickle'))

## Experiments

In [4]:
df = cl_x_exp.dim_raw.sample(int(1e4))

What does this function actually do?

* Standard clean of company name, returns tokens in an array
* Standard clean of an array of company's second names -- this as array of arrays, presumably
* Removes stopwords from the cleaned names
    * By joining in the stopwords to EVERY ROW
* Adds lists of terms removed etc (with pandas functions)

I think we can make it way more efficient by overwriting columns, keeping it in duckdb, and ditching columns that aren't needed in prod.

In [5]:
sec_df = duckdb.sql("""
    select
        *,
        [company_name, company_name] as secondary_names
    from
        df;
""")

In [6]:
def array_except(input_col_name, terms_to_remove):
    return rf"""
    array_filter(
        {input_col_name},
        x -> not array_contains({terms_to_remove}, x)
    )
    """

In [7]:
def clean_comp_names(
    df, primary_col: str, secondary_col: str = None, stopwords: str = stopwords
):
    clean_primary_sql = f"""
        select
            *
            replace ({clean_company_name(primary_col)} as {primary_col})
        from
            df;
    """
    stopwords_primary_sql = f"""
        select
            *
            replace (
                {list_join_to_string(
                    array_except(primary_col, stopwords)
                )}
                as {primary_col}
            )
        from
            df;
    """
    
    if secondary_col is not None:
        unnest_sql = f"""
            select
                *
                replace (unnest({secondary_col}) as {secondary_col})
            from
                df;
        """
        clean_secondary_sql = f"""
            select
                *
                replace ({clean_company_name(secondary_col)} as {secondary_col})
            from
                df;
        """
        stopwords_secondary_sql = f"""
            select
                *
                replace (
                    {list_join_to_string(
                        array_except(secondary_col, stopwords)
                    )}
                    as {secondary_col}
                )
            from
                df;
        """
        renest_sql = f"""
            select
                *
                replace (list({secondary_col}) as {secondary_col})
            from
                df
            group by all;
        """
        to_run = [
            unnest_sql, 
            clean_secondary_sql, 
            stopwords_secondary_sql, 
            renest_sql,
            clean_primary_sql,
            stopwords_primary_sql
        ]
    else:
        to_run = [
            clean_primary_sql,
            stopwords_primary_sql
        ]

    for sql in to_run:
        df = duckdb.sql(sql)

    return df

TODO: this gives kernel death with no reason why. Look into it

In [None]:
clean_comp_names(
    df,
    primary_col="company_name",
    secondary_col=None,
    stopwords=stopwords
)

In [8]:
df

Unnamed: 0,id,company_name,postcode
177873,2657918,PORTOBELLO FASHION LTD,NW10 7LF
126049,677887,KINGSWOOD FRAMES AND MIRRORS/FRAMEWORK STUDIO LTD,SY21 8JF
58474,3155979,DAVID JOHN WRIGHT,SA18 3LF
21351,1049549,A V ENGINEERING SERVICES LTD,SG8 6DN
193354,2233514,ROHM (GREAT BRITAIN) LIMITED,KT2 6HH
...,...,...,...
146534,2017015,METALLIMONSTERS LTD,HU16 5DL
250358,1248660,WILLIAM GIBBONS AND SONS LIMITED,WV13 3XT
138536,3339364,MAF S.R.L. UNIPERSONALE,AB10 1ZP
204664,2621858,SIDDEQ AHMED MOHAMMED,LE1 2LT


In [9]:
primary_col = "company_name"
clean_primary_sql = f"""
    select
        *
        replace ({clean_company_name(primary_col)} as {primary_col})
    from
        df;
"""
stopwords_primary_sql = f"""
    select
        *
        replace (
            {list_join_to_string(
                array_except(primary_col, stopwords)
            )}
            as {primary_col}
        )
    from
        df;
"""

In [10]:
duckdb.sql(clean_primary_sql)

┌─────────┬───────────────────────────────────────────────────────────────┬──────────┐
│   id    │                         company_name                          │ postcode │
│  int64  │                           varchar[]                           │ varchar  │
├─────────┼───────────────────────────────────────────────────────────────┼──────────┤
│ 2657918 │ [portobello, fashion, limited]                                │ NW10 7LF │
│  677887 │ [kingswood, frames, and, mirrors, framework, studio, limited] │ SY21 8JF │
│ 3155979 │ [david, john, wright]                                         │ SA18 3LF │
│ 1049549 │ [a, v, engineering, services, limited]                        │ SG8 6DN  │
│ 2233514 │ [rohm, great, britain, limited]                               │ KT2 6HH  │
│ 2564275 │ [alan, jamieson, site, services, limited]                     │ BR6 6HR  │
│ 1135889 │ [studio, wayne, mcgregor, limited]                            │ E15 2GW  │
│ 1919098 │ [p, k, veneering, limited]     

In [11]:
duckdb.sql(stopwords_primary_sql)

BinderException: Binder Error:  Invalid LIST argument to array_filter!

In [39]:
sec_df2 = duckdb.sql(f"""
    select
        *
        replace (unnest(secondary_names) as secondary_names)
    from
        sec_df;
""")

In [40]:
sec_df3 = duckdb.sql(f"""
    select
        *
        replace ({clean_company_name("secondary_names")} as secondary_names)
    from
        sec_df2;
""")

In [43]:
sec_df4 = duckdb.sql(f"""
    select
        *
        replace (
            {
                list_join_to_string(
                    array_except("secondary_names", stopwords)
                )
            }
            as secondary_names
        )
    from
        sec_df3;
""")

In [50]:
sec_df5 = duckdb.sql(f"""
    select
        *
        replace (list(secondary_names) as secondary_names)
    from
        sec_df4
    group by all;
""")

In [5]:
def array_except(input_col_name, terms_to_remove):
    return rf"""
    array_filter(
        {input_col_name},
        x -> not array_contains({terms_to_remove}, x)
    )
    """

In [25]:
df2 = duckdb.sql(f"""
    select
        *
        replace ({clean_company_name("company_name")} as company_name)
    from
        df;
""")

In [11]:
df3 = duckdb.sql(f"""
    select
        *
        replace (
            {
                list_join_to_string(
                    array_except("company_name", stopwords)
                )
            }
            as company_name
        )
    from
        df2;
""")

In [9]:
df3

┌─────────┬──────────────────────────────────────────────┬──────────┐
│   id    │                 company_name                 │ postcode │
│  int64  │                   varchar                    │ varchar  │
├─────────┼──────────────────────────────────────────────┼──────────┤
│ 2720694 │ lloyd julian                                 │ NR6 7GA  │
│  647217 │ niels larsen                                 │ WF5 0HP  │
│  505204 │ churchill fire                               │ EC2A 3QR │
│  618395 │ buzz pinky                                   │ PO9 2NA  │
│ 3361781 │ t f tull                                     │ WD18 8RH │
│  650314 │ vct                                          │ GU24 8HU │
│ 2310276 │ showerdrape std                              │ M17 1DB  │
│  249534 │ maquet                                       │ NE35 9PZ │
│ 2321202 │ fiera capital iom                            │ IM1 1EU  │
│ 2893212 │ nature s buddy                               │ SW17 0QF │
│    ·    │       · 

In [None]:
sql_clean_company_name = f"""
    select
        {clean_company_name(primary_col)} as company_name_arr,
        {
            f"{clean_company_name(secondary_col)} as secondary_names_arr, "
            if secondary_col
            else ""
        }
        *
    from df
"""
names_cleaned = duckdb.sql(sql_clean_company_name) 

In [None]:
"function": clean_comp_names,
"arguments": {
    "primary_col": "company_name",
    "secondary_col": None,
    "stopwords": stopwords,
},

In [None]:
"function": clean_comp_names,
"arguments": {
    "primary_col": "company_name",
    "secondary_col": None,
    "stopwords": stopwords,
},

In [None]:
def clean_comp_names(
    df, primary_col: str, secondary_col: str = None, stopwords: str = stopwords
):
    """
    Lower case, remove punctuation & tokenise the primary company name into an array.
    Extract tokens into: 'unusual' and 'stopwords'. Dedupe. Sort alphabetically.
    Untokenise the unusual words back to a string.

    Args:
        df: a dataframe
        primary_col: a column containing the company's main name
        secondary_col: a column containing an array of the company's
            secondary names
        stopwords: a list of stopwords to use for this clean
    Returns:
        dataframe: company number, 'unusual' tokens', most common 3 tokens,
            most common 4 to 6 tokens, list of previous names of company, postcode.
    """

    # TODO: Refactor the silly nested f-strings

    # CLEAN and TOKENISE
    # To a new dataframe
    sql_clean_company_name = f"""
    select
        {clean_company_name(primary_col)} as company_name_arr,
        {
            f"{clean_company_name(secondary_col)} as secondary_names_arr, "
            if secondary_col
            else ""
        }
        *
    from df
    """
    names_cleaned = duckdb.sql(sql_clean_company_name)  # noqa:F841

    # Define STOPWORDS
    # And join them in
    stopword_tokens = pd.DataFrame({"token_array": [stopwords]})  # noqa:F841
    sql_companies_arr_with_top = """
    select
        *,
        (select * from stopword_tokens) as stopwords
    from names_cleaned
    """
    with_common_terms = duckdb.sql(sql_companies_arr_with_top)  # noqa:F841

    # EXTRACT the UNUSUAL and STOPWORD tokens
    # We want the weird stuff from company names
    # TODO: leave name_unusual_tokens (and secondary...) as array & remove split() below
    def secondary_name_unusual_tokens():
        # DuckDB needs a refactor, sorry
        return list_join_to_string(array_except("secondary_names_arr", "stopwords"))

    def cat_names_tokens_stopwords(primary_arr, secondary_arr, stopwords):
        # DuckDB needs a refactor, sorry
        # return array_intersect("secondary_names_arr", "stopwords")
        primary = rf"{array_intersect(primary_arr, stopwords)}"
        secondary = rf"{array_intersect(primary_arr, stopwords)}"

        if secondary_arr:
            return rf"""
                array_cat(
                    {primary},
                    {secondary}
                )
            """
        else:
            return rf"{primary}"

    sql_manipulate_arrays = f"""
    select
        *,
        {
            list_join_to_string(
                array_except("company_name_arr", "stopwords")
            )
        }
            as name_unusual_tokens,
        {
            (
                f"{secondary_name_unusual_tokens()} "
                "as secondary_name_unusual_tokens"
            )
            if secondary_col
            else ""
        }
        {
            cat_names_tokens_stopwords(
                "company_name_arr",
                "secondary_names_arr",
                stopwords
            )
        } as names_tokens_stopwords
    from with_common_terms
    """
    clean = duckdb.sql(sql_manipulate_arrays)

    clean_df = clean.df()

    # DEDUPE names_tokens_stopwords
    clean_df["name_unusual_tokens"] = clean_df.name_unusual_tokens.apply(
        lambda x: " ".join(sorted(set(x.split()))) if pd.notnull(x) else x
    )
    if secondary_col:
        clean_df[
            "secondary_name_unusual_tokens"
        ] = clean_df.secondary_name_unusual_tokens.apply(
            lambda x: " ".join(sorted(set(x.split()))) if pd.notnull(x) else x
        )

    clean_df["names_tokens_stopwords"] = clean_df.names_tokens_stopwords.apply(
        lambda x: " ".join(set(x))
    )

    # Get HEAD and TAIL characters
    # For blocking rules
    clean_df["name_unusual_tokens_first5"] = clean_df.name_unusual_tokens.str[:5]
    clean_df["name_unusual_tokens_last5"] = clean_df.name_unusual_tokens.str[-5:]

    return clean_df