In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell execution so edits to imported code take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Display the value of every top-level expression in a cell, not only the
# last one. Cells further down with several bare expressions rely on this.
InteractiveShell.ast_node_interactivity = "all"

# Deterministic linker

A place to fix and test the deterministic linker.

In [160]:
from src import locations as loc
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.data.validation import Validation
from src.link.deterministic_linker import DeterministicLinker
from src.features.clean_complex import duckdb_cleaning_factory
from src.features.clean_basic_original import (
    cms_original_clean_company_name_general,
    cms_original_clean_company_name_ch,
    cms_original_clean_postcode,
    cms_original_clean_email,
    cms_original_clean_ch_id,
    cms_original_clean_cdms_id
)

from dotenv import load_dotenv, find_dotenv
import os
import duckdb
from pathlib import Path

# Locate a .env file and load its variables into the process environment,
# so the os.getenv(...) lookups used for schema/table names can resolve.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path=dotenv_path)

True

In [110]:
# Every table handle below shares one schema; schema and table names are read
# from environment variables rather than being hard-coded in the notebook.
SCHEMA = os.getenv("SCHEMA")

star = Star(schema=SCHEMA, table=os.getenv("STAR_TABLE"))

# Probabilities and Clusters both take the Star handle as well.
probabilities = Probabilities(
    schema=SCHEMA,
    table=os.getenv("PROBABILITIES_TABLE"),
    star=star,
)
clusters = Clusters(
    schema=SCHEMA,
    table=os.getenv("CLUSTERS_TABLE"),
    star=star,
)

validation = Validation(schema=SCHEMA, table=os.getenv("VALIDATE_TABLE"))

In [161]:
# Dataset to be linked, looked up through the Star handle.
# NOTE(review): 54717 is presumably this dataset's id in the star table -- confirm.
source_dataset = Dataset(star_id=54717, star=star)

# Deterministic linker under test; later cells call get_data / prepare / link /
# evaluate on this object.
cl_x_exp = DeterministicLinker(
    name="n1_deterministic_basic",
    dataset=source_dataset,
    probabilities=probabilities,
    clusters=clusters,
    n=1,
    overwrite=True,
)

In [165]:
# Cluster-side selection: for each source table, the column expressions to pull.
cluster_columns = {
    '"companieshouse"."companies"': [
        "company_name as company_name",
        "postcode as postcode",
    ],
}

# Dimension-side selection: plain column names.
dim_columns = ["id", "company_name", "postcode"]

# A `sample=5` argument was previously passed here and is left disabled.
cl_x_exp.get_data(
    cluster_select=cluster_columns,
    dim_select=dim_columns,
)

In [163]:
def _column_step(function, column):
    """Build one pipeline step that applies `function` to a single column."""
    return {
        "function": function,
        "arguments": {"column": column},
    }


# Wrap the original CMS cleaning rules with the DuckDB cleaning factory.
clean_postcode = duckdb_cleaning_factory(cms_original_clean_postcode)
clean_ch_name = duckdb_cleaning_factory(cms_original_clean_company_name_ch)
clean_gen_name = duckdb_cleaning_factory(cms_original_clean_company_name_general)

# Cluster side: company names use the Companies House-specific cleaner.
cluster_pipeline = {
    "clean_ch_comp_names": _column_step(clean_ch_name, "company_name"),
    "clean_postcode": _column_step(clean_postcode, "postcode"),
}

# Dimension side: company names use the general cleaner.
# NOTE(review): the double underscore in "clean__comp_names" looks like a typo;
# it is kept unchanged here because the key is a runtime string.
dim_pipeline = {
    "clean__comp_names": _column_step(clean_gen_name, "company_name"),
    "clean_postcode": _column_step(clean_postcode, "postcode"),
}

# Column-to-column mapping handed to the linker as its link settings.
link_settings = dict(
    company_name="company_name",
    postcode="postcode",
)

## Full evaluation

In [166]:
# The experiment name is used both for the link experiment and as a folder in
# the report path.
experiment_name = "cm_hmrc-trade-exporters"

# Free-text description passed to evaluate() with the run.
evaluation_notes = """
        - Deterministic name/postcode
        - Cleaned name as per existing CMS rules
    """

# Reports go under <project>/scratch/reports/<experiment>/deterministic.
report_path = (
    Path(loc.PROJECT_DIR)
    / 'scratch'
    / 'reports'
    / experiment_name
    / 'deterministic'
)

# evaluate() receives the prepare and link arguments as kwargs dicts; results
# are logged both to MLflow and as output.
cl_x_exp.evaluate(
    link_experiment=experiment_name,
    evaluation_description=evaluation_notes,
    prepare_kwargs=dict(
        cluster_pipeline=cluster_pipeline,
        dim_pipeline=dim_pipeline,
        link_settings=link_settings,
    ),
    link_kwargs=dict(),
    report_dir=report_path,
    log_mlflow=True,
    log_output=True,
)

INFO:src.link.linker:Running pipeline
INFO:src.link.linker:Logging outputs to the Probabilities table
INFO:src.link.linker:Logging as MLflow experiment
DEBUG:urllib3.connectionpool:Resetting dropped connection: mlflow--data-science.data.trade.gov.uk
DEBUG:urllib3.connectionpool:http://mlflow--data-science.data.trade.gov.uk:8004 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=cm_hmrc-trade-exporters HTTP/1.1" 200 245
DEBUG:urllib3.connectionpool:Resetting dropped connection: mlflow--data-science.data.trade.gov.uk
DEBUG:urllib3.connectionpool:http://mlflow--data-science.data.trade.gov.uk:8004 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=cm_hmrc-trade-exporters HTTP/1.1" 200 245
DEBUG:urllib3.connectionpool:Resetting dropped connection: mlflow--data-science.data.trade.gov.uk
DEBUG:urllib3.connectionpool:http://mlflow--data-science.data.trade.gov.uk:8004 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 1095
DEBUG:urllib3.connectionpool:Resetting dropped connecti

## Prepare data

In [148]:
# Run the prepare step on its own, with the same pipelines and link settings
# that the evaluate() cell supplies through prepare_kwargs.
prepare_args = {
    "cluster_pipeline": cluster_pipeline,
    "dim_pipeline": dim_pipeline,
    "link_settings": link_settings,
}
cl_x_exp.prepare(**prepare_args)

In [149]:
# Preview the first five rows of each processed table: dimension side first,
# then cluster side. Both previews render because the setup cell sets
# InteractiveShell.ast_node_interactivity to "all".
cl_x_exp.dim_processed.head(5)
cl_x_exp.cluster_processed.head(5)

Unnamed: 0,id,company_name,postcode
0,1189194,1stcalllockkeyshop,b302bt
1,3490615,1stcallmobility,lu55xf
2,2303259,1stcallmobility,lu55xf
3,571726,1stcallmobility,cm195ar
4,343700,1stcarimports,dn91hs


Unnamed: 0,id,company_name,postcode
0,e4607fd5-11d2-4746-b850-480808029c4f,itresident,e20lt
1,3c4498ec-6525-405e-b057-c0ab7182268d,klinelngshippinguk,ec2v7bp
2,b96f4500-7b62-4fcb-8df4-edb978a80632,mnapropertydundee,dd54ra
3,0b305d2d-dc85-462a-a860-a5b95bfce4ef,phoenixmanagement,en48re
4,f51d7369-2fa3-4789-993e-647c3eb80c24,step13transport,b170nl


## Link data

In [150]:
# Run the link step on the prepared data with output logging switched on.
# NOTE(review): presumably this writes the links to the probabilities table -- confirm.
cl_x_exp.link(log_output=True)

Unnamed: 0,cluster,id,source,probability,uuid,link_type,model
0,1d7f4b26-189a-4a2b-9b8b-489baeffabfb,2276819,54717,1,75ea98b2-c13e-479f-a9f1-817f48956ba3,link,n1_deterministic_basic
1,690b86e1-9012-49da-99a8-ecd08a1a7e10,2286438,54717,1,23af35aa-fb04-4576-9458-455061950618,link,n1_deterministic_basic
2,4a64ee28-e3dc-4128-9642-24a22f678495,872651,54717,1,622522dc-e2c3-412b-8e7f-0d8f705d0599,link,n1_deterministic_basic
3,272d50a7-5304-4d2f-b6aa-f52549e940e1,3517005,54717,1,b626b7d1-a402-4b89-846e-b3abf9cc954b,link,n1_deterministic_basic
4,b3082628-24c5-4f1f-a0bb-95bc20361271,2719390,54717,1,6e32b13a-301c-4ddb-8ce8-8bfd4db04ebb,link,n1_deterministic_basic
...,...,...,...,...,...,...,...
197,816d2e0c-e25e-4576-a36e-6771dcd63475,3059580,54717,1,a4f2ed80-a0c7-4ddb-85bc-8532c773cb55,link,n1_deterministic_basic
198,b9558b7c-cb05-424b-b2a7-32df83ff0415,59699,54717,1,2cedbbc6-fc4f-410e-8940-14e22900c57b,link,n1_deterministic_basic
199,8cdfc90d-ff6b-4281-8dd7-b601f137f991,3205740,54717,1,13be15b6-efe4-414b-a47a-6d54b0973e8b,link,n1_deterministic_basic
200,6ab6d3ec-dbc0-4083-b1c3-c84eeb9e6f7d,2423854,54717,1,2640b8fb-eb02-499b-aacd-233547a14fce,link,n1_deterministic_basic


In [151]:
# Read the probabilities table back and count rows per model name.
x = probabilities.read()
x["model"].value_counts()

exp_n2_splink_basic       164269
n1_deterministic_basic       202
Name: model, dtype: int64

In [170]:
# Number of distinct model names present in the probabilities table.
x["model"].nunique()

2