In [1]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Render the value of every top-level expression in a cell, not only the last
# one; cells that end with several bare expressions depend on this setting.
InteractiveShell.ast_node_interactivity = "all"

# ExistingCMSPlus linker

A place to fix and test the existing CMS+ linker.

In [2]:
from cmf import locations as loc
from cmf.data import utils as du
from cmf.data.star import Star
from cmf.data.datasets import Dataset
from cmf.data.probabilities import Probabilities
from cmf.data.clusters import Clusters
from cmf.data.validation import Validation
from cmf.link.existingservice_linker import ExistingCMSPlusLinker
from cmf.features.clean_complex import duckdb_cleaning_factory
from cmf.features.clean_basic_original import (
    cms_original_clean_company_name_general,
    cms_original_clean_company_name_ch,
    cms_original_clean_postcode,
    cms_original_clean_email,
    cms_original_clean_ch_id,
    cms_original_clean_cdms_id
)

from dotenv import load_dotenv, find_dotenv
import os
import duckdb
from pathlib import Path
import pandas as pd

# Locate a .env file and load its variables into the process environment;
# the os.getenv(...) lookups in later cells read the values loaded here.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/jovyan/company-matching/notebooks, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/jovyan/company-matching/notebooks, universal_newlines=False, shell=None, istream=None)


True

In [3]:
# Every table handle uses the same schema, so look it up once. Schema and
# table names are read from environment variables.
db_schema = os.getenv("SCHEMA")

star = Star(schema=db_schema, table=os.getenv("STAR_TABLE"))

# Probabilities and clusters are both constructed with a reference to the star.
probabilities = Probabilities(
    schema=db_schema,
    table=os.getenv("PROBABILITIES_TABLE"),
    star=star,
)
clusters = Clusters(
    schema=db_schema,
    table=os.getenv("CLUSTERS_TABLE"),
    star=star,
)

# The validation handle is built without a star reference.
validation = Validation(schema=db_schema, table=os.getenv("VALIDATE_TABLE"))

In [4]:
# Dataset handed to the linker, selected via star_id 54717.
# NOTE(review): presumably 54717 is a row id in the star table — confirm.
linker_dataset = Dataset(star_id=54717, star=star)

# Linker under test; it is given the probabilities and clusters handles.
cl_x_exp = ExistingCMSPlusLinker(
    name="n1_cms_basic",
    dataset=linker_dataset,
    probabilities=probabilities,
    clusters=clusters,
    n=1,
    overwrite=True,
)

In [5]:
# Cluster-side fields to fetch, keyed by the quoted schema/table they come from.
cluster_columns = {
    '"companieshouse"."companies"': [
        "company_name as company_name",
        "postcode as postcode",
    ],
}

# Dimension-side fields to fetch.
dimension_columns = ["id", "company_name", "postcode"]

# NOTE(review): a `sample=5` argument was left commented out in this call —
# presumably for quick runs on a small sample; confirm before re-enabling.
cl_x_exp.get_data(
    cluster_select=cluster_columns,
    dim_select=dimension_columns,
)

In [6]:
def _column_step(function, column):
    """Build one pipeline step that applies `function` to a single `column`.

    A fresh dict is returned on every call so no step is shared between
    pipelines.
    """
    return {"function": function, "arguments": {"column": column}}


# Wrap the original CMS cleaning rules with the DuckDB cleaning factory.
clean_postcode = duckdb_cleaning_factory(cms_original_clean_postcode)
clean_ch_name = duckdb_cleaning_factory(cms_original_clean_company_name_ch)
clean_gen_name = duckdb_cleaning_factory(cms_original_clean_company_name_general)

# Cluster side: Companies House name cleaning plus postcode cleaning.
cluster_pipeline = {
    "clean_ch_comp_names": _column_step(clean_ch_name, "company_name"),
    "clean_postcode": _column_step(clean_postcode, "postcode"),
}

# Dimension side: general name cleaning plus postcode cleaning.
dim_pipeline = {
    "clean__comp_names": _column_step(clean_gen_name, "company_name"),
    "clean_postcode": _column_step(clean_postcode, "postcode"),
}

# Each field is matched against the same-named column on both sides;
# company_name carries twice the weight of postcode.
link_settings = {
    field: {"cluster": field, "dimension": field, "weight": weight}
    for field, weight in (("company_name", 2), ("postcode", 1))
}

## Full evaluation

In [7]:
# The experiment name is also used as a folder name in the report path.
experiment_name = "cm_hmrc-trade-exporters"
report_dir = (
    Path(loc.PROJECT_DIR) / "scratch" / "reports" / experiment_name / "existing"
)

# Same pipelines and settings that the standalone prepare() cell uses.
prepare_kwargs = dict(
    cluster_pipeline=cluster_pipeline,
    dim_pipeline=dim_pipeline,
    link_settings=link_settings,
)

# Full evaluation run. The captured run log shows outputs being written to the
# Probabilities table and the run being recorded as an MLflow experiment.
cl_x_exp.evaluate(
    link_experiment=experiment_name,
    evaluation_description="""
        - Existing CMS for name/postcode
        - Cleaned name as per existing CMS rules
        - Name match double weighted
    """,
    prepare_kwargs=prepare_kwargs,
    link_kwargs={"threshold": 0.5},
    report_dir=report_dir,
    log_mlflow=True,
    log_output=True,
)

INFO:src.link.linker:Running pipeline
INFO:src.link.linker:Logging outputs to the Probabilities table
INFO:src.link.linker:Logging as MLflow experiment
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): mlflow--data-science.data.trade.gov.uk:8004
DEBUG:urllib3.connectionpool:http://mlflow--data-science.data.trade.gov.uk:8004 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=cm_hmrc-trade-exporters HTTP/1.1" 200 245
DEBUG:urllib3.connectionpool:Resetting dropped connection: mlflow--data-science.data.trade.gov.uk
DEBUG:urllib3.connectionpool:http://mlflow--data-science.data.trade.gov.uk:8004 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=cm_hmrc-trade-exporters HTTP/1.1" 200 245
DEBUG:git.util:Failed checking if running in CYGWIN due to: FileNotFoundError(2, 'No such file or directory')
DEBUG:urllib3.connectionpool:Resetting dropped connection: mlflow--data-science.data.trade.gov.uk
DEBUG:urllib3.connectionpool:http://mlflow--data-science.data.trade.g

## Prepare data

In [46]:
# Run the preparation step on its own, with the same pipelines and link
# settings that evaluate() receives through prepare_kwargs.
cl_x_exp.prepare(
    cluster_pipeline=cluster_pipeline,
    dim_pipeline=dim_pipeline,
    link_settings=link_settings
)

In [47]:
# Preview the first rows of the processed dimension and cluster tables.
# Both frames render because ast_node_interactivity is set to "all" in the
# first cell; with the IPython default only the last one would be shown.
cl_x_exp.dim_processed.head(5)
cl_x_exp.cluster_processed.head(5)

Unnamed: 0,id,company_name,postcode
0,2937257,194localvtg,e16sa
1,2124828,1953,ec1y8jl
2,2570378,1953,sw32er
3,1537512,195mphlmited,s92tj
4,3359348,195mphlmited,dn227wf


Unnamed: 0,id,company_name,postcode
0,09bc545d-236a-4d15-9073-49ddabdc4b50,109uptonroadmanagement,ca89lz
1,0d4d985a-c15b-43e2-8142-c8f868025e1a,1093edin,g403px
2,cb9118f3-556c-4b1d-b3b5-ceaaa29993b8,1094874,s11wf
3,1016c07d-2330-4076-ab7d-21ab6900a431,109london,ss11eg
4,c0e587ab-4b94-4c70-bcd0-b57b8c64ca21,1010games,bl14qr


## Link data

In [48]:
# Run the link step on its own.
# NOTE(review): no threshold is passed here, whereas the evaluate() call
# uses link_kwargs={"threshold": 0.5} — confirm the default is intended.
cl_x_exp.link(
    log_output=True
)

Unnamed: 0,cluster,id,source,probability,uuid,link_type,model
0,35f7dbe2-2eb1-4df7-b50f-041cbc5d246e,1077171,54717,0.333333,029e3732-4154-4cb2-98b4-8e64e1196c31,link,n1_cms_basic
1,f33a7ab9-85b2-482e-8c02-5ca791ba76fc,1077171,54717,0.333333,a9ed34c6-6018-496f-9617-3c62ac0b36cd,link,n1_cms_basic
2,524e2a90-bc77-4b3d-992b-53ecd6dbc792,1077171,54717,0.333333,0b6f4cf7-aecc-4f90-93ed-0bd7e0e21922,link,n1_cms_basic
3,b2c53705-e74c-47ef-9b32-94cb941ba393,1077171,54717,0.333333,4910d8b9-ee29-4a9a-a873-a2135c27d3f0,link,n1_cms_basic
4,e9c1fd4e-b2ec-489a-8c94-5cdcc7a7ac1e,1077171,54717,0.333333,f930bf59-dea7-4bb3-afb3-f0fd72a244f8,link,n1_cms_basic
...,...,...,...,...,...,...,...
245554,0e90228b-bde8-4f8b-a1db-cdbf233475a7,2260542,54717,0.333333,03a1ff06-dda2-4c8f-8696-bf62218fdf70,link,n1_cms_basic
245555,26383f41-63d7-400d-97c0-3703b14584c5,2480128,54717,0.333333,a6dbf884-a09b-46d1-9cc4-64f3b587c241,link,n1_cms_basic
245556,8f6d6a90-a62e-4fc3-bcd1-cc6832c6f218,2480128,54717,0.333333,d6f48469-11d3-4765-9e0d-852323f82c6e,link,n1_cms_basic
245557,261972d3-5c94-4e03-9ff9-e00d4f4099c6,3056055,54717,0.333333,acd959d2-0f04-4084-adc9-93a363bb2d24,link,n1_cms_basic
