In [1]:
# Reload edited modules automatically before each cell executes, so changes to
# the project's `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import IFrame, display
# Echo the value of every top-level expression in a cell, not only the last
# one; the cell that lists several `.shape` attributes relies on this.
InteractiveShell.ast_node_interactivity = "all"

# 🔌 Hybrid additive playground

Just a place to get linkers running.

In [2]:
from src import locations as loc
from src.data import utils as du
from src.data.star import Star
from src.data.datasets import Dataset
from src.data.probabilities import Probabilities
from src.data.clusters import Clusters
from src.link.splink_linker import SplinkLinker
from src.config import link_pipeline, stopwords
from src.features.clean_complex import clean_comp_names

from splink.duckdb.linker import DuckDBLinker
import splink.duckdb.comparison_library as cl
import splink.duckdb.comparison_template_library as ctl

import uuid
import types
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import os
import io
import pandas as pd
import duckdb
import json

# Find the project's .env file by searching upwards from the working directory
# and load its entries into the process environment. The os.getenv(...) calls
# in later cells presumably read values defined there (SCHEMA, *_TABLE).
dotenv_path = find_dotenv()
load_dotenv(dotenv_path=dotenv_path)

DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/jovyan/company-matching/notebooks, universal_newlines=False, shell=None, istream=None)
DEBUG:git.cmd:Popen(['git', 'version'], cwd=/home/jovyan/company-matching/notebooks, universal_newlines=False, shell=None, istream=None)


True

## Setup

In [3]:
# Build the star, probabilities and clusters table wrappers. Schema and table
# names are read from the environment rather than hard-coded in the notebook.
# NOTE: os.getenv returns None for an unset variable, so a missing .env entry
# will only surface when one of these objects is actually used.
star = Star(schema=os.getenv("SCHEMA"), table=os.getenv("STAR_TABLE"))
probabilities = Probabilities(
    schema=os.getenv("SCHEMA"),
    table=os.getenv("PROBABILITIES_TABLE"),
    star=star,
)
clusters = Clusters(
    schema=os.getenv("SCHEMA"),
    table=os.getenv("CLUSTERS_TABLE"),
    star=star,
)

In [4]:
# probabilities.create(overwrite=True)
# clusters.create(dim=1970, overwrite=False)

In [5]:
# Restore a linker from disk instead of rebuilding it; the path is the same one
# the `cl_x_exp.save(...)` cell writes to.
# NOTE(review): the .pickle suffix suggests pickle serialisation -- only load
# files produced by this project, never ones from an untrusted source.
cl_x_exp = SplinkLinker.load(
    path=Path(loc.DATA_SUBDIR["raw"], "ch_x_exp.pickle"),
)

# Quick sanity check: raw and processed frames should have matching shapes.
# Each bare expression is echoed because ast_node_interactivity is "all".
cl_x_exp.dim_raw.shape
cl_x_exp.dim_processed.shape
cl_x_exp.cluster_raw.shape
cl_x_exp.cluster_processed.shape

(256604, 3)

(256604, 3)

(10774831, 3)

(10774831, 3)

## Splink

TODO:

* Write the `linker.link` method
* Run it
* Load it into clusters
* Add `cluster_select`s to the `link_pipeline` in config

At that point I think we'll have enough infrastructure in place to raise a merge request (MR) for all of this: linker, data and all. It won't be a pretty MR, and there will be a lot to chew on. Sorry, reviewer.

In [5]:
# '"hmrc"."trade__exporters"': {
#     "fact": '"hmrc"."trade__exporters"',
#     "key_fields": ["company_name", "address", "postcode"],
#     "dim": f'"{os.getenv("SCHEMA")}"."hmrc_trade__exporters__dim"',
#     "n": 3,
#     "experiment": "cm_hmrc-trade-exporters",
# }

In [4]:
# Build a fresh linker from scratch. This rebinds `cl_x_exp`, replacing any
# instance previously restored with SplinkLinker.load.
# NOTE(review): star_id 54717 is a magic number -- presumably the star-table row
# for "hmrc"."trade__exporters"; confirm and consider naming it as a constant.
cl_x_exp = SplinkLinker(
    dataset=Dataset(star_id=54717, star=star),
    probabilities=probabilities,
    clusters=clusters,
    n=2,
)

  meta = MetaData(self.connectable, schema=schema)


In [5]:
# Fetch both sides of the link: company_name and postcode from the
# "companieshouse"."companies" table for the cluster side, and the id plus the
# same two fields for the dimension side.
cl_x_exp.get_data(
    cluster_select={
        '"companieshouse"."companies"': [
            "company_name as company_name",
            "postcode as postcode",
        ],
    },
    dim_select=["id", "company_name", "postcode"],
)

In [None]:
# Clean both datasets, configure the Splink linker and train it in one call.
#
# Training fix: a single EM session blocked on company_name can never estimate
# the m probabilities of the company_name comparison, because Splink skips any
# comparison whose column appears in the blocking rule. A second EM session
# blocked on postcode is therefore added so that company_name gets trained too.
cl_x_exp.prepare(
    # The same name-cleaning step is applied to the cluster and dimension sides
    # so that company_name values are comparable.
    cluster_pipeline={
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords,
            },
        }
    },
    dim_pipeline={
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords,
            },
        }
    },
    linker_settings={
        "link_type": "link_only",
        "unique_id_column_name": "id",
        "retain_matching_columns": False,
        "retain_intermediate_calculation_columns": False,
        # Candidate pairs must share a non-empty company name or a non-empty
        # postcode; the emptiness guards stop blank values blocking together.
        "blocking_rules_to_generate_predictions": [
            """
                (l.company_name = r.company_name)
                and (
                    l.company_name <> ''
                    and r.company_name <> ''
                )
            """,
            """
                (l.postcode = r.postcode)
                and (
                    l.postcode <> ''
                    and r.postcode <> ''
                )
            """,
        ],
        "comparisons": [
            cl.jaro_winkler_at_thresholds(
                "company_name", [0.9, 0.6], term_frequency_adjustments=True
            ),
            ctl.postcode_comparison("postcode"),
        ],
    },
    # NOTE(review): assumes each step's key is only a label and that "function"
    # names the linker method to call -- confirm against SplinkLinker before
    # relying on the extra EM step below.
    train_pipeline={
        "estimate_probability_two_random_records_match": {
            "function": "estimate_probability_two_random_records_match",
            "arguments": {
                "deterministic_matching_rules": """
                    l.company_name = r.company_name
                """,
                "recall": 0.7,
            },
        },
        "estimate_u_using_random_sampling": {
            "function": "estimate_u_using_random_sampling",
            "arguments": {"max_pairs": 1e6},
        },
        # EM pass 1: blocked on company_name, so it trains postcode only.
        "estimate_parameters_using_expectation_maximisation": {
            "function": "estimate_parameters_using_expectation_maximisation",
            "arguments": {
                "blocking_rule": """
                    l.company_name = r.company_name
                """
            },
        },
        # EM pass 2: blocked on postcode, so it trains company_name.
        "estimate_parameters_using_expectation_maximisation_postcode": {
            "function": "estimate_parameters_using_expectation_maximisation",
            "arguments": {
                "blocking_rule": """
                    l.postcode = r.postcode
                """
            },
        },
    },
)

In [8]:
# Persist the prepared linker so later sessions can restore it with
# SplinkLinker.load instead of repeating the data pull and training.
cl_x_exp.save(
    path=Path(loc.DATA_SUBDIR["raw"], "ch_x_exp.pickle"),
)

## Testing lower-level functions

In [5]:
# Run the cleaning stage on its own, with the same pipelines that are passed to
# `prepare`: one clean_comp_names step per side, acting on company_name.
cl_x_exp._clean_data(
    cluster_pipeline={
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords,
            },
        },
    },
    dim_pipeline={
        "clean_comp_names": {
            "function": clean_comp_names,
            "arguments": {
                "primary_col": "company_name",
                "secondary_col": None,
                "stopwords": stopwords,
            },
        },
    },
)

In [6]:
# Run the private id-substitution step in isolation, as part of testing the
# linker's lower-level functions one at a time.
# NOTE(review): presumably expects `_clean_data` to have run first -- confirm.
cl_x_exp._substitute_ids()

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])  # type: ignore[arg-type]


In [13]:
# def _register_tables(self):
#     self.con.register('cls', self.cluster_processed)
#     self.con.register('dim', self.dim_processed)

# cl_x_exp._register_tables = types.MethodType(_register_tables, cl_x_exp)

In [7]:
# Run the private table-registration step in isolation.
# NOTE(review): judging by the commented-out prototype in this notebook, this
# registers cluster_processed and dim_processed on the linker's connection as
# 'cls' and 'dim' -- confirm against the SplinkLinker implementation.
cl_x_exp._register_tables()

In [5]:
# Build the underlying Splink linker on its own, using the same settings that
# are passed to `prepare`.
cl_x_exp._create_linker(
    linker_settings={
        "link_type": "link_only",
        "unique_id_column_name": "id",
        "retain_matching_columns": False,
        "retain_intermediate_calculation_columns": False,
        # Candidate pairs must share a non-empty company name or a non-empty
        # postcode.
        "blocking_rules_to_generate_predictions": [
            """
                (l.company_name = r.company_name)
                and (
                    l.company_name <> ''
                    and r.company_name <> ''
                )
            """,
            """
                (l.postcode = r.postcode)
                and (
                    l.postcode <> ''
                    and r.postcode <> ''
                )
            """,
        ],
        # Fuzzy match on name (with term-frequency adjustment) plus Splink's
        # templated postcode comparison.
        "comparisons": [
            cl.jaro_winkler_at_thresholds(
                "company_name",
                [0.9, 0.6],
                term_frequency_adjustments=True,
            ),
            ctl.postcode_comparison("postcode"),
        ],
    },
)

In [10]:
# Train the Splink model on its own.
#
# Training fix: the original pipeline ran one EM session blocked on
# company_name. Splink does not estimate m probabilities for comparisons whose
# column is used in the blocking rule, so company_name was never trained (the
# training log lists only postcode as estimated). A second EM session blocked
# on postcode is added to cover company_name.
#
# NOTE(review): assumes each step's key is only a label and that "function"
# names the linker method to call -- confirm against SplinkLinker._train_linker.
cl_x_exp._train_linker(
    train_pipeline={
        "estimate_probability_two_random_records_match": {
            "function": "estimate_probability_two_random_records_match",
            "arguments": {
                "deterministic_matching_rules": """
                    l.company_name = r.company_name
                """,
                "recall": 0.7,
            },
        },
        "estimate_u_using_random_sampling": {
            "function": "estimate_u_using_random_sampling",
            "arguments": {"max_pairs": 1e6},
        },
        # EM pass 1: blocked on company_name, so it trains postcode only.
        "estimate_parameters_using_expectation_maximisation": {
            "function": "estimate_parameters_using_expectation_maximisation",
            "arguments": {
                "blocking_rule": """
                    l.company_name = r.company_name
                """
            },
        },
        # EM pass 2: blocked on postcode, so it trains company_name.
        "estimate_parameters_using_expectation_maximisation_postcode": {
            "function": "estimate_parameters_using_expectation_maximisation",
            "arguments": {
                "blocking_rule": """
                    l.postcode = r.postcode
                """
            },
        },
    }
)

INFO:splink.linker:Probability two random records match is estimated to be  2.25e-07.
This means that amongst all possible pairwise record comparisons, one in 4,447,653.50 are expected to match.  With 2,764,864,733,924 total possible comparisons, we expect a total of around 621,645.71 matching pairs
INFO:splink.estimate_u:----- Estimating u probabilities using random sampling -----
INFO:splink.estimate_u:
Estimated u probabilities using random sampling
INFO:splink.settings:
Your model is not yet fully trained. Missing estimates for:
    - company_name (no m values are trained).
    - postcode (no m values are trained).
INFO:splink.em_training_session:
----- Starting EM training session -----

INFO:splink.em_training_session:Estimating the m probabilities of the model by blocking on:

                    l.company_name = r.company_name
                

Parameter estimates will be made for the following comparison(s):
    - postcode

Parameter estimates cannot be made for the following 

In [None]:
# Run the link step with a threshold of 0.7 and output logging switched on.
# NOTE(review): 0.7 is presumably a match-probability cut-off -- confirm, and
# consider lifting it into a named constant so it is easy to tune.
cl_x_exp.link(threshold=0.7, log_output=True)