In [57]:
import splink.comparison_library as cl
import pandas as pd
import numpy as np
import json
import os, sys
from splink.exploratory import completeness_chart
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from splink_tools import *
from pprint import pprint 

def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.
    """
    current_dir = os.getcwd()
    while os.path.basename(current_dir).lower() != repo_name:
        current_dir = os.path.dirname(current_dir)
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

move_working_dir_to_repo_root(repo_name="orgsync")

Current working directory:  /home/ubuntu/OrgSync


### Check for duplicate ids in datasets

Cleaned
```
len(gtr_df): 69067
len(cordis_df): 36508
uniqu in id gtr: 69067
uniqu in id cordis: 5398
```

raw w/ cordis = just one dataset (FP7, organisation.json)
```
len(gtr_df): 69067
len(cordis_df): 123419
uniqu in id gtr: 69067
uniqu in id cordis: 28177
```

```python
data_dir = os.path.join("data", "raw")
with open(os.path.join(data_dir, "gtr_data.json"), "r", encoding="utf8") as f:
    gtr_json = json.load(f)
with open(os.path.join(data_dir, "uk_data.json"), "r", encoding="utf8") as f:
    cordis_json = json.load(f)

gtr_df = pd.DataFrame(gtr_json)
cordis_df = pd.DataFrame(cordis_json)

print(f"len(gtr_df): {len(gtr_df)}")
print(f"len(cordis_df): {len(cordis_df)}")
print(f"uniqu in id gtr: {len(gtr_df['id'].unique())}")
print(f"uniqu in id cordis: {len(cordis_df['organisationID'].unique())}")

base = os.path.join("data", "raw", "all_scraped")
file_paths = [
    "cordis/2024_07/FP7/organization.json",
    "gtr/scraped/2024_07/organisations.json"
]
with open(os.path.join(base, file_paths[0]), "r", encoding="utf8") as f:
    cordis_json = json.load(f)
with open(os.path.join(base, file_paths[1]), "r", encoding="utf8") as f:
    gtr_json = json.load(f)

cordis_df = pd.DataFrame(cordis_json)
gtr_df = pd.DataFrame(gtr_json)

print(f"len(gtr_df): {len(gtr_df)}")
print(f"len(cordis_df): {len(cordis_df)}")
print(f"uniqu in id gtr: {len(gtr_df['id'].unique())}")
print(f"uniqu in id cordis: {len(cordis_df['organisationID'].unique())}")
```

In [58]:
base_path = os.path.join("data", "splink")
data_path = os.path.join(base_path, "all_data.json")
postcodes_path = os.path.join(base_path, "parsed_postcodes.json")

# load the data
df = pd.read_json(data_path)
df_postcodes = pd.read_json(postcodes_path)
print(f"len(df): {len(df)}")
print(f"len(df_postcodes): {len(df_postcodes)}")
print(f"unique_id in df: {df['unique_id'].nunique()}")
print(f"unique_id in df_postcodes: {df_postcodes['unique_id'].nunique()}")

df.drop_duplicates(inplace=True)
df_postcodes.drop_duplicates(inplace=True)

print(f"len(df): {len(df)}")
print(f"len(df_postcodes): {len(df_postcodes)}")
print(f"unique_id in df: {df['unique_id'].nunique()}")
print(f"unique_id in df_postcodes: {df_postcodes['unique_id'].nunique()}")


def create_unique_identifiers(df, df_postcodes):
    """
    #! This function is no longer needed but kept in case it crops up again.
    The two input datasets contain columns for the same entries in the same order.
    Many entires have the same `unique_id` in both datasets. This function creates a new column
    `global_id` that is unique for each entry in the datasets.

    It then merges the two datasets on this new column.
    """
    assert df["unique_id"].to_list() == df_postcodes["unique_id"].to_list()
    assert df["dataset"].to_list() == df_postcodes["dataset"].to_list()
    
    df["global_id"] = range(len(df))
    df_postcodes["global_id"] = range(len(df_postcodes))
    # convert the global_id to string
    df["global_id"] = df["global_id"].astype(str)
    df_postcodes["global_id"] = df_postcodes["global_id"].astype(str)
    df = df.merge(df_postcodes, on=["global_id", "dataset", "unique_id"], how="left")
    return df

data_unique_ids = df['unique_id'].to_list()
postcodes_unique_ids = df_postcodes['unique_id'].to_list()
data_datasets = df["dataset"].to_list()
postcodes_datasets = df_postcodes["dataset"].to_list()
# check if the unique_ids in the data and postcodes are the same
assert data_unique_ids == postcodes_unique_ids
assert data_datasets == postcodes_datasets

df.head()

len(df): 105573
len(df_postcodes): 105573
unique_id in df: 74508
unique_id in df_postcodes: 74508
len(df): 74508
len(df_postcodes): 74508
unique_id in df: 74508
unique_id in df_postcodes: 74508


Unnamed: 0,unique_id,name,postcode,dataset
0,1906596,nuclear decommissioning authority nda,ca24 3hu,cordis
1,2123954,south west tourism limited,ex2 5wt,cordis
2,1915686,welsh government,cf10 3nq,cordis
3,2128407,terrasalus limited,le15 9el,cordis
4,2168344,ol pharma partners ltd,sg4 7dp,cordis


In [59]:
df_postcodes.head()

Unnamed: 0,unique_id,dataset,parsed.postcode,parsed.original,parsed.incode,parsed.outcode,parsed.area,parsed.district,parsed.sub_district,parsed.sector,parsed.unit,parsed.fix_distance,parsed.is_in_ons_postcode_directory
0,1906596,cordis,CA24 3HU,ca24 3hu,3HU,CA24,CA,CA24,,CA24 3,HU,0.0,1.0
1,2123954,cordis,EX2 5WT,ex2 5wt,5WT,EX2,EX,EX2,,EX2 5,WT,0.0,0.0
2,1915686,cordis,CF10 3NQ,cf10 3nq,3NQ,CF10,CF,CF10,,CF10 3,NQ,0.0,1.0
3,2128407,cordis,LE15 9EL,le15 9el,9EL,LE15,LE,LE15,,LE15 9,EL,0.0,1.0
4,2168344,cordis,SG4 7DP,sg4 7dp,7DP,SG4,SG,SG4,,SG4 7,DP,0.0,1.0


In [60]:
def pd_create_parsed_postcode_columns(data, postcode_data):
    """
    Marge data dn post_code on unique_id

    Inputs:
    data: pd.DataFrame
    postcode_data: pd.DataFrame
    """

    # merge on index
    merged_data = pd.merge(data, postcode_data, how="left", on=["unique_id", "dataset"]) #! and dataset!

    # data = pd.concat([data, postcode_data], axis=1)
    return merged_data

df = pd_create_parsed_postcode_columns(df, df_postcodes)
df.head()


Unnamed: 0,unique_id,name,postcode,dataset,parsed.postcode,parsed.original,parsed.incode,parsed.outcode,parsed.area,parsed.district,parsed.sub_district,parsed.sector,parsed.unit,parsed.fix_distance,parsed.is_in_ons_postcode_directory
0,1906596,nuclear decommissioning authority nda,ca24 3hu,cordis,CA24 3HU,ca24 3hu,3HU,CA24,CA,CA24,,CA24 3,HU,0.0,1.0
1,2123954,south west tourism limited,ex2 5wt,cordis,EX2 5WT,ex2 5wt,5WT,EX2,EX,EX2,,EX2 5,WT,0.0,0.0
2,1915686,welsh government,cf10 3nq,cordis,CF10 3NQ,cf10 3nq,3NQ,CF10,CF,CF10,,CF10 3,NQ,0.0,1.0
3,2128407,terrasalus limited,le15 9el,cordis,LE15 9EL,le15 9el,9EL,LE15,LE,LE15,,LE15 9,EL,0.0,1.0
4,2168344,ol pharma partners ltd,sg4 7dp,cordis,SG4 7DP,sg4 7dp,7DP,SG4,SG,SG4,,SG4 7,DP,0.0,1.0


In [61]:
# display df where parsed.postcode is null
df[df["parsed.postcode"].isnull()].head(50)
# df[df[""]]

Unnamed: 0,unique_id,name,postcode,dataset,parsed.postcode,parsed.original,parsed.incode,parsed.outcode,parsed.area,parsed.district,parsed.sub_district,parsed.sector,parsed.unit,parsed.fix_distance,parsed.is_in_ons_postcode_directory
30,1653759,alkane energy uk limited,,cordis,,,,,,,,,,,
33,1653780,smart moves limited ta city car club,,cordis,,,,,,,,,,,
34,1653782,mrcmcleanhazel ltd,,cordis,,,,,,,,,,,
35,1653794,university of the west of england,,cordis,,,,,,,,,,,
36,1653798,bath north east somerset council,,cordis,,,,,,,,,,,
37,1653809,powabyke ltd,,cordis,,,,,,,,,,,
38,1653823,advanced communications information systems ltd,,cordis,,,,,,,,,,,
39,1653825,first somerset avon ltd,,cordis,,,,,,,,,,,
40,1653829,advanced transport systems ltd,,cordis,,,,,,,,,,,
41,1614807,cpl scientific publishing services ltd,,cordis,,,,,,,,,,,


In [62]:
# find all rows with postcode.original == "ectified"
# df[df["parsed.original"] == "ectified"]
ec1=df[df["parsed.district"] == "EC1"]
ec1.head(50)

Unnamed: 0,unique_id,name,postcode,dataset,parsed.postcode,parsed.original,parsed.incode,parsed.outcode,parsed.area,parsed.district,parsed.sub_district,parsed.sector,parsed.unit,parsed.fix_distance,parsed.is_in_ons_postcode_directory
14,2142023,dr foster research ltd,ec1a 9la,cordis,EC1A 9LA,ec1a 9la,9LA,EC1A,EC,EC1,EC1A,EC1A 9,LA,0.0,1.0
25,1911839,moorfields eye hospital nhs foundation trust,ec1v 2pd,cordis,EC1V 2PD,ec1v 2pd,2PD,EC1V,EC,EC1,EC1V,EC1V 2,PD,0.0,1.0
32,1923106,delphi diesel systems ltd,ec1a 4dd,cordis,EC1A 4DD,ec1a 4dd,4DD,EC1A,EC,EC1,EC1A,EC1A 4,DD,0.0,0.0
102,1918662,afc energy plc,ec1v 9ee,cordis,EC1V 9EE,ec1v 9ee,9EE,EC1V,EC,EC1,EC1V,EC1V 9,EE,0.0,1.0
129,1909718,city university of london,ec1v 0hb,cordis,EC1V 0HB,ec1v 0hb,0HB,EC1V,EC,EC1,EC1V,EC1V 0,HB,0.0,1.0
296,1920969,tavistock institute of human relations lbg,ec1v 3rs,cordis,EC1V 3RS,ec1v 3rs,3RS,EC1V,EC,EC1,EC1V,EC1V 3,RS,0.0,1.0
555,3048835,skymeter limited,ec1m4jn,cordis,EC1M 4JN,ec1m4jn,4JN,EC1M,EC,EC1,EC1M,EC1M 4,JN,0.0,1.0
634,2348608,newable limited,ec1a 4hy,cordis,EC1A 4HY,ec1a 4hy,4HY,EC1A,EC,EC1,EC1A,EC1A 4,HY,0.0,1.0
740,2376689,f6s network limited,ec1v 2nx,cordis,EC1V 2NX,ec1v 2nx,2NX,EC1V,EC,EC1,EC1V,EC1V 2,NX,0.0,1.0
760,1911543,the centre for economic policy research,ec1v 0dx,cordis,EC1V 0DX,ec1v 0dx,0DX,EC1V,EC,EC1,EC1V,EC1V 0,DX,0.0,1.0


In [63]:
db_api = DuckDBAPI()
# Conver empty lists to null before running...#
completeness_chart(df, db_api=db_api, table_names_for_chart=["dataset"])


Example of checking number of pairs from blocking rule

In [64]:
from splink.blocking_analysis import count_comparisons_from_blocking_rule

columns = [col for col in df.columns]

br = block_on("substr(name, 1,1)", "name")


counts = count_comparisons_from_blocking_rule(
    table_or_tables=df,
    blocking_rule=br,
    link_type="dedupe_only",
    db_api=db_api,
)

counts


{'number_of_comparisons_generated_pre_filter_conditions': 88102,
 'number_of_comparisons_to_be_scored_post_filter_conditions': 6797,
 'filter_conditions_identified': '',
 'equi_join_conditions_identified': 'SUBSTRING(l.name, 1, 1) = SUBSTRING(r.name, 1, 1) AND l."name" = r."name"',
 'link_type_join_condition': 'where l."unique_id" < r."unique_id"'}

## Estimating Model Params
([docs](https://moj-analytical-services.github.io/splink/demos/tutorials/04_Estimating_model_parameters.html))

#### Comparison rules
Check the following docs for guidance on creating pairwise comparisons, and for using presets. 
* [Splink "Comparing Records"](https://moj-analytical-services.github.io/splink/topic_guides/comparisons/comparisons_and_comparison_levels.html)
  * Comparisons and comparison levles,
  * Custom comparisons
  * Out-of-the-box comparisons
  * Term frequency adjustements (more london postcodes than others?)
  * String/ Array comparisons
* [API Docs Comparison Library](https://moj-analytical-services.github.io/splink/api_docs/comparison_library.html)
  * Out-of-the-box comparisons!

Blocking Rules:
* [API Docs](https://moj-analytical-services.github.io/splink/api_docs/blocking.html)

In [65]:
settings = SettingsCreator(
    link_type="dedupe_only",
    comparisons=[
        cl.NameComparison("name"),
        cl.PostcodeComparison("postcode"),
        cl.PostcodeComparison("parsed.postcode"),
        cl.LevenshteinAtThresholds("name", [4]),
        # cl.DamerauLevenshteinAtThresholds("name", "name"),
        
        # cl.ExactMatch("postcode").configure(
        #     term_frequency_adjustments=True
        # )
    ],
    blocking_rules_to_generate_predictions=[
        block_on("name"),
        block_on("postcode"),
        block_on("parsed.postcode"),
        block_on("parsed.area"),
        # block_on("parsed.sector"),
        # block_on("parsed.subdistrict"),
        # block_on("parsed.subsector"),
        # "1=1"
        # "l.name and r.name" #and levenshtein(l.name, r.name) < 4"
        # block_on("name"),
        # block_on("postcode")
    ],
    retain_intermediate_calculation_columns=True
)

deterministic_rules = [
    block_on("name"),
    block_on("postcode"),
    block_on("parsed.postcode"),
    block_on("parsed.area")
    # "1=1"
    # "l.name and r.name"# and levenshtein(l.name, r.name) < 2" #! try this
]


linker = Linker(df, settings, db_api=DuckDBAPI())

linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)

linker.training.estimate_u_using_random_sampling(max_pairs=1e6)

training_blocking_rule = block_on("parsed.postcode")
    # "l.name and r.name" #and levenshtein(l.name, r.name) < 4"


training_session_fname_sname = (
    linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)
)



Probability two random records match is estimated to be  0.0256.
This means that amongst all possible pairwise record comparisons, one in 39.01 are expected to match.  With 2,775,683,778 total possible comparisons, we expect a total of around 71,149,021.43 matching pairs
You are using the default value for `max_pairs`, which may be too small and thus lead to inaccurate estimates for your model's u-parameters. Consider increasing to 1e8 or 1e9, which will result in more accurate estimates, but with a longer run time.
----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - name (no m values are trained).
    - postcode (no m values are trained).
    - parsed_postcode (no m values are trained).
    - name (no m values are trained).

----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l."parsed.postcode" = r."parsed.postc

In [66]:
save_dir = os.path.join("results", "splink")
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# iterate file name number to avoid overwriting

save_path = os.path.join(save_dir, f"model_{training_session_fname_sname}.json")

settings = linker.misc.save_model_to_json(
    save_path, overwrite=True
)
linker.evaluation.unlinkables_chart()

In [67]:
linker.visualisations.match_weights_chart()

In [68]:
linker.visualisations.parameter_estimate_comparisons_chart()

# Predicting results

[Docs](https://moj-analytical-services.github.io/splink/demos/tutorials/05_Predicting_results.html)


[API Docs](https://moj-analytical-services.github.io/splink/api_docs/inference.html)

In [82]:
df_predictions = linker.inference.predict(threshold_match_probability=0.2)


Blocking time: 15.75 seconds
Predict time: 14.54 seconds

You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'parsed_postcode':
    m values not fully trained
Comparison: 'name':
    m values not fully trained


### Clustering Results

In [83]:
df_clusters = linker.clustering.cluster_pairwise_predictions_at_threshold(
    df_predictions, threshold_match_probability=0.2
)


Completed iteration 1, num representatives needing updating: 1265
Completed iteration 2, num representatives needing updating: 3373
Completed iteration 3, num representatives needing updating: 364
Completed iteration 4, num representatives needing updating: 246
Completed iteration 5, num representatives needing updating: 218
Completed iteration 6, num representatives needing updating: 113
Completed iteration 7, num representatives needing updating: 188
Completed iteration 8, num representatives needing updating: 9
Completed iteration 9, num representatives needing updating: 23
Completed iteration 10, num representatives needing updating: 1
Completed iteration 11, num representatives needing updating: 1
Completed iteration 12, num representatives needing updating: 0


In [84]:
linker.visualisations.cluster_studio_dashboard(
    df_predictions,
    df_clusters,
    "cluster_studio.html",
    sampling_method="by_cluster_size",
    overwrite=True,
)

from IPython.display import IFrame

IFrame(src="./cluster_studio.html", width="100%", height=1000)

# Visualise 

https://moj-analytical-services.github.io/splink/demos/tutorials/06_Visualising_predictions.html

In [71]:
records_to_view = df_pred.as_record_dict(limit=10)
linker.visualisations.waterfall_chart(records_to_view, filter_nulls=False)

https://moj-analytical-services.github.io/splink/demos/examples/duckdb/comparison_playground.html