In [1]:
import numpy as np, matplotlib.pyplot as plt, pandas as pd
pd.set_option('display.max_rows', 40)

from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_library as cl

!date
!whoami
!uname -a
!pwd

Wed 16 Nov 2022 04:26:58 PM PST
ndbs
Linux int-slurm-sarchive-p0006 5.4.0-88-generic #99-Ubuntu SMP Thu Sep 23 17:29:00 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
/mnt/share/code/ndbs/vivarium_research_prl/linkage/demo_notebooks


# Following Splink tutorial, referring also to Sam Shin's code

[Splink Tutorial](https://moj-analytical-services.github.io/splink/demos/00_Tutorial_Introduction.html)

[Sam Shin's RecordLinkage repo](https://github.com/SamShin/RecordLinkage)

[Nathaniel Blair-Stahn's fork of Sam's repo](https://github.com/NathanielBlairStahn/RecordLinkageTest)

# Define directories and load decennial census data

In [2]:
project_output_dir = '/mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop'
output_subdir = 'results/special_last_names/florida/2022_10_14_10_49_32/population_table'
output_dir = f'{project_output_dir}/{output_subdir}'

!ls -l $output_dir

total 32224
-rw-rw-r-- 1 albrja   IHME-Simulationscience 12622072 Oct 20 23:08 decennial_census.hdf
-rwxrwxrwx 1 beatrixh IHME-Simulationscience 20364830 Nov 14 16:42 state_table.hdf


In [3]:
decennial_census_path = f'{output_dir}/decennial_census.hdf'
with pd.HDFStore(decennial_census_path, 'r') as census_hdf:
    print(census_hdf.info())
    census_keys = census_hdf.keys()

<class 'pandas.io.pytables.HDFStore'>
File path: /mnt/team/simulation_science/priv/engineering/vivarium_census_prl_synth_pop/results/special_last_names/florida/2022_10_14_10_49_32/population_table/decennial_census.hdf
/year_2020            frame        (shape->[47444,10])
/year_2030            frame        (shape->[46440,10])
/year_2040            frame        (shape->[44626,10])


In [4]:
census_keys

['/year_2020', '/year_2030', '/year_2040']

In [9]:
# Keys exist for years 2020, 2030, 2040
years = [2020, 2030]
census = {year: pd.read_hdf(decennial_census_path, f'year_{year}') for year in years}

for year in years:
    print(year, census[year].shape)

2020 (47444, 10)
2030 (46440, 10)


# See which people appear in one census but not the other

In [10]:
census[2020].index.difference(census[2030].index)

Int64Index([   14,    27,    37,    47,    57,    67,    68,    71,    74,
              101,
            ...
            49928, 49932, 49948, 49955, 49962, 49964, 49969, 49982, 49984,
            49993],
           dtype='int64', length=7153)

In [11]:
census[2030].index.difference(census[2020].index)

Int64Index([   41,    60,    98,   130,   154,   157,   171,   250,   290,
              291,
            ...
            54270, 54271, 54272, 54273, 54274, 54275, 54276, 54277, 54278,
            54279],
           dtype='int64', length=6149)

# Filter to a smaller subset of records to test linking, and add a `'unique_id'` column for Splink

The individual people appear in the same order, unless someone is missing or a new person has been added, so we can filter to the first $n$ records to get similar but not identical lists.

In [12]:
max_index = 10_000

def prep_for_linking(df, max_index):
    return (
        df.loc[:max_index]
        .assign(
            unique_id=lambda df: df.index, # Splink requires a column called 'unique_id'
            age = lambda df: df['age'].floordiv(1).astype(int), # Replace age with floor(age)
        ) 
    )

subcensus = {year: prep_for_linking(df, max_index) for year, df in census.items()}

{year: df.shape for year, df in subcensus.items()}

{2020: (9495, 11), 2030: (8539, 11)}

In [13]:
subcensus[2020]

Unnamed: 0,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial,unique_id
0,Margaret,Clark,68,1951-07-27,"1344 winoka rd brooksville, fl",34601,Reference person,Female,Black,J,0
1,Jeffrey,Littlejohn,52,1967-05-03,"927 23rd st clearwater, fl",34698,Reference person,Male,Black,V,1
2,Briana,Jackson,13,2006-09-07,"927 23rd st clearwater, fl",34698,Biological child,Female,Black,A,2
3,Benjamin,Cox,21,1998-10-21,"927 23rd st clearwater, fl",34698,Stepchild,Male,Black,D,3
4,Willie,Tucker,72,1947-10-09,"8904 167th place fleming island, fl",32003,Reference person,Male,White,J,4
...,...,...,...,...,...,...,...,...,...,...,...
9995,Jordy,Thomas,9,2011-03-27,"8 bainridge raod unincorporated, fl",33446,Biological child,Male,White,N,9995
9997,Carl,Saunders,72,1948-01-24,"16901 sw 66 st fort myers, fl",33916,Reference person,Male,White,M,9997
9998,Julie,Hasapis,60,1959-07-23,"16901 sw 66 st fort myers, fl",33916,Opp-sex spouse,Female,White,K,9998
9999,Jason,Cains,23,1996-10-15,"137 belle terre blvd vero beach, fl",32967,Reference person,Male,White,E,9999


In [14]:
subcensus[2030]

Unnamed: 0,first_name,last_name,age,date_of_birth,address,zipcode,relation_to_household_head,sex,race_ethnicity,middle_initial,unique_id
0,Margaret,Clark,70,1951-07-27,"1344 winoka rd brooksville, fl",34601,Reference person,Female,Black,J,0
1,Jeffrey,Littlejohn,53,1967-05-03,"1400 miller ave unincorporated, fl",33428,Reference person,Male,Black,V,1
2,Briana,Jackson,23,2006-09-07,"mirasol drive kendall, fl",33157,Other nonrelative,Female,Black,A,2
3,Benjamin,Cox,31,1998-10-21,"272 hickey road palm coast, fl",32164,Other nonrelative,Male,Black,D,3
4,Willie,Tucker,78,1947-10-09,"8904 167th place fleming island, fl",32003,Reference person,Male,White,J,4
...,...,...,...,...,...,...,...,...,...,...,...
9996,Ashton,Long,16,2013-11-04,"8 bainridge raod unincorporated, fl",33446,Biological child,Male,White,F,9996
9997,Carl,Saunders,74,1948-01-24,"16901 sw 66 st fort myers, fl",33916,Reference person,Male,White,M,9997
9998,Julie,Hasapis,70,1959-07-23,"924 w 675 n jacksonville, fl",32256,Other nonrelative,Female,White,K,9998
9999,Jason,Cains,33,1996-10-15,"137 belle terre blvd vero beach, fl",32967,Reference person,Male,White,E,9999


In [15]:
subcensus[2020].index.difference(subcensus[2030].index)

Int64Index([  14,   27,   37,   47,   57,   67,   68,   71,   74,  101,
            ...
            9960, 9961, 9965, 9968, 9975, 9978, 9981, 9982, 9984, 9994],
           dtype='int64', length=1389)

In [16]:
subcensus[2030].index.difference(subcensus[2020].index)

Int64Index([  41,   60,   98,  130,  154,  157,  171,  250,  290,  291,
            ...
            9686, 9714, 9726, 9736, 9755, 9872, 9903, 9914, 9964, 9996],
           dtype='int64', length=433)

# Look at how addresses change from 2020 to 2030

In [17]:
columns = ['first_name', 'last_name', 'address', 'zipcode']
df1 = subcensus[2020].loc[:,columns]
df2 = subcensus[2030].loc[:,columns]
df1.reindex(df2.index).compare(df2)

Unnamed: 0_level_0,first_name,first_name,last_name,last_name,address,address,zipcode,zipcode
Unnamed: 0_level_1,self,other,self,other,self,other,self,other
1,,,,,"927 23rd st clearwater, fl","1400 miller ave unincorporated, fl",34698,33428
2,,,,,"927 23rd st clearwater, fl","mirasol drive kendall, fl",34698,33157
3,,,,,"927 23rd st clearwater, fl","272 hickey road palm coast, fl",34698,32164
6,,,,,"3478 claude douglas cir jacksonville, fl","1838 kelton ln jacksonville, fl",32218,32224
9,,,,,"94 perry rd orlando, fl","2795 north eagle rd florahome, fl",32832,32140
...,...,...,...,...,...,...,...,...
9991,,,,,"19394 alvaro lane cape coral, fl","7755 berry williams rd ft myers, fl",33993,33913
9993,,,,,"8 bainridge raod unincorporated, fl","944 laurel ave jacksonville, fl",33446,32258
9995,,,,,"8 bainridge raod unincorporated, fl","3736 cameron av st. petersburg, fl",33446,33714
9996,,Ashton,,Long,,"8 bainridge raod unincorporated, fl",,33446


In [18]:
df2.reindex(df1.index).compare(df1)

Unnamed: 0_level_0,first_name,first_name,last_name,last_name,address,address,zipcode,zipcode
Unnamed: 0_level_1,self,other,self,other,self,other,self,other
1,,,,,"1400 miller ave unincorporated, fl","927 23rd st clearwater, fl",33428,34698
2,,,,,"mirasol drive kendall, fl","927 23rd st clearwater, fl",33157,34698
3,,,,,"272 hickey road palm coast, fl","927 23rd st clearwater, fl",32164,34698
6,,,,,"1838 kelton ln jacksonville, fl","3478 claude douglas cir jacksonville, fl",32224,32218
9,,,,,"2795 north eagle rd florahome, fl","94 perry rd orlando, fl",32140,32832
...,...,...,...,...,...,...,...,...
9991,,,,,"7755 berry williams rd ft myers, fl","19394 alvaro lane cape coral, fl",33913,33993
9993,,,,,"944 laurel ave jacksonville, fl","8 bainridge raod unincorporated, fl",32258,33446
9994,,Katherine,,Hoyt,,"8 bainridge raod unincorporated, fl",,33446
9995,,,,,"3736 cameron av st. petersburg, fl","8 bainridge raod unincorporated, fl",33714,33446


# Create a linker and profile some columns

[Exploratory Analysis in Splink](https://moj-analytical-services.github.io/splink/demos/01_Exploratory_analysis.html)

In [19]:
# Minimal settings needed to call .profile_columns()
initial_settings = {'link_type': 'link_only'}
linker = DuckDBLinker([subcensus[2020], subcensus[2030]], initial_settings)

In [20]:
linker.profile_columns(['first_name', 'last_name'])

In [21]:
linker.profile_columns(['address', 'zipcode'])

# Indexing/Blocking

[Blocking in Splink](https://moj-analytical-services.github.io/splink/demos/02_Blocking.html)

From the documentation:

> ### Devising effective blocking rules
>
> The aims of your blocking rules are twofold: 1. Eliminate enough non-matching comparison pairs so your record linkage job is small enough to compute 2. Eliminate as few truly matching pairs as possible (ideally none)
>
> It is usually impossible to find a single blocking rule which achieves both aims, so we recommend using multiple blocking rules.
>
> When we specify multiple blocking rules, Splink will generate all comparison pairs that meet any one of the rules.
>
> ### Blocking rules in Splink
>
> In Splink, blocking rules are specified as SQL expressions.
>
> For example, to generate the subset of record comparisons where the first name matches, we can specify the following blocking rule:
>
> `l.first_name = r.first_name`
>
> Since blocking rules are SQL expressions, they can be arbitrarily complex. For example, you could create record comparisons where the initial of the first name and the surname match with the following rule:
>
> `substr(l.first_name, 1,1) = substr(r.first_name, 1,1) and l.surname = r.surname`

In [22]:
blocking_rules = [
#     'l.zipcode = r.zipcode',
    'l.first_name = r.first_name and l.last_name = r.last_name',
    'l.sex = r.sex and l.zipcode = r.zipcode',
    (
        'l.sex = r.sex'
        ' and abs(round(r.age) - (round(l.age)+10)) <= 1'
#         ' and substr(l.first_name, 1,1) = substr(r.first_name, 1,1)'
        ' and substr(l.last_name, 1,1) = substr(r.last_name, 1,1)'
    ), # Age, sex, and initials of first and last name
]

for rule in blocking_rules:
    count = linker.count_num_comparisons_from_blocking_rule(rule)
    print(f"Number of comparisons generated by '{rule}': {count:,.0f}")

linker.cumulative_num_comparisons_from_blocking_rules_chart(blocking_rules)

Number of comparisons generated by 'l.first_name = r.first_name and l.last_name = r.last_name': 8,252
Number of comparisons generated by 'l.sex = r.sex and l.zipcode = r.zipcode': 90,847
Number of comparisons generated by 'l.sex = r.sex and abs(round(r.age) - (round(l.age)+10)) <= 1 and substr(l.last_name, 1,1) = substr(r.last_name, 1,1)': 93,986


# Make a settings dictionary and do a comparison

[Specifying and estimating a linkage model in Splink](https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html)


From [Comparison](https://moj-analytical-services.github.io/splink/comparison.html) documentation page:

> As far as possible, Comparisons should be configured to satisfy the assumption of independece conditional on the true match status, a key assumption of the Fellegi Sunter probabilistic linkage model. This would be broken, for example, if a model contained one Comparison for city, and another for postcode. Instead, in this example, a single comparison should be modelled, which may to capture similarity taking account of both the city and postcode field.


https://moj-analytical-services.github.io/splink/topic_guides/customising_comparisons.html

https://moj-analytical-services.github.io/splink/settingseditor/editor.html

https://moj-analytical-services.github.io/splink/settings_dict_guide.html#probability_two_random_records_match


From Tutorial [03 Estimating model parameters](https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html) documentation:

> ### Estimation of `probability_two_random_records_match`
>
> In some cases, the `probability_two_random_records_match` will be known. For example, if you are linking two tables of 10,000 records and expect a one-to-one match, then you should set this value to 1/10_000 [in your settings](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#probability_two_random_records_match) instead of estimating it.
> 
> More generally, this parameter is unknown and needs to be estimated.
> 
> It can be estimated accurately enough for most purposes by combining a series of deterministic matching rules and a guess of the recall corresponding to those rules. For further details of the rationale behind this appraoch see [here](https://github.com/moj-analytical-services/splink/issues/462#issuecomment-1227027995).
>
> In this example, I guess that the following deterministic matching rules have a recall of about 70%:
>
>    ```python
>    deterministic_rules = [
>        "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
>        "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
>        "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
>        "l.email = r.email"
>    ]
>
>    linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)
>    ```

**Question:** Why does this work, and how are you supposed to estimate the recall?

The github issue in referred in the documentation has partial answers, but I think does not explain things sufficiently for someone who doesn't understand the F-S algorithm well: https://github.com/moj-analytical-services/splink/issues/462#issuecomment-1227027995

**Question:** What if you don't make a good guess? Will the EM algorithm perform badly?

From the [settings dict guide documentation on `probability_two_random_records_match`](https://moj-analytical-services.github.io/splink/settings_dict_guide.html#probability_two_random_records_match):

> If you estimate parameters using expectation maximisation (EM), this provides an initial value (prior) from which the EM algorithm will start iterating. EM will then estimate the true value of this parameter.
>
> **Default value**: 0.0001
>
> **Examples**: [1e-05, 0.006]

In my initial test using the records with index under 1000 from the 2020 and 2030 censuses (960 and 865 records, respectively), the true probability of a match is

$$
\frac{\text{number of true matches}}{\text{number of pairs}} = 
\frac{828}{960 \cdot 865} = 0.0009971\ldots \approx 0.001.
$$

Note that if we didn't look at the actual number of records in each data set, and instead just assumed that each of them had the full 1000 records originally specified, then this true probability is quite close to the value we would get using the one-to-one assumption as mentioned in the Splink documentation above, which would be

$$
\frac{\text{number of true matches}}{\text{number of pairs}} =
\frac{1000}{1000 \cdot 1000} = 0.001.
$$

I'm not sure whether there's any good reason for this besides the fact that both datasets still have approximately 1000 records, and the 1:1 assumption should be close to true.

Here's code from my original version of this notebook with records up to 1000:

```
true_probability_two_random_records_match(subcensus[2020], subcensus[2030])
>>0.0009971098265895953

# Assume both datasets had the original specified length of 1000 and matching is 1:1
1/1000
>>0.001

# Compare with true number of matches / inflated denominator
828 / 1000**2
>>0.000828

print(subcensus[2020].shape, subcensus[2030].shape)
>>(960, 11) (865, 11)

# Assume 1:1 match with same of number of records equal to geometric mean
1/(960*865)**(1/2)
>>0.0010973782032653676

# Assume 1:1 match of all records from smaller dataset
1/960
>>0.0010416666666666667
```

In [23]:
s = pd.Series([3.45, 2.984])
s

0    3.450
1    2.984
dtype: float64

In [24]:
s.floordiv(1)

0    3.0
1    2.0
dtype: float64

In [25]:
def true_probability_two_random_records_match(df1, df2):
    size_intersection = len(df1.index.intersection(df2.index))
    print(size_intersection)
    return size_intersection / (len(df1) * len(df2))

true_probability_two_random_records_match(subcensus[2020], subcensus[2030])

8106


9.997803961269055e-05

In [26]:
1/10_000 # Assume both datasets had the original specified length of 1000 and matching is 1:1

0.0001

In [67]:
828 / 1000**2 # Compare with true number of matches / inflated denominator

0.000828

In [27]:
print(subcensus[2020].shape, subcensus[2030].shape)

(9495, 11) (8539, 11)


In [28]:
1/(960*865)**(1/2) # Assume 1:1 match with same of number of records equal to geometric mean

0.0010973782032653676

In [29]:
1/960 # Assume 1:1 match of all records from smaller dataset

0.0010416666666666667

In [31]:
# # Example from https://moj-analytical-services.github.io/splink/topic_guides/customising_comparisons.html
#
# comparison_first_name = {
#     "output_column_name": "first_name",
#     "comparison_description": "First name jaro dmeta",
#     "comparison_levels": [
#         {
#             "sql_condition": "first_name_l IS NULL OR first_name_r IS NULL",
#             "label_for_charts": "Null",
#             "is_null_level": True,
#         },
#         {
#             "sql_condition": "first_name_l = first_name_r",
#             "label_for_charts": "Exact match",
#             "tf_adjustment_column": "first_name",
#             "tf_adjustment_weight": 1.0,
#             "tf_minimum_u_value": 0.001,
#         },
#         {
#             "sql_condition": "dmeta_first_name_l = dmeta_first_name_r",
#             "label_for_charts": "Exact match",
#             "tf_adjustment_column": "dmeta_first_name",
#             "tf_adjustment_weight": 1.0,
#         },
#         {
#             "sql_condition": "jaro_winkler_sim(first_name_l, first_name_r) > 0.8",
#             "label_for_charts": "Exact match",
#             "tf_adjustment_column": "first_name",
#             "tf_adjustment_weight": 0.5,
#             "tf_minimum_u_value": 0.001,
#         },
#         {"sql_condition": "ELSE", "label_for_charts": "All other comparisons"},
#     ],

# }

age_comparison = {
    "output_column_name": "age",
    "comparison_description": "Age difference approximately 10",
    "comparison_levels": [
        {
            "sql_condition": "age_l IS NULL OR age_r IS NULL",
            "label_for_charts": "Null",
            "is_null_level": True,
        },
        {
            "sql_condition": "round(age_r) - round(age_l) = 10",
            "label_for_charts": "Exact match",
        },
        {
            "sql_condition": "abs(round(age_r) - (round(age_l)+10)) = 1",
            "label_for_charts": "Age within 1 year",
        },
        {"sql_condition": "ELSE", "label_for_charts": "Age difference >= 2 years"},
    ],
}

In [32]:
settings = {
    "link_type": "link_only",
    "comparisons": [
        cl.levenshtein_at_thresholds("first_name"),
        cl.levenshtein_at_thresholds("last_name"),
#         c1.exact_match("last_name"),
#         cl.levenshtein_at_thresholds("date_of_birth", 1),
        cl.exact_match("sex"),
        age_comparison,
#         cl.levenshtein_at_thresholds("zipcode"),
    ],
    "blocking_rules_to_generate_predictions": blocking_rules,
    "probability_two_random_records_match":
        true_probability_two_random_records_match(subcensus[2020], subcensus[2030]),
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

linker.initialise_settings(settings)

8106


## Estimate u probabilities

https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html#estimation-of-u-probabilities

In [33]:
%%time
linker.estimate_u_using_random_sampling(target_rows=1e6)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - first_name (no m values are trained).
    - last_name (no m values are trained).
    - sex (no m values are trained).
    - age (no m values are trained).


CPU times: user 2.11 s, sys: 0 ns, total: 2.11 s
Wall time: 1.09 s


## Estimate m probabilities

https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html#estimation-of-m-probabilities

Difference between blocking rules for prediction (once the EM algorithm has estimated parameters) vs. estimation (training of EM algorithm):

https://moj-analytical-services.github.io/splink/topic_guides/blocking_rules.html

https://www.robinlinacre.com/comparing_splink_models_unsupervised/

In [34]:
training_blocking_rules = ["l.last_name = r.last_name", 'l.zipcode = r.zipcode', 'l.age + 10 = r.age']
for rule in training_blocking_rules:
    count = linker.count_num_comparisons_from_blocking_rule(rule)
    print(f"Number of comparisons generated by '{rule}': {count:,.0f}")

linker.cumulative_num_comparisons_from_blocking_rules_chart(training_blocking_rules)
# training_session_fname_sname = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rule)


Number of comparisons generated by 'l.last_name = r.last_name': 63,565
Number of comparisons generated by 'l.zipcode = r.zipcode': 179,130
Number of comparisons generated by 'l.age + 10 = r.age': 969,843


In [35]:
%%time
training_sessions = {}
for rule in training_blocking_rules:
    training_sessions[rule] = linker.estimate_parameters_using_expectation_maximisation(training_blocking_rules[1])
    
    


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.zipcode = r.zipcode

Parameter estimates will be made for the following comparison(s):
    - first_name
    - last_name
    - sex
    - age

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.0875 in the m_probability of age, level `Exact match`
Iteration 2: Largest change in params was 0.0867 in the m_probability of age, level `Age difference >= 2 years`
Iteration 3: Largest change in params was 0.000441 in the m_probability of last_name, level `All other comparisons`
Iteration 4: Largest change in params was 0.0006 in the m_probability of last_name, level `All other comparisons`
Iteration 5: Largest change in params was 0.000592 in the m_probability of last_name, level `All other comparisons`
Iteration 6: Largest change in params was 0.000463 in the m_probability of last_name,

CPU times: user 7.58 s, sys: 23.1 ms, total: 7.6 s
Wall time: 4.72 s


## Make an iterator and look at the three training sessions

These all look very similar. I'm not sure how to tell what's going on, e.g. whether the subsequent blocking rules didin't make much difference, as it appears below, or whether the original training session got updated when the later sessions were run.

In [36]:
rule_session = iter(training_sessions.items())

In [37]:
rule, session = next(rule_session)
print(rule)
session.m_u_values_interactive_history_chart()

l.last_name = r.last_name


In [38]:
rule, session = next(rule_session)
print(rule)
session.m_u_values_interactive_history_chart()

l.zipcode = r.zipcode


In [39]:
rule, session = next(rule_session)
print(rule)
session.m_u_values_interactive_history_chart()

l.age + 10 = r.age


# Visualize model parameters

https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html#visualising-model-parameters

In [40]:
linker.match_weights_chart()

ValueError: math domain error

In [41]:
linker.m_u_parameters_chart()

ValueError: math domain error

# Look for unlinkable records

https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html#detecting-unlinkable-records

The graph shows nothing. Does that mean there are none?

In [42]:
linker.unlinkables_chart()

# Saving the model parameters and reloading them to do prediction/linking

[Saving the model](https://moj-analytical-services.github.io/splink/demos/03_Estimating_model_parameters.html#saving-the-model):

```python
linker.save_settings_to_json("./demo_settings/saved_model_from_demo.json", overwrite=True)
```

[Loading saved model](https://moj-analytical-services.github.io/splink/demos/04_Predicting_results.html#load-estimated-model-from-previous-tutorial):

```python
linker = DuckDBLinker(df) # The demo was for de-duplication, so only one df
linker.load_settings_from_json("./demo_settings/saved_model_from_demo.json")
```

In [43]:
linker.save_settings_to_json(
    "./splink_test_data/saved_model_from_census_test.json",
    overwrite=True
)

# Save input dataframes since they will also be needed to reload the model

In [44]:
subcensus.keys()

dict_keys([2020, 2030])

In [45]:
for year, df in subcensus.items():
    print(df.shape)
    df.to_csv(f"./splink_test_data/census_{year}_test_sample.csv")

(9495, 11)
(8539, 11)


In [46]:
!ls -l splink_test_data/

total 4176
-rw-rw-r-- 1 ndbs Domain Users 1059095 Nov 17 12:57 census_2020_test_sample.csv
-rw-rw-r-- 1 ndbs Domain Users  958764 Nov 17 12:57 census_2030_test_sample.csv
-rw-rw-r-- 1 ndbs Domain Users 1110805 Nov 14 15:01 cluster_studio.html
-rw-rw-r-- 1 ndbs Domain Users 1109719 Nov 14 15:01 comparison_viewer.html
-rw-rw-r-- 1 ndbs Domain Users    5804 Nov 17 12:57 saved_model_from_census_test.json
