In [3]:
import splink.comparison_library as cl
from splink.exploratory import completeness_chart
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from splink_tools import *
from pprint import pprint 

In [None]:
import os, sys
import pandas as pd
import numpy as np

def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.

    Starting from the current working directory, walk up through the parent
    directories until one is found whose name equals ``repo_name``
    (compared case-insensitively), then ``os.chdir`` into it and print the
    resulting working directory.

    Parameters
    ----------
    repo_name : str
        Name of the repository's root directory. Case is ignored.

    Raises
    ------
    FileNotFoundError
        If neither the current directory nor any of its ancestors is named
        ``repo_name``. The working directory is left unchanged in that case.
    """
    # Lower-case both sides so that e.g. "OrgSync" matches "orgsync".
    target_name = repo_name.lower()
    start_dir = os.getcwd()
    current_dir = start_dir
    while os.path.basename(current_dir).lower() != target_name:
        parent_dir = os.path.dirname(current_dir)
        # The parent of a filesystem root is the root itself; without this
        # check the loop would spin forever when the repo is not an ancestor.
        if parent_dir == current_dir:
            raise FileNotFoundError(
                f"No directory named {repo_name!r} found at or above {start_dir!r}"
            )
        current_dir = parent_dir
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

# Switch to the repository root first so the relative "data/transformed"
# path below resolves no matter where the notebook was started from.
move_working_dir_to_repo_root(repo_name="orgsync")


# Load the transformed persons and organisations tables from CSV.
base_path = os.path.join("data", "transformed")
gtr_persons = pd.read_csv(os.path.join(base_path, "persons.csv"))
gtr_orgs = pd.read_csv(os.path.join(base_path, "organisations.csv"))

Current working directory:  c:\Users\dec2g\GitHub\OrgSync
{'address.postCode': 'NE1 8QH',
 'address.region': 'North East',
 'address.type': 'MAIN_ADDRESS',
 'id': '5331B126-3AB4-4412-B56D-00E8F2796556',
 'link.EMPLOYEE': ['0CB4A538-AC14-4394-A10F-C9F955033EF3'],
 'name': 'NEWCASTLE CITY COUNCIL'}
{'city': 'MOOR ROW',
 'geolocation': '',
 'name': 'NUCLEAR DECOMMISSIONING AUTHORITY - NDA',
 'nutsCode': '',
 'organisationID': 999565019,
 'postCode': 'CA24 3HU',
 'shortName': 'NDA',
 'street': 'Westlakes Science PArk - Herdus House'}


In [None]:

# Column configuration -- update these lists if the column names change.
#
# Columns are grouped by how they should be processed: list columns need to be
# joined before processing for use with splink, so they are kept separate from
# the plain string columns. The full column lists are assembled from the groups.

# --- Organisations ---
org_str_cols = ["name"]
org_list_cols = [
    "EMPLOYEE_ids",  # person ids
    "PROJECT_ids",   # project ids
]
org_cols = [*org_str_cols, *org_list_cols, "organisation_id"]

# --- Persons ---
per_str_cols = ["firstName", "surname", "otherNames"]
per_list_cols = [
    "EMPLOYED_ids",  # org ids
    "PI_PER_ids",    # project ids
    "COI_PER_ids",   # project ids
]
# Not currently kept: "email", "orcidID", "created".
per_cols = ["person_id", *per_str_cols, *per_list_cols]

# Restrict each frame to the configured columns.
gtr_orgs = remove_df_columns_not_in_list(gtr_orgs, org_cols)
gtr_persons = remove_df_columns_not_in_list(gtr_persons, per_cols)

# For each list column of the organisations frame: print what
# check_column_datatypes reports (the recorded output shows counts per Python
# type), then convert the column with convert_column_str_list_to_list.
# NOTE(review): the frames are reassigned in place of the loaded data, so
# re-running this cell feeds already-converted columns back into the helper --
# confirm the helper tolerates that.
for col in org_list_cols:
    print(check_column_datatypes(gtr_orgs, col))
    gtr_orgs = convert_column_str_list_to_list(gtr_orgs, col)

# Same report-then-convert pass for the list columns of the persons frame.
for col in per_list_cols:
    print(check_column_datatypes(gtr_persons, col))
    gtr_persons = convert_column_str_list_to_list(gtr_persons, col)

EMPLOYEE_ids
<class 'float'>    54720
<class 'str'>      14347
Name: count, dtype: int64
PROJECT_ids
<class 'str'>      67239
<class 'float'>     1828
Name: count, dtype: int64
EMPLOYED_ids
<class 'str'>    133565
Name: count, dtype: int64
PI_PER_ids
<class 'float'>    96842
<class 'str'>      36723
Name: count, dtype: int64
COI_PER_ids
<class 'float'>    80524
<class 'str'>      53041
Name: count, dtype: int64


In [4]:

# gtr_orgs = blank_to_nan(gtr_orgs, "name")
# gtr_orgs = empty_list_to_nan(gtr_orgs, "EMPLOYEE")

In [5]:
# Preview the first rows of the organisations frame.
gtr_orgs.head()


Unnamed: 0,name,PROJECT_ids,EMPLOYEE_ids
0,NEWCASTLE CITY COUNCIL,"[0D5DF2FF-B732-4218-B0E3-4FFBF3DDC906, 0D0F72C...",[0CB4A538-AC14-4394-A10F-C9F955033EF3]
1,VALERANN UK LIMITED,"[B012B0D9-EEBC-414F-BA4A-8AF2B1898477, B0359A8...",[951DAD79-185A-4009-B3AE-1717EC2AF063]
2,Baltic Sea Cultural Centre in Gdansk,"[6272E5E8-1321-48E3-B91B-63E358F4B4FD, 34D4F9E...",[]
3,Mindray,[86D3E475-E582-4D08-8D13-99CEEFC43E17],[]
4,Democracy International,[B6328B17-69B0-4044-B156-0A26D338A5CE],[]


In [45]:
# Preview the first six rows of the persons frame.
gtr_persons.head(6)

Unnamed: 0,person_id,PI_PER_ids,COI_PER_ids,EMPLOYED_ids,firstName,surname
0,0400AE80-674B-4068-A7D4-748DFA887CDE,"[1FF7C213-559F-4374-9932-069540778CFE, CF64FA1...","[A69BBDA8-5CE1-41A2-872C-122079954D8F, 86E4C8C...",[01F2924C-FFB4-481B-B8F0-31234D33F0FA],Tom,Tregenza
1,0400FA13-4E69-4BB4-AACC-B8E13051441D,[F8DE3E25-99F4-4C8F-A5DE-511C98674535],[6734F477-6EE2-4082-9765-ABB040C34AE1],[AFE1E6E9-738F-4616-9D07-0781DAC34046],Marianne,Yon
2,04F7BEB3-2C6E-4027-8B28-3CDB92DA0787,"[B5879D1D-8A34-44B1-9EA0-08FC0B4DDB1A, 4D01024...",[4163D0A1-31AE-40A7-89F5-1038BA805963],[6BE9EFE8-67BC-4376-9142-6AE1A9AA6498],Semira,Manaseki-Holland
3,04FC1BF4-0F97-4020-AEE6-0C4265BD96BE,[BDEB0211-F9D6-4F8B-80A1-5723764A22FB],"[8C11A853-2407-4BD8-8063-48B8A2EC010E, 5D6AEA7...",[595A5FEA-6A63-4445-BD0B-7CA0CC2EE7DF],Julia,Edgar
4,05031B56-1BEB-4FE5-9FE3-ECA990789279,"[80386E87-6008-484D-96D6-16751FBC1BCE, F9CF650...","[65793E04-59E6-4C98-97C7-118FD0D97EFE, 1B2B1A7...",[7FF630E0-3355-4076-9E47-528D4B0DBCB3],Mark,Leake
5,051CC788-B5C8-4734-82E9-FD6A430F23E4,,,[E4BC926F-50DE-44EE-AFEB-5A9C41512F4B],Oonagh,Markey


# Try just gtr_persons as a test, to see if we can match person_ids.
# First, combine the lists from each row in PI_PER_ids and COI_PER_ids.



In [40]:
### Remove columns with very low linkage

# Pass every list column of the persons frame through empty_list_to_nan_full
# (presumably turning empty lists into NaN -- confirm against splink_tools).
for col in per_list_cols:
    gtr_persons = empty_list_to_nan_full(gtr_persons, col)

db_api = DuckDBAPI()
# Convert empty lists to null before running the completeness chart (done in
# the loop above).
completeness_chart(gtr_persons, db_api=db_api, table_names_for_chart=["gtr_persons"])




In [44]:
# Build the completeness chart for the persons frame and read its underlying
# records, so that columns with no populated values can be removed.
chart = completeness_chart(gtr_persons, db_api=db_api, table_names_for_chart=["gtr_persons"])
chart_dict = chart.to_dict()

# Collect every column reported with a completeness of exactly 0. Columns are
# gathered first and dropped once afterwards, so a column that is listed in
# more than one dataset (or is already gone) cannot trigger a KeyError.
zero_completeness_cols = []
for completeness in chart_dict.get("datasets", {}).values():
    for col_info in completeness:
        column_name = col_info["column_name"]
        if (
            col_info["completeness"] == 0
            and column_name in gtr_persons.columns
            and column_name not in zero_completeness_cols
        ):
            zero_completeness_cols.append(column_name)

# Reassign instead of mutating in place so the step is easy to follow and
# safe to re-run.
gtr_persons = gtr_persons.drop(columns=zero_completeness_cols)
print("Dropped columns with zero completeness:", zero_completeness_cols)

gtr_persons.head()





Unnamed: 0,person_id,PI_PER_ids,COI_PER_ids,EMPLOYED_ids,firstName,surname
0,0400AE80-674B-4068-A7D4-748DFA887CDE,"[1FF7C213-559F-4374-9932-069540778CFE, CF64FA1...","[A69BBDA8-5CE1-41A2-872C-122079954D8F, 86E4C8C...",[01F2924C-FFB4-481B-B8F0-31234D33F0FA],Tom,Tregenza
1,0400FA13-4E69-4BB4-AACC-B8E13051441D,[F8DE3E25-99F4-4C8F-A5DE-511C98674535],[6734F477-6EE2-4082-9765-ABB040C34AE1],[AFE1E6E9-738F-4616-9D07-0781DAC34046],Marianne,Yon
2,04F7BEB3-2C6E-4027-8B28-3CDB92DA0787,"[B5879D1D-8A34-44B1-9EA0-08FC0B4DDB1A, 4D01024...",[4163D0A1-31AE-40A7-89F5-1038BA805963],[6BE9EFE8-67BC-4376-9142-6AE1A9AA6498],Semira,Manaseki-Holland
3,04FC1BF4-0F97-4020-AEE6-0C4265BD96BE,[BDEB0211-F9D6-4F8B-80A1-5723764A22FB],"[8C11A853-2407-4BD8-8063-48B8A2EC010E, 5D6AEA7...",[595A5FEA-6A63-4445-BD0B-7CA0CC2EE7DF],Julia,Edgar
4,05031B56-1BEB-4FE5-9FE3-ECA990789279,"[80386E87-6008-484D-96D6-16751FBC1BCE, F9CF650...","[65793E04-59E6-4C98-97C7-118FD0D97EFE, 1B2B1A7...",[7FF630E0-3355-4076-9E47-528D4B0DBCB3],Mark,Leake


# Splink - Blocking Rules


In [None]:
# gtr_orgs now contains PROJECT_ids and EMPLOYEE_ids as lists of strings representing unique identifiers.
# Similarly, gtr_persons now contains EMPLOYED_ids, PI_PER_ids, and COI_PER_ids as lists of strings representing unique identifiers.
# TODO: create a new dataframe from gtr_orgs that replaces PI_PER_ids (this note was left unfinished -- state what the ids should be replaced with).




EMPLOYEE_ids
<class 'float'>    54720
<class 'str'>      14347
Name: count, dtype: int64
PROJECT_ids
<class 'str'>      67239
<class 'float'>     1828
Name: count, dtype: int64
EMPLOYED_ids
<class 'str'>    133565
Name: count, dtype: int64
PI_PER_ids
<class 'float'>    96842
<class 'str'>      36723
Name: count, dtype: int64
COI_PER_ids
<class 'float'>    80524
<class 'str'>      53041
Name: count, dtype: int64


In [12]:
def return_rows_in_column_with_dtype(df, col, dtype):
    """Return the rows of ``df`` whose value in ``col`` is an instance of ``dtype``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    col : str
        Name of the column whose values are type-checked.
    dtype : type or tuple of types
        Passed straight to ``isinstance``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index labels.
    """

    def _has_dtype(value):
        return isinstance(value, dtype)

    row_mask = df[col].apply(_has_dtype)
    return df[row_mask]

# Select the organisations whose EMPLOYEE_ids value is a float; in the
# recorded output these are the NaN entries.
float_rows = return_rows_in_column_with_dtype(gtr_orgs, "EMPLOYEE_ids", float)
print(float_rows)

                                       name  \
2      Baltic Sea Cultural Centre in Gdansk   
3                                   Mindray   
4                   Democracy International   
5                  Wimbledon College of Art   
6                           LICX UK LIMITED   
...                                     ...   
69059                       POSTBIOTICS INC   
69061           SPEARHEAD MARKETING LIMITED   
69063      Liverpool John Moores University   
69064              University of Queensland   
69065                           Disyn Biotc   

                                             PROJECT_ids EMPLOYEE_ids  
2      ['6272E5E8-1321-48E3-B91B-63E358F4B4FD', '34D4...          NaN  
3               ['86D3E475-E582-4D08-8D13-99CEEFC43E17']          NaN  
4               ['B6328B17-69B0-4044-B156-0A26D338A5CE']          NaN  
5      ['7BFFAA57-86BD-423E-A20E-DE07A97F1366', '2616...          NaN  
6               ['A1026C17-835C-4C5E-848D-C592CCB035E7']          NaN  
...

In [13]:
# Report the Python types stored in the employee-id column. The column is
# called "EMPLOYEE_ids" in org_cols; the old "EMPLOYEE" name is not among the
# columns kept in gtr_orgs.
check_column_datatypes(gtr_orgs, "EMPLOYEE_ids")

EMPLOYEE
<class 'float'>    54720
<class 'list'>     14347
Name: count, dtype: int64

### Analyse Missingness

It's important to understand the level of missingness in your data, because columns with higher levels of missingness are less useful for data linking.

In [6]:
# Completeness chart for the organisations frame on a fresh DuckDB backend.
db_api = DuckDBAPI()
completeness_chart(gtr_orgs, db_api=db_api)
# Open question: why is the employee column 100% complete? There should be a bunch of NaNs.
# NOTE(review): possibly because empty lists are not nulls -- the persons frame
# is passed through empty_list_to_nan_full before its chart, but gtr_orgs is
# not. Confirm and apply the same step to the organisation list columns.

# EDA


In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from splink.exploratory import profile_columns

# Profile the columns of gtr_orgs on a new DuckDB backend, requesting
# top_n=10 and bottom_n=5.
profile_columns(gtr_orgs, db_api=DuckDBAPI(), top_n=10, bottom_n=5)

