In [None]:
import splink.comparison_library as cl
import pandas as pd
import numpy as np
import json
import os, sys
from splink.exploratory import completeness_chart
from splink import DuckDBAPI, Linker, SettingsCreator, block_on, splink_datasets
from splink_tools import *
from pprint import pprint 

def move_working_dir_to_repo_root(repo_name="orgsync"):
    """
    Move the current working directory to the root of the repository.
    """
    current_dir = os.getcwd()
    while os.path.basename(current_dir).lower() != repo_name:
        current_dir = os.path.dirname(current_dir)
    os.chdir(current_dir)
    print("Current working directory: ", os.getcwd())

move_working_dir_to_repo_root(repo_name="orgsync")

Current working directory:  /home/ubuntu/OrgSync


### Check for duplicate ids in datasets

Cleaned
```
len(gtr_df): 69067
len(cordis_df): 36508
uniqu in id gtr: 69067
uniqu in id cordis: 5398
```

raw w/ cordis = just one dataset (FP7, organisation.json)
```
len(gtr_df): 69067
len(cordis_df): 123419
uniqu in id gtr: 69067
uniqu in id cordis: 28177
```

```python
data_dir = os.path.join("data", "raw")
with open(os.path.join(data_dir, "gtr_data.json"), "r", encoding="utf8") as f:
    gtr_json = json.load(f)
with open(os.path.join(data_dir, "uk_data.json"), "r", encoding="utf8") as f:
    cordis_json = json.load(f)

gtr_df = pd.DataFrame(gtr_json)
cordis_df = pd.DataFrame(cordis_json)

print(f"len(gtr_df): {len(gtr_df)}")
print(f"len(cordis_df): {len(cordis_df)}")
print(f"uniqu in id gtr: {len(gtr_df['id'].unique())}")
print(f"uniqu in id cordis: {len(cordis_df['organisationID'].unique())}")

base = os.path.join("data", "raw", "all_scraped")
file_paths = [
    "cordis/2024_07/FP7/organization.json",
    "gtr/scraped/2024_07/organisations.json"
]
with open(os.path.join(base, file_paths[0]), "r", encoding="utf8") as f:
    cordis_json = json.load(f)
with open(os.path.join(base, file_paths[1]), "r", encoding="utf8") as f:
    gtr_json = json.load(f)

cordis_df = pd.DataFrame(cordis_json)
gtr_df = pd.DataFrame(gtr_json)

print(f"len(gtr_df): {len(gtr_df)}")
print(f"len(cordis_df): {len(cordis_df)}")
print(f"uniqu in id gtr: {len(gtr_df['id'].unique())}")
print(f"uniqu in id cordis: {len(cordis_df['organisationID'].unique())}")
```

In [None]:
base_path = os.path.join("data", "splink")
data_path = os.path.join(base_path, "all_data.json")
postcodes_path = os.path.join(base_path, "parsed_postcodes.json")

# load the data
df = pd.read_json(data_path)
df_postcodes = pd.read_json(postcodes_path)
print(f"len(df): {len(df)}")
print(f"len(df_postcodes): {len(df_postcodes)}")
print(f"unique_id in df: {df['unique_id'].nunique()}")
print(f"unique_id in df_postcodes: {df_postcodes['unique_id'].nunique()}")

df.drop_duplicates(inplace=True)
df_postcodes.drop_duplicates(inplace=True)

print(f"len(df): {len(df)}")
print(f"len(df_postcodes): {len(df_postcodes)}")
print(f"unique_id in df: {df['unique_id'].nunique()}")
print(f"unique_id in df_postcodes: {df_postcodes['unique_id'].nunique()}")


def create_unique_identifiers(df, df_postcodes):
    """
    #! This function is no longer needed but kept in case it crops up again.
    The two input datasets contain columns for the same entries in the same order.
    Many entires have the same `unique_id` in both datasets. This function creates a new column
    `global_id` that is unique for each entry in the datasets.

    It then merges the two datasets on this new column.
    """
    assert df["unique_id"].to_list() == df_postcodes["unique_id"].to_list()
    assert df["dataset"].to_list() == df_postcodes["dataset"].to_list()
    
    df["global_id"] = range(len(df))
    df_postcodes["global_id"] = range(len(df_postcodes))
    # convert the global_id to string
    df["global_id"] = df["global_id"].astype(str)
    df_postcodes["global_id"] = df_postcodes["global_id"].astype(str)
    df = df.merge(df_postcodes, on=["global_id", "dataset", "unique_id"], how="left")
    return df

data_unique_ids = df['unique_id'].to_list()
postcodes_unique_ids = df_postcodes['unique_id'].to_list()
data_datasets = df["dataset"].to_list()
postcodes_datasets = df_postcodes["dataset"].to_list()
# check if the unique_ids in the data and postcodes are the same
assert data_unique_ids == postcodes_unique_ids
assert data_datasets == postcodes_datasets

df.head()

len(df): 105573
len(df_postcodes): 105573
unique_id in df: 74508
unique_id in df_postcodes: 74508
len(df): 74508
len(df_postcodes): 74508
unique_id in df: 74508
unique_id in df_postcodes: 74508


Unnamed: 0,unique_id,name,postcode,dataset
0,1906596,nuclear decommissioning authority nda,ca24 3hu,cordis
1,2123954,south west tourism limited,ex2 5wt,cordis
2,1915686,welsh government,cf10 3nq,cordis
3,2128407,terrasalus limited,le15 9el,cordis
4,2168344,ol pharma partners ltd,sg4 7dp,cordis


In [72]:
df_postcodes.head()

Unnamed: 0,unique_id,dataset,parsed.postcode,parsed.original,parsed.incode,parsed.outcode,parsed.area,parsed.district,parsed.sub_district,parsed.sector,parsed.unit,parsed.fix_distance,parsed.is_in_ons_postcode_directory
0,1906596,cordis,CA24 3HU,ca24 3hu,3HU,CA24,CA,CA24,,CA24 3,HU,0.0,1.0
1,2123954,cordis,EX2 5WT,ex2 5wt,5WT,EX2,EX,EX2,,EX2 5,WT,0.0,0.0
2,1915686,cordis,CF10 3NQ,cf10 3nq,3NQ,CF10,CF,CF10,,CF10 3,NQ,0.0,1.0
3,2128407,cordis,LE15 9EL,le15 9el,9EL,LE15,LE,LE15,,LE15 9,EL,0.0,1.0
4,2168344,cordis,SG4 7DP,sg4 7dp,7DP,SG4,SG,SG4,,SG4 7,DP,0.0,1.0


In [None]:
# def pd_create_parsed_postcode_columns(data, postcode_data):
#     """
#     Marge data dn post_code on unique_id

#     Inputs:
#     data: pd.DataFrame
#     postcode_data: pd.DataFrame
#     """
#     # rename "parsed.original" to "postcode"
#     postcode_data = postcode_data.rename(columns={"parsed.original": "postcode"})

#     # merge on index
#     merged_data = pd.merge(data, postcode_data, left_index=True, right_index=True)

#     # data = pd.concat([data, postcode_data], axis=1)
#     return merged_data

df = pd_create_parsed_postcode_columns(df, df_postcodes)
df.head()

Unnamed: 0,unique_id_x,name,postcode_x,dataset_x,parsed.postcode_x,parsed.incode_x,parsed.outcode_x,parsed.area_x,parsed.district_x,parsed.sub_district_x,...,postcode,parsed.incode,parsed.outcode,parsed.area,parsed.district,parsed.sub_district,parsed.sector,parsed.unit,parsed.fix_distance,parsed.is_in_ons_postcode_directory
0,1614374,inlecom systems ltd,,cordis,,,,,,,...,ca24 3hu,3HU,CA24,CA,CA24,,CA24 3,HU,0.0,1.0
1,1614383,london metropolitan university global policy i...,,cordis,,,,,,,...,ex2 5wt,5WT,EX2,EX,EX2,,EX2 5,WT,0.0,0.0
2,1614807,cpl scientific publishing services ltd,,cordis,,,,,,,...,cf10 3nq,3NQ,CF10,CF,CF10,,CF10 3,NQ,0.0,1.0
3,1653759,alkane energy uk limited,,cordis,,,,,,,...,le15 9el,9EL,LE15,LE,LE15,,LE15 9,EL,0.0,1.0
4,1653780,smart moves limited ta city car club,,cordis,,,,,,,...,sg4 7dp,7DP,SG4,SG,SG4,,SG4 7,DP,0.0,1.0


In [43]:
len(df)

14023521

In [44]:
db_api = DuckDBAPI()
# Conver empty lists to null before running...#
completeness_chart(gtr_persons, db_api=db_api, table_names_for_chart=["gtr_persons"])


NameError: name 'gtr_persons' is not defined