In [1]:
from icecream import ic
from yfiles_jupyter_graphs_for_kuzu import KuzuGraphWidget
import kuzu
import watermark

In [2]:
%load_ext watermark
%watermark
%watermark --iversions

Last updated: 2025-07-28T13:30:45.401095-07:00

Python implementation: CPython
Python version       : 3.13.3
IPython version      : 9.1.0

Compiler    : Clang 16.0.0 (clang-1600.0.26.6)
OS          : Darwin
Release     : 24.5.0
Machine     : arm64
Processor   : arm
CPU cores   : 14
Architecture: 64bit

kuzu                          : 0.9.0
yfiles_jupyter_graphs_for_kuzu: 0.0.4
watermark                     : 2.5.0



In [3]:
data_path: pathlib.Path = pathlib.Path("data")

NameError: name 'pathlib' is not defined

We'll load a slice of the [OpenSanctions](https://www.opensanctions.org/) dataset, which provides the "risk" category of data.
This describes people and organizations who represent known risks for FinCrime.

In [None]:
df1 = pl.read_ndjson(data_path / "open-sanctions.json")
df1.head(3)

Each entity ID from OpenSanctions has a risk classification. This can be useful to associate an ID with a particular risk, allowing us to narrow down on candidates that are relevant to a particular investigation.

In [None]:
# Get risks from OpenSanctions
df_risk = os.extract_risks(df1)
df_risk.head(3)

We're now ready to extract the open sanctions data. The `extract_open_sanctions` function will take the raw data, process the nested fields within it and return the relevant columns that we need for our graph.

In [None]:
df_os = os.extract_open_sanctions(df1)
df_os.head(3)

Of particular interest for this workshop is the person ["Abassin Badshah"](https://find-and-update.company-information.service.gov.uk/disqualified-officers/natural/mGquuTbmESWiRmHJPz1ObUwfDgk), former owner of multiple Papa John's franchises in London, who is disqualified from being a corporate director until 2026, due to his [tax evasion conviction](https://londonnewsonline.co.uk/news/catford-papa-johns-pizza-boss-jailed-after-669000-tax-evasion/) in 2021.

[Open Ownership](https://www.openownership.org/) describes _ultimate beneficial ownership_ (UBO) details, which provides the "link" category of data. In other words, "Who owns how much of what, and who actually has controlling interest?"

In [None]:
df2 = pl.read_ndjson(data_path / "open-ownership.json")
df2.head(3)

Just like with the OpenSanctions data, we can use the `extract_open_ownership` function to process the nested JSON data and return the relevant columns that we need for our graph.

In [None]:
df_oo = oo.extract_open_ownership(df2)
df_oo.head(3)

For the relationships in our graph, we'll need to select only the relationships that have **both** `src_id` and `dst_id` in the list of ids. This is done via the `extract_open_ownership_relationships` function.

In [None]:
ids = df_oo.select("id").to_series().to_list()
df_oa_relationships = oo.extract_open_ownership_relationships(df2, open_ownership_ids=ids)
df_oa_relationships.head(3)

In [None]:
sz_export = sz.process_senzing_export(data_path / "export.json")

This first dataframe `df_ent` lists the entities identified by Senzing _entity resolution_.

In [None]:
df_ent = sz_export.df_ent.sort("id")
df_ent.head(3)

The `df_rel` dataframe lists probabilistic relationships between entities, also identified by Senzing _entity resolution_. In other words, there isn't sufficient evidence _yet_ to merge these entities; however, there's enough evidence to suggest following these as closely related leads during an investigation.

In [None]:
df_rel = sz_export.df_rel
df_rel.head(3)

### Separate the Senzing entities by source

The final step to preprocess the data for our graph is to separate the entities by their source (whether they come from OpenSanctions or Open Ownership).

In [None]:
df_sz_oo = sz_export.df_rec.filter(pl.col("source") == "OPEN-OWNERSHIP").select("ent_id", "rec_id", "why", "level")
df_sz_oo.head(3)

In [None]:
df_sz_os = sz_export.df_rec.filter(pl.col("source") == "OPEN-SANCTIONS").select("ent_id", "rec_id", "why", "level")
df_sz_os.head(3)

## Extract one fraud network

In [4]:
DB_PATH = "./db"

db = kuzu.Database(DB_PATH)
conn = kuzu.Connection(db)

Create a yFiles graph widget so we can explore our graph interactively

In [5]:
g = KuzuGraphWidget(conn)

In [6]:
g.show_cypher(
    """
    MATCH (a:Entity)-[b *1..3]->(c)
    WHERE a.descrip CONTAINS "Abassin"
    RETURN * LIMIT 100;
    """,
    layout="radial"
)

GraphWidget(layout=Layout(height='650px', width='100%'))

Now let's extract the shell companies in this particular fraud network

In [7]:
res = conn.execute(
    """
    MATCH (a:Entity)-[b *1..3]->(c)
    WHERE a.descrip CONTAINS "Abassin"
      AND c.kind = "ORGANIZATION"
    RETURN c.name, COLLECT(DISTINCT c.addr)
    LIMIT 10;
    """
)

shells: dict = {
    row[0]: row[1]
    for row in res.get_as_pl().iter_rows()
}

ic(shells);

ic| shells: {'BARLLOWS SERVICES LTD': ['3 Market Parade, 41 East Street, Bromley, BR1 1QN',
                                       '31 Quernmore Close, Bromley, Kent, United Kingdom, '
                                       'BR1 4EL'],
             'LMAR (GB) LTD': ['31 Quernmore Close, Bromley, Kent, United Kingdom, BR1 '
                               '4EL'],
             'WELLHANCIA HEALTH CARE LTD': ['31 Quernmore Close, Bromley, BR1 4EL']}
