In [1]:
%pip install networkx
%pip install pandas
%pip install faker
%pip install uuid
%pip install pycountry

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Synthetic Identity Islands Data Generation

This code generates synthetic identity islands using the `networkx`, `pandas`, and `Faker` libraries. It creates a directed graph representing interconnected identities with various relationships. Below is an overview of the steps involved:

### 1. Import Libraries and Initialize Graph
- Import necessary libraries including `networkx` for graph operations, `pandas` for data handling, `Faker` for generating fake data, and other utilities.
- Initialize a directed graph `G`.

### 2. Define Helper Functions
- **`create_identity_island`**: This function creates an identity island with a base identity and several variations of that identity. It generates multiple names and associates them with the base identity using edges of type `IDENTITY_EQUIVALENCE`.
- **`generate_random_identity_island`**: This function generates a random identity island by selecting a random country code and corresponding locale. It then uses the locale-specific Faker instance to create a base identity and additional identities within the island.

### 3. Country Locale Mapping
- A dictionary `country_locale_map` maps country codes to corresponding Faker locales to ensure name generation aligns with the selected country.

### 4. Generate Identity Islands
- Loop to generate a specified number of identity islands (`num_islands`).
- For each island, `generate_random_identity_island` is called to create and add the island to the graph `G`.

### 5. Add Edges Within Islands
- **`add_edges_within_island`**: This function adds various types of edges within each identity island to simulate realistic relationships and interactions. These edges include `INCLUDED_IN`, `CITED_BY`, `IDENTIFIED_THROUGH_BIOMETRICS`, `IDENTITY_EQUIVALENCE`, `MANUAL_IDENTITY_OVERRIDE`, `IMMIGRATION_STATUS_LINKED`, and `SAME_APPLICATION`.

### 6. Apply Edges to Each Island
- Loop through each identity island and apply the `add_edges_within_island` function to add internal edges.

## Generates Identity Islands

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import random
import uuid
import pickle
import pycountry
from faker import Faker
import numpy as np

In [3]:
# Initialize Faker
fake = Faker()

# Initialize a directed graph
G = nx.DiGraph()

In [4]:
# Function to create identity islands
def create_identity_island(G, locale_fake, base_name, date_of_birth, nationality, num_identities):
    base_identity = {
        'id': uuid.uuid4(),
        'type': 'Identity',
        'name': base_name,
        'age': (pd.to_datetime('today') - pd.to_datetime(date_of_birth, dayfirst=True)).days // 365,
        'date_of_birth': date_of_birth,
        'nationality': nationality
    }
    G.add_node(base_identity['id'], **base_identity)

    second_name = locale_fake.first_name()
    middle_name = locale_fake.last_name()
    
    partial_names = [
        f"{base_name.split()[0]} {base_name.split()[-1]}",
        f"{base_name.split()[0]} {second_name} {base_name.split()[-1]}",
        f"{base_name.split()[0]} {second_name} {middle_name} {base_name.split()[-1]}",
        f"{base_name.split()[0]} {middle_name} {base_name.split()[-1]}",
        # f"{base_name.split()[0]} {middle_name} {base_name.split()[-1]}o"
    ]
    
    identities = [base_identity['id']]
    for partial_name in np.random.choice(partial_names, num_identities):
        identity = {
            'id': uuid.uuid4(),
            'type': 'Identity',
            'name': partial_name,
            'age': base_identity['age'],
            'date_of_birth': date_of_birth,
            'nationality': nationality
        }
        G.add_node(identity['id'], **identity)
        G.add_edge(base_identity['id'], identity['id'], type='IDENTITY_EQUIVALENCE')
        identities.append(identity['id'])
    
    return identities

In [5]:
# Create a dictionary to map country codes to Faker locales
country_locale_map = {
    'AL': 'sq_AL',
    'AM': 'hy_AM',
    'AR': 'es_AR',
    'AT': 'de_AT',
    'AZ': 'az_AZ',
    'BE': 'fr_BE',
    'BG': 'bg_BG',
    'BN': 'bn_BD',
    'BO': 'es',
    'BR': 'pt_BR',
    'CA': 'fr_CA',
    'CH': 'de_CH',
    'CL': 'es_CL',
    'CN': 'zh_CN',
    'CO': 'es_CO',
    'CR': 'es',
    'CS': 'cs_CZ',
    'CU': 'es',
    'DE': 'de',
    'DO': 'es',
    'EC': 'es',
    'EE': 'et_EE',
    'ES': 'es_ES',
    'FI': 'fi_FI',
    'FR': 'fr_FR',
    'GB': 'en_GB',
    'GE': 'ka_GE',
    'GR': 'el_GR',
    'GT': 'es',
    'HN': 'es',
    'HR': 'hr_HR',
    'HU': 'hu_HU',
    'ID': 'id_ID',
    'IE': 'en_IE',
    'IL': 'he_IL',
    'IN': 'en_IN',
    'IR': 'fa_IR',
    'IT': 'it_IT',
    'JP': 'ja_JP',
    'KR': 'ko_KR',
    'MX': 'es_MX',
    'NI': 'es',
    'NL': 'nl_NL',
    'NO': 'no_NO',
    'NZ': 'en_NZ',
    'PA': 'es',
    'PE': 'es',
    'PL': 'pl_PL',
    'PT': 'pt_PT',
    'PY': 'es',
    'RO': 'ro_RO',
    'RU': 'ru_RU',
    'SA': 'ar_SA',
    'SE': 'sv_SE',
    'SI': 'sl_SI',
    'SK': 'sk_SK',
    'SV': 'es',
    'TH': 'th_TH',
    'TR': 'tr_TR',
    'TW': 'zh_TW',
    'UA': 'uk_UA',
    'UY': 'es',
    'VE': 'es',
    'ZA': 'zu_ZA'
}

In [6]:
# Generate a random identity island
def generate_random_identity_island(G):
    # Get a random country code
    country_code = fake.country_code()
    
    # Get the corresponding locale, default to 'en_US' if not found
    locale = country_locale_map.get(country_code, 'en_US')
    
    # Initialize Faker with the specific locale
    locale_fake = Faker(locale)
    
    # Generate name based on locale
    base_name = f"{locale_fake.first_name()} {locale_fake.last_name()}"
    date_of_birth = fake.date_of_birth(minimum_age=1, maximum_age=80).strftime('%d/%m/%Y')
    nationality = pycountry.countries.get(alpha_2=country_code).alpha_3

    return create_identity_island(G, locale_fake, base_name, date_of_birth, nationality, random.randrange(1,6))

In [7]:
# Additional code for edges and anomalies within islands
def add_edges_within_island(G, island):
    reference_nodes = [{'id': uuid.uuid4(), 'type': 'Reference', 'doc_type': random.choice(['PASSPORT', 'NATURALISATION', 'VISA_1', 'VISA_2', 'NATIONAL_IDENTITY_CARD']), 'doc_number': fake.ssn()} for _ in range(5)]
    event_nodes = [{'id': uuid.uuid4(), 'type': 'Event', 'event_type': 'BIOMETRIC_VERIFICATION', 'event_date': fake.date()} for _ in range(2)]
    
    for node in reference_nodes + event_nodes:
        G.add_node(node['id'], **node)
    
    for identity in island:
        # Randomly add INCLUDED_IN edges
        if random.random() < 0.5:
            target = random.choice(island)
            G.add_edge(identity, target, type='INCLUDED_IN')

        # Randomly add CITED_BY edges
        if random.random() < 0.5:
            target = random.choice(reference_nodes)['id']
            G.add_edge(identity, target, type='CITED_BY')

        # Randomly add IDENTIFIED_THROUGH_BIOMETRICS edges
        if random.random() < 0.5:
            target = random.choice(reference_nodes)['id']
            G.add_edge(identity, target, type='IDENTIFIED_THROUGH_BIOMETRICS')

        # Randomly add IDENTITY_EQUIVALENCE edges
        if random.random() < 0.5:
            target = random.choice(island)
            G.add_edge(identity, target, type='IDENTITY_EQUIVALENCE')

        # Randomly add MANUAL_IDENTITY_OVERRIDE edges
        if random.random() < 0.5:
            target = random.choice(island)
            G.add_edge(identity, target, type='MANUAL_IDENTITY_OVERRIDE')

        # Randomly add IMMIGRATION_STATUS_LINKED edges
        if random.random() < 0.5:
            target = random.choice(event_nodes)['id']
            G.add_edge(identity, target, type='IMMIGRATION_STATUS_LINKED')

        # Randomly add SAME_APPLICATION edges
        if random.random() < 0.5:
            target = random.choice(island)
            G.add_edge(identity, target, type='SAME_APPLICATION')

## Create multiple identity islands

In [8]:
# Create multiple identity islands
identity_islands = []
num_islands = 10
for _ in range(num_islands):
    island = generate_random_identity_island(G)
    identity_islands.append(island)

# Add edges within each island
for island in identity_islands:
    add_edges_within_island(G, island)

print("Synthetic data generation complete.")

Synthetic data generation complete.


## Save Graph

In [9]:
# Check the number of nodes and edges
print(f"Number of nodes in the graph: {G.number_of_nodes()}")
print(f"Number of edges in the graph: {G.number_of_edges()}")

# Save the graph to a file using pickle
with open('../data/synthetic_data/synthetic_identity_islands.gpickle', 'wb') as f:
    pickle.dump(G, f, pickle.HIGHEST_PROTOCOL)

print("Graph have been saved.")

Number of nodes in the graph: 113
Number of edges in the graph: 153
Graph have been saved.


In [10]:
# Save identity islands to a file using pickle
with open('../data/synthetic_data/identity_islands.pkl', 'wb') as f:
    pickle.dump(identity_islands, f, pickle.HIGHEST_PROTOCOL)

print("Identity islands have been saved.")

Identity islands have been saved.


## Display Identity Island

In [11]:
def extract_identity_islands(G):
    identity_islands = []
    for subgraph in nx.connected_components(G.to_undirected()):
        identities = [n for n in subgraph if G.nodes[n]['type'] == 'Identity']
        if len(identities) > 1:
            identity_islands.append(identities)
    return identity_islands

def print_identity_islands(G):
    identity_islands = extract_identity_islands(G)
    for i, island in enumerate(identity_islands, start=1):
        print(f"Identity Island {i}:")
        for identity in island:
            node_data = G.nodes[identity]
            print(f"  ID: {identity}, Name: {node_data['name']}, DOB: {node_data['date_of_birth']}, Nationality: {node_data['nationality']}")
        print()

# Extract and print identity islands
print_identity_islands(G)

Identity Island 1:
  ID: c950fc8a-09a5-4fe5-89f5-b3ace8fe2727, Name: Katherine Hughes Glass, DOB: 11/04/1989, Nationality: KGZ
  ID: 9dcbecaa-f0b1-415f-9843-9c218bbc4466, Name: Katherine Larry Hughes Glass, DOB: 11/04/1989, Nationality: KGZ
  ID: 8da377c7-4b90-46a1-88d2-5912a565765d, Name: Katherine Larry Glass, DOB: 11/04/1989, Nationality: KGZ
  ID: 3f9fa0ea-d962-40f5-89cd-8253c94db91c, Name: Katherine Glass, DOB: 11/04/1989, Nationality: KGZ
  ID: ca798061-f5ea-4040-90d8-5a0d465db622, Name: Katherine Hughes Glass, DOB: 11/04/1989, Nationality: KGZ

Identity Island 2:
  ID: 7a267725-130f-42c3-b438-530b7ef726c5, Name: Barbara Baker, DOB: 11/08/2011, Nationality: UZB
  ID: 39e58dd6-cb83-43bb-8864-1a487dfca258, Name: Barbara April Evans Baker, DOB: 11/08/2011, Nationality: UZB
  ID: 56c47586-509b-4e5a-9b8f-d2467a219375, Name: Barbara April Baker, DOB: 11/08/2011, Nationality: UZB

Identity Island 3:
  ID: 8a080e07-79cc-4a74-a365-4bebcba2ba32, Name: John Sherman, DOB: 28/10/1955, Nationa

## Display One Identity

In [12]:
# Update the identity_nodes list to ensure it is current
identity_nodes = [n for n, d in G.nodes(data=True) if d['type'] == 'Identity']

# Choose a random identity node to inspect
random_identity_node = random.choice(identity_nodes)
# random_identity_node = uuid.UUID('77d262f1-a8fe-4347-acf5-1fa956f039be')

# Function to print node details and its edges
def print_identity_island(graph, node_id):
    node_details = graph.nodes[node_id]
    print(f"Node ID: {node_id}")
    print("Node Details:")
    print(node_details)
    
    # Incoming edges
    print("\nIncoming Edges:")
    for u, v, data in graph.in_edges(node_id, data=True):
        print(f"{u} -> {v} (Type: {data['type']})")
    
    # Outgoing edges
    print("\nOutgoing Edges:")
    for u, v, data in graph.out_edges(node_id, data=True):
        print(f"{u} -> {v} (Type: {data['type']})")

# Call the function to print the details of the chosen node and its edges
print_identity_island(G, random_identity_node)

Node ID: 317a74de-1e35-412c-b62a-8a7e3dab740e
Node Details:
{'id': UUID('317a74de-1e35-412c-b62a-8a7e3dab740e'), 'type': 'Identity', 'name': 'Douglas Peters', 'age': 49, 'date_of_birth': '16/04/1975', 'nationality': 'DJI'}

Incoming Edges:
3f755982-af85-45de-878e-40eec96a0643 -> 317a74de-1e35-412c-b62a-8a7e3dab740e (Type: IDENTITY_EQUIVALENCE)
fdab1097-87eb-411d-9150-04eb2e4c3d16 -> 317a74de-1e35-412c-b62a-8a7e3dab740e (Type: INCLUDED_IN)
317a74de-1e35-412c-b62a-8a7e3dab740e -> 317a74de-1e35-412c-b62a-8a7e3dab740e (Type: MANUAL_IDENTITY_OVERRIDE)
d3d87d3d-8ad5-45a0-a63e-1e76265c7991 -> 317a74de-1e35-412c-b62a-8a7e3dab740e (Type: IDENTITY_EQUIVALENCE)

Outgoing Edges:
317a74de-1e35-412c-b62a-8a7e3dab740e -> 3ed06c04-e2c1-4b54-8295-67e4068bb543 (Type: IDENTIFIED_THROUGH_BIOMETRICS)
317a74de-1e35-412c-b62a-8a7e3dab740e -> 317a74de-1e35-412c-b62a-8a7e3dab740e (Type: MANUAL_IDENTITY_OVERRIDE)
317a74de-1e35-412c-b62a-8a7e3dab740e -> fdab1097-87eb-411d-9150-04eb2e4c3d16 (Type: SAME_APPLICATIO