In [1]:
import os

from py2neo import Graph

# Connection settings are read from the environment so real credentials do not
# have to be stored in the notebook. The fallbacks are the values this cell
# originally used for the local development database.
neo4j_uri = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "Password")

try:
    graph = Graph(neo4j_uri, auth=(neo4j_user, neo4j_password))
    # Fetch a single node to confirm that queries actually run.
    result = graph.run("MATCH (n) RETURN n LIMIT 1")
    print("Connection successful. Data:", result.data())
except Exception as e:
    # Report the problem instead of raising; note that `graph` may be left
    # undefined for later cells when this branch is taken.
    print("Failed to connect to Neo4j:", str(e))

Connection successful. Data: [{'n': Node('Company', companyNumber='09015410')}]


In [11]:
from py2neo import Graph
import random
import pandas as pd
from pandas import json_normalize
import hashlib

def _can_partition(total, min_length, max_length):
    """Return True if `total` can be split into groups sized min_length..max_length."""
    if total == 0:
        return True
    # The fewest groups able to hold `total` is ceil(total / max_length); a valid
    # split exists exactly when that many minimum-size groups do not exceed `total`.
    fewest_groups = -(-total // max_length)
    return fewest_groups * min_length <= total


def create_groups_for_node(nodes_dict, min_length, max_length, total_length):
    """Randomly split `total_length` records into group sizes within the given bounds.

    Args:
        nodes_dict: Unused; kept so existing callers do not have to change.
        min_length: Smallest allowed group size (values below 1 are treated as 1,
            so no empty groups are produced).
        max_length: Largest allowed group size.
        total_length: Number of records that the group sizes must add up to.

    Returns:
        A list of group sizes, each within [min_length, max_length], whose sum is
        exactly `total_length`. An empty list is returned when there is nothing to
        group (`total_length <= 0`) or when no valid split exists (for example
        `total_length < min_length` or `min_length > max_length`).
    """
    min_length = max(min_length, 1)
    if total_length <= 0 or max_length < min_length:
        return []
    if not _can_partition(total_length, min_length, max_length):
        # Not possible to meet the size requirements.
        return []

    groups = []
    remaining_records = total_length
    while remaining_records > 0:
        max_possible_size = min(max_length, remaining_records)
        # Only keep sizes that leave a remainder which can still be split legally;
        # this guarantees termination and that no record is left ungrouped.
        candidate_sizes = [
            size
            for size in range(min_length, max_possible_size + 1)
            if _can_partition(remaining_records - size, min_length, max_length)
        ]
        group_size = random.choice(candidate_sizes)
        groups.append(group_size)
        remaining_records -= group_size

    return groups

def create_groups(dicts, group_sizes):
    """Cut `dicts` into consecutive slices whose lengths follow `group_sizes`.

    Returns a list with one slice per entry of `group_sizes`, taken in order
    from the start of `dicts`.
    """
    # Running offsets: boundaries[k] is where the k-th group starts.
    boundaries = [0]
    for size in group_sizes:
        boundaries.append(boundaries[-1] + size)

    return [dicts[start:stop] for start, stop in zip(boundaries, boundaries[1:])]

def create_pseudo_node(node_type, type_dicts_df, required_fields, optional_fields):
    """Build a synthetic node whose field values are sampled from real data.

    Args:
        node_type: Unused; kept so existing callers do not have to change.
        type_dicts_df: DataFrame with one column per field to sample from.
        required_fields: Columns that always appear in the pseudo node. Each gets
            one randomly chosen distinct non-null value from its column.
        optional_fields: Columns of which a random non-empty subset is used. May
            be empty, in which case only the required fields are filled.

    Returns:
        A dict mapping the chosen field names to the sampled values. Optional
        fields whose column holds no non-null value are left out.

    Raises:
        IndexError: If a required column holds no non-null value (raised by
            `random.choice` on an empty list).
    """
    def _distinct_values(field):
        # All distinct non-null values observed in the column.
        return type_dicts_df[field].dropna().unique().tolist()

    pseudo_node = {}
    for required_field in required_fields:
        pseudo_node[required_field] = random.choice(_distinct_values(required_field))

    # Without optional fields there is nothing left to sample;
    # random.randint(1, 0) would raise ValueError here.
    if not optional_fields:
        return pseudo_node

    # Pick how many optional fields to include, then which ones (no duplicates).
    opt_fields_num = random.randint(1, len(optional_fields))
    for optional_field in random.sample(optional_fields, opt_fields_num):
        candidates = _distinct_values(optional_field)
        if candidates:  # skip optional columns that are entirely null
            pseudo_node[optional_field] = random.choice(candidates)

    return pseudo_node

In [6]:
# Cypher query that returns every Person node under the alias `variable`.
query = """
MATCH (variable:Person)
RETURN variable
"""

result = graph.run(query)

# Turn each returned node into a plain dict of its properties.
nodes_as_dicts = []
for record in result:
    nodes_as_dicts.append(dict(record['variable']))

[{'birthMonth': '2', 'nationality': 'American', 'birthYear': '1968', 'name': 'Mr. Charles Perlitz Kempf', 'countryOfResidence': 'England'}, {'birthMonth': '9', 'nationality': 'American', 'birthYear': '1964', 'name': 'Mr. William Howard Wolf Jr', 'countryOfResidence': 'United States'}, {'birthMonth': '3', 'nationality': 'American', 'birthYear': '1962', 'name': 'Mrs Alison Shure', 'countryOfResidence': 'United States'}, {'birthMonth': '12', 'nationality': 'American', 'birthYear': '1961', 'name': 'Mr Stuart Mills', 'countryOfResidence': 'United States'}, {'birthMonth': '3', 'nationality': 'American', 'birthYear': '1971', 'name': 'Mrs Vivienne Cleary Spoerri', 'countryOfResidence': 'England'}]


In [15]:
# Grouping parameters
min_group_length = 1
max_group_length = 5
node_type = "PERSON"
# Work on at most the first 5000 person records
dicts = nodes_as_dicts[:5000]

# Draw random group sizes, then slice the records accordingly
group_sizes = create_groups_for_node(dicts, min_group_length, max_group_length, len(dicts))
groups = create_groups(dicts, group_sizes)

# Key every group by its position, stored as a string
list_dict = {}
for i, sublist in enumerate(groups):
    list_dict[str(i)] = sublist
# NOTE(review): this prints every record, including names — consider a bounded preview.
print(list_dict)

# required_fields and optional_fields are to be decided after the manual analysis
dicts_df = pd.DataFrame(dicts)

# One pseudo node per group, sampled from the values seen across all records
group_wise_pseudo_nodes = {
    key: create_pseudo_node(
        node_type,
        dicts_df,
        required_fields=["birthMonth", "birthYear"],
        optional_fields=["nationality"],
    )
    for key in list_dict
}

print(group_wise_pseudo_nodes)

{'0': [{'birthMonth': '2', 'nationality': 'American', 'birthYear': '1968', 'name': 'Mr. Charles Perlitz Kempf', 'countryOfResidence': 'England'}], '1': [{'birthMonth': '9', 'nationality': 'American', 'birthYear': '1964', 'name': 'Mr. William Howard Wolf Jr', 'countryOfResidence': 'United States'}, {'birthMonth': '3', 'nationality': 'American', 'birthYear': '1962', 'name': 'Mrs Alison Shure', 'countryOfResidence': 'United States'}, {'birthMonth': '12', 'nationality': 'American', 'birthYear': '1961', 'name': 'Mr Stuart Mills', 'countryOfResidence': 'United States'}, {'birthMonth': '3', 'nationality': 'American', 'birthYear': '1971', 'name': 'Mrs Vivienne Cleary Spoerri', 'countryOfResidence': 'England'}, {'birthMonth': '3', 'nationality': 'American', 'birthYear': '1948', 'name': 'Mr Floyd Eugene Greco', 'countryOfResidence': 'England'}], '2': [{'birthMonth': '12', 'nationality': 'American', 'birthYear': '1955', 'name': 'Mrs Sandra Elizabeth Emms', 'countryOfResidence': 'England'}, {'birt

In [16]:
print(len(group_wise_pseudo_nodes))

1655


In [22]:
# The watermarking key can be supplied through the WATERMARK_PRIVATE_KEY
# environment variable so a real secret never has to be stored in the notebook.
# The fallback is the key this cell originally hardcoded, so watermarks that
# were generated with it still verify.
private_key = os.environ.get("WATERMARK_PRIVATE_KEY", "e8d3cba12a8d4c3b9a12f4e7c5d1a8f2")
print(len(private_key))

# Watermarking:
# need private key, identity, field to be watermarked, fields used while watermarking

def watermark_pseudo_node(pseudo_node, private_key, watermark_identity, attributes, max_num_fields):
    """Stamp `pseudo_node` with a keyed hash and its watermark identity.

    The secret is the string `watermark_identity`, followed by every entry of
    `attributes` joined without a separator, followed by `private_key`. Its
    SHA-256 digest, read as a big-endian integer and reduced modulo
    `max_num_fields`, is stored under "hashed_secret"; `watermark_identity` is
    stored under "watermark_id".

    The node is modified in place and the same object is returned.
    """
    joined_attributes = "".join(attributes)
    secret_text = watermark_identity + joined_attributes + private_key

    # The hex digest parsed in base 16 is the digest bytes read big-endian.
    digest_hex = hashlib.sha256(secret_text.encode("utf-8")).hexdigest()
    reduced_hash = int(digest_hex, 16) % max_num_fields

    pseudo_node["hashed_secret"] = reduced_hash
    pseudo_node["watermark_id"] = watermark_identity
    return pseudo_node

32


In [23]:
from itertools import chain

# Parameters
n = len(group_wise_pseudo_nodes)
lower_limit = 1
upper_limit = 10000

# Every pseudo node needs its own identity, so the range must hold at least n values
if n > upper_limit - lower_limit + 1:
    raise ValueError(
        f"Cannot draw {n} unique watermark identities from "
        f"[{lower_limit}, {upper_limit}]; widen the range."
    )

# Generate n unique random numbers
unique_random_numbers = random.sample(range(lower_limit, upper_limit + 1), n)

print("Unique random numbers:", unique_random_numbers)

# Watermarking parameters
watermark_identity = unique_random_numbers
# private_key = "e8d3cba12a8d4c3b9a12f4e7c5d1a8f2"
max_num_fields = 1000

# Initialize an empty list to store the watermarked pseudo nodes
watermarked_pseudo_nodes = []

# Initialize a dictionary to store the mapping of unique random numbers to hashed watermarked values
id_list = {}

# Iterate through the pseudo nodes, apply watermarking, and store mappings
for i, node in enumerate(group_wise_pseudo_nodes):
    pseudo_node = group_wise_pseudo_nodes[node]
    # watermark_pseudo_node stamps the pseudo node in place and returns it
    watermarked_node = watermark_pseudo_node(
        pseudo_node,
        private_key,
        str(watermark_identity[i]),
        [str(pseudo_node["birthMonth"]), str(pseudo_node["birthYear"])],
        max_num_fields
    )
    # Append the watermarked node to the list
    watermarked_pseudo_nodes.append(watermarked_node)

    # Map the unique random number to the hashed value stored on the node
    id_list[watermark_identity[i]] = watermarked_node['hashed_secret']

# Print the resulting id_list
print("ID List:")
for key, value in id_list.items():
    print(f"Unique Random Number: {key}, Hashed Watermarked Value: {value}")


# Append watermarked pseudo nodes to their respective groups in list_dict
for key, group in list_dict.items():
    # Group keys are the stringified positions used when list_dict was built
    watermarked_node = watermarked_pseudo_nodes[int(key)]

    # Re-running this cell hands back the same pseudo-node object; only append it
    # when the group does not already hold it, so groups never collect duplicates.
    if not any(member is watermarked_node for member in group):
        group.append(watermarked_node)

# Print the updated list_dict
print("Updated list_dict with watermarked pseudo nodes:")
for group_key, group_value in list_dict.items():
    print(f"Group {group_key}: {group_value}")

# Flatten the groups into one list; full_data holds every record and pseudo node
full_data = list(chain.from_iterable(list_dict.values()))

print(full_data)

Unique random numbers: [4823, 8065, 7043, 2570, 7055, 3571, 6072, 7997, 553, 1547, 1222, 36, 8078, 905, 7874, 3774, 7263, 9567, 5165, 9576, 3332, 7865, 3018, 7343, 5559, 3055, 3731, 5000, 7534, 4826, 8998, 1978, 2911, 6044, 7430, 6840, 5372, 659, 5548, 1224, 163, 4449, 5179, 153, 9794, 3341, 161, 3355, 7476, 401, 8982, 3664, 3768, 5147, 5872, 7799, 4085, 6854, 9433, 4766, 7021, 2607, 501, 4665, 6161, 3825, 5907, 3099, 3813, 6124, 6099, 432, 5688, 1529, 3182, 4387, 3079, 4577, 463, 7485, 1124, 5202, 5383, 7630, 4122, 4400, 3750, 7178, 5721, 2564, 6379, 8707, 9910, 266, 239, 5198, 7648, 323, 213, 1784, 1661, 8272, 3120, 3868, 1898, 4696, 9137, 3473, 5472, 9275, 3366, 9194, 8916, 548, 3584, 8171, 4094, 6743, 7792, 2788, 7690, 7375, 1285, 6128, 7764, 5089, 9851, 2488, 192, 8931, 1311, 9601, 4781, 3863, 8534, 703, 8831, 5888, 9489, 4328, 5158, 9777, 4989, 2829, 146, 3907, 6633, 488, 9405, 7885, 1115, 5137, 5803, 6481, 480, 8120, 5611, 9825, 765, 3116, 2409, 1578, 2374, 5931, 7497, 3578, 693

^C


Collecting embed
  Downloading embed-0.3.0-py3-none-any.whl.metadata (4.5 kB)
Collecting infinity_emb==0.0.58 (from infinity_emb[audio,optimum,torch,vision]==0.0.58->embed)
  Downloading infinity_emb-0.0.58-py3-none-any.whl.metadata (20 kB)
Collecting hf_transfer>=0.1.5 (from infinity_emb==0.0.58->infinity_emb[audio,optimum,torch,vision]==0.0.58->embed)
  Downloading hf_transfer-0.1.8-cp312-none-win_amd64.whl.metadata (1.8 kB)
Collecting huggingface_hub (from infinity_emb==0.0.58->infinity_emb[audio,optimum,torch,vision]==0.0.58->embed)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting optimum>=1.16.2 (from optimum[onnxruntime]>=1.16.2; extra == "optimum" or extra == "all"->infinity_emb[audio,optimum,torch,vision]==0.0.58->embed)
  Downloading optimum-1.23.3-py3-none-any.whl.metadata (20 kB)
Collecting sentence-transformers<4.0.0,>=3.0.1 (from infinity_emb[audio,optimum,torch,vision]==0.0.58->embed)
  Downloading sentence_transformers-3.3.1-py3-none-any.

  You can safely remove it manually.

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:
import random
from faker import Faker
# from embed import *

# Faker instance used to produce the made-up names
fake = Faker()

# City names that fake records can be assigned
cities = [
    "New York", "Los Angeles", "Chicago", "San Francisco", "Austin", "Boston",
    "Seattle", "Denver", "Miami", "Dallas", "Portland", "Houston",
    "Phoenix", "Philadelphia", "San Diego", "Atlanta", "Orlando", "Nashville",
]

# Occupations that fake records can be assigned
occupations = [
    "Engineer", "Designer", "Artist", "Manager",
    "Consultant", "Chef", "Photographer", "Nurse",
    "Software Developer", "Researcher", "Teacher", "Architect",
    "Analyst", "Lawyer", "Musician", "Event Planner",
]

# Function to generate a dataset mixing real records with fake ones
def create_random_data_with_real(num_entries, original_data, cities, occupations, real_fraction=0.1):
    """Build a list of `num_entries` records mixing real and fake entries.

    Args:
        num_entries: Total number of records to return.
        original_data: Sequence of real records to draw from (with replacement).
            The drawn records are the same objects, not copies.
        cities: Sequence of city names used for fake records.
        occupations: Sequence of occupations used for fake records.
        real_fraction: Share of `num_entries` taken from `original_data`,
            between 0 and 1. Defaults to 0.1 (10% real, 90% fake).

    Returns:
        A list with the real records first, followed by the fake ones.

    Raises:
        ValueError: If `real_fraction` is outside [0, 1].
        IndexError: If real records are requested but `original_data` is empty
            (raised by `random.choice`).
    """
    if not 0 <= real_fraction <= 1:
        raise ValueError(f"real_fraction must be between 0 and 1, got {real_fraction!r}")

    original_data_count = int(num_entries * real_fraction)
    fake_data_count = num_entries - original_data_count
    hobbies = ["Photography", "Reading", "Traveling", "Cooking", "Hiking", "Gaming", "Dancing"]

    # Real share: sampled with replacement from the original records
    random_data = [random.choice(original_data) for _ in range(original_data_count)]

    # Fake share: generated field by field
    for _ in range(fake_data_count):
        random_data.append({
            "name": fake.first_name(),
            "age": random.randint(18, 60),
            "city": random.choice(cities),
            "occupation": random.choice(occupations),
            "salary": random.randint(40, 150) * 1000,  # whole thousands, 40,000-150,000
            "married": random.choice([True, False]),
            "hobby": random.choice(hobbies),
        })

    return random_data

# Generate 1,000 data entries mixing both fake and real data
randomized_data = create_random_data_with_real(
    num_entries=1000,
    original_data=full_data,
    cities=cities,
    occupations=occupations,
)

print(len(randomized_data))
# Show just the first five entries as a quick check
first_entries = randomized_data[:5]
for entry in first_entries:
    print(entry)


def validate_watermark_ids(records, id_list, private_key, watermark_identity, max_num_fields):
    """Check whether any record carries a watermark that matches `id_list`.

    For each record that has a 'watermark_id' known to `id_list`, the hash is
    recomputed from the record's 'birthMonth' and 'birthYear' and compared with
    the value stored in `id_list`. The search stops at the first match.

    Args:
        records: Iterable of dict-like records to inspect. Records are not
            modified; the hash is recomputed on a copy.
        id_list: Mapping of watermark id to expected hashed value. Keys are
            compared by their string form.
        private_key: Secret key passed to `watermark_pseudo_node`.
        watermark_identity: Unused; kept so existing callers do not have to change.
        max_num_fields: Modulus passed to `watermark_pseudo_node`.

    Returns:
        True if a valid watermark was found ("Valid watermark" is printed),
        otherwise False ("No Valid watermark" is printed once).
    """
    # Normalise keys to strings so membership test and lookup always agree
    expected_by_id = {str(key): value for key, value in id_list.items()}

    for record in records:
        if 'watermark_id' not in record:
            continue
        watermark_id = str(record['watermark_id'])
        if watermark_id not in expected_by_id:
            continue
        # A record without the hashed attributes cannot carry a valid watermark
        if "birthMonth" not in record or "birthYear" not in record:
            continue

        # Recompute on a copy: watermark_pseudo_node writes into the dict it is
        # given, and validation must not alter the data being checked.
        record_hash = watermark_pseudo_node(
            dict(record),
            private_key,
            watermark_id,
            [str(record["birthMonth"]), str(record["birthYear"])],
            max_num_fields,
        )
        if record_hash['hashed_secret'] == expected_by_id[watermark_id]:
            print("Valid watermark")
            return True

    print("No Valid watermark")
    return False


# Example usage: look for a valid watermark in the mixed real/fake dataset
validate_watermark_ids(
    records=randomized_data,
    id_list=id_list,
    private_key=private_key,
    watermark_identity=watermark_identity,
    max_num_fields=max_num_fields,
)

1000
{'birthMonth': '1', 'nationality': 'American', 'birthYear': '1996', 'name': 'Ms Kimberly Abadir', 'countryOfResidence': 'United States'}
{'birthMonth': '8', 'nationality': 'American', 'birthYear': '1979', 'name': 'Ms. Milcah E. Ferguson', 'countryOfResidence': 'United Kingdom'}
{'birthMonth': '4', 'nationality': 'American', 'birthYear': '1974', 'name': 'Mr Eric Charles Degolier', 'countryOfResidence': 'United Kingdom'}
{'birthMonth': '10', 'nationality': 'American', 'birthYear': '1973', 'name': 'Ms Lisa Buros', 'countryOfResidence': 'England'}
{'birthMonth': '4', 'birthYear': '1970', 'nationality': 'European American', 'hashed_secret': 289, 'watermark_id': '7384'}
Valid watermark


In [33]:
# Cypher query returning each Company node's companyNumber as `primaryKey`.
query = """
MATCH (variable:Company)
RETURN variable.companyNumber AS primaryKey
"""

result = graph.run(query)

# Collect the key column of every returned row
primary_keys = []
for record in result:
    primary_keys.append(record["primaryKey"])
print(primary_keys)

['09015410', '10098027', '03779692', '05148583', '09662530', '04849968', '04179322', '07225997', '10259547', '03585991', '08594648', '04794215', '10260027', '06913548', 'SC345188', '10260365', '10260498', '10260541', '01442908', 'SC453898', '06944392', '02832781', '10204304', '08591941', '10246838', '06778710', '10261448', '10261489', '10261621', '10261689', '01196875', '04475384', '06301369', '03797902', '05484577', '05167720', '09109166', '10261787', '10262075', 'SC220936', '10262141', '10262398', '10262649', '10262763', '10262858', '10262898', '10262957', '10262982', '10263074', '10263079', '10263097', '03601217', '06240919', '09113179', '10263351', '10263374', '10263375', '10263449', '10263569', '10263619', '10263622', '10263696', '10263732', '08128917', '10263982', '04810603', '10264211', '10264270', '06298530', '04425577', '06638475', '10264713', '10264735', '10264873', '08597960', '10265140', '09111510', '04460911', '10265817', '00239143', '08431816', '10265875', '07464159', '05

In [None]:
def validate_watermark_all(self, id_list, attributes):
    """Count the records in `self.data` whose watermark matches `id_list`.

    A record is considered only if it has a 'company_id' whose string form is
    a key of `id_list`. Its hash is recomputed with
    `self.embed.watermark_pseudo_node` and compared with the stored value.

    Args:
        id_list: Mapping of watermark id to expected hashed value. Keys are
            compared by their string form, so int and str keys both work.
        attributes: Passed through to `self.embed.watermark_pseudo_node`.

    Returns:
        The number of records whose recomputed hash equals the expected one.
    """
    # Normalise keys to strings so membership test and lookup always agree;
    # looking up the raw mapping with a str id fails for non-string keys.
    expected_by_id = {str(key): value for key, value in id_list.items()}
    pseudo_node_count = 0

    for record in self.data:
        # Only records that carry a 'company_id' can be watermarked
        if 'company_id' not in record:
            continue
        watermark_id = str(record['company_id'])
        if watermark_id not in expected_by_id:
            continue

        # The helper returns a pair; only the integer hash is needed here
        _, hashed_secret_int = self.embed.watermark_pseudo_node(
            record, watermark_id, "company_id", attributes
        )
        if hashed_secret_int == expected_by_id[watermark_id]:
            pseudo_node_count += 1

    return pseudo_node_count

In [None]:
def verify_watermark(data, id_list, private_key, watermark_identity, max_num_fields):
    """
    Tell whether at least one record in `data` carries a valid watermark.

    Records without a 'company_id' are skipped. Every other record is checked
    on its own through `validate_watermark_all`, and the scan stops at the
    first record for which that call reports a count above zero.
    """
    # NOTE(review): the validate_watermark_all defined in this notebook takes
    # (self, id_list, attributes), but five positional arguments are passed
    # here — confirm which helper/signature is intended.
    return any(
        validate_watermark_all([record], id_list, private_key, watermark_identity, max_num_fields) > 0
        for record in data
        if 'company_id' in record
    )

In [None]:
def perform_deletion_attack(data, id_list, private_key, watermark_identity, max_num_fields, step=1):
    """
    Delete random records until the watermark can no longer be verified.

    While `data` is non-empty and `verify_watermark` still succeeds, up to
    `step` randomly chosen records are removed from `data` (in place), then the
    share of deleted records and the count reported by `validate_watermark_all`
    are recorded and printed.

    Returns two parallel lists: the percentage of records deleted after each
    round, and the number of valid watermarks detected after each round.
    """
    # NOTE(review): the validate_watermark_all defined in this notebook takes
    # (self, id_list, attributes), but five positional arguments are passed
    # below — confirm which helper/signature is intended.
    total_records = len(data)
    deleted_nodes_percentage, valid_watermarks_detected = [], []
    iterations = 0

    while True:
        still_verifiable = data and verify_watermark(
            data, id_list, private_key, watermark_identity, max_num_fields
        )
        if not still_verifiable:
            break

        # Remove up to `step` records, stopping early once nothing is left
        for _ in range(step):
            if not data:
                break
            doomed_index = random.randint(0, len(data) - 1)
            data.pop(doomed_index)

        # Share of the initial records removed so far
        removed_so_far = total_records - len(data)
        deleted_nodes_percentage.append((removed_so_far / total_records) * 100)

        # How many watermarks are still detectable in what remains
        valid_watermarks_records = validate_watermark_all(
            data, id_list, private_key, watermark_identity, max_num_fields
        )
        valid_watermarks_detected.append(valid_watermarks_records)

        iterations += 1
        print(f"Iteration {iterations}: Remaining records = {len(data)}, Valid Watermarks Detected = {valid_watermarks_records}")

    print(f"Watermark verification failed after {iterations} iterations.")
    return deleted_nodes_percentage, valid_watermarks_detected

# Run the deletion attack on a copy, to preserve the original dataset
remaining_data = data.copy()
deleted_nodes_percentage, valid_watermarks_detected = perform_deletion_attack(
    remaining_data, id_list, private_key, watermark_identity, max_num_fields, step=2
)

# Plot the results on an explicit figure/axes pair rather than pyplot's
# implicit current figure, so the cell does not depend on plotting state
# left behind by other cells.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(deleted_nodes_percentage, valid_watermarks_detected, marker='o', color='b')

# Adding labels and title
ax.set_title('Effect of Deletion Attack on Valid Watermarks Detection')
ax.set_xlabel('Percentage of Deleted Nodes (%)')
ax.set_ylabel('Number of Valid Watermarks Detected')
ax.grid(True)
plt.show()