In [None]:
import pandas as pd
import numpy as np
from faker import Faker

#Setup faker for data generation
fake = Faker()

#Function to create realistic customer data
def create_person_data(num_people, shared_ids=None):
    """Build a DataFrame of synthetic person records.

    Args:
        num_people: Number of rows to generate.
        shared_ids: Optional iterable of IDs to assign, in order, to the
            first generated people. Once exhausted, the remaining people
            keep their randomly drawn ID. The argument itself is never
            modified.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Name``, ``DOB``, ``ID``,
        ``Address`` and ``Country``. The columns are present even when
        ``num_people`` is 0.
    """
    columns = ["Name", "DOB", "ID", "Address", "Country"]

    # Work on a private snapshot: the default must not be a shared mutable
    # list, and the caller's list must not be drained as a side effect.
    pending_ids = iter([] if shared_ids is None else list(shared_ids))

    data = []
    for _ in range(num_people):
        # A random ID is always drawn first (upper bound is exclusive in
        # numpy), then replaced by the next preset ID while any remain.
        person_id = np.random.randint(1000000, 9999999)
        person_id = next(pending_ids, person_id)
        person = {
            "Name": fake.name(),
            "DOB": fake.date_of_birth(minimum_age=16, maximum_age=120),
            "ID": person_id,
            "Address": fake.address(),
            "Country": fake.country(),
        }
        data.append(person)

    # Explicit columns keep the schema stable for an empty result, so that
    # e.g. ``df['ID']`` still works when no rows were generated.
    return pd.DataFrame(data, columns=columns)

#Function to create suspect data with some shared IDs
def create_suspect_data(num_suspects, shared_ids):
    """Build a DataFrame of synthetic suspect records.

    The first 10% of suspects (rounded down) reuse IDs taken, in order,
    from ``shared_ids``; every other suspect gets a random ID.

    Args:
        num_suspects: Number of rows to generate.
        shared_ids: Sequence of IDs to reuse for the leading suspects. If it
            holds fewer IDs than 10% of ``num_suspects``, only the available
            ones are reused and the rest fall back to random IDs.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Name``, ``DOB``, ``ID``,
        ``Address`` and ``Country``. The columns are present even when
        ``num_suspects`` is 0.
    """
    columns = ["Name", "DOB", "ID", "Address", "Country"]

    # 10% of the suspects should carry an ID from shared_ids. Clamp to the
    # number of IDs actually supplied so a short list cannot raise IndexError.
    num_shared = min(int(0.1 * num_suspects), len(shared_ids))

    data = []
    for i in range(num_suspects):
        if i < num_shared:
            person_id = shared_ids[i]
        else:
            # Upper bound is exclusive in numpy's randint.
            person_id = np.random.randint(1000000, 9999999)
        person = {
            "Name": fake.name(),
            "DOB": fake.date_of_birth(minimum_age=16, maximum_age=120),
            "ID": person_id,
            "Address": fake.address(),
            "Country": fake.country(),
        }
        data.append(person)

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(data, columns=columns)

#Clear text matching function for individual
def match_plain(tar, sus):
    """Compare one person ID with one suspect ID in clear text.

    Returns a human-readable message when ``tar == sus`` holds, otherwise
    ``None``.
    """
    is_match = tar == sus
    return (
        f"Person ID {tar} has matched with suspect ID: {sus}"
        if is_match
        else None
    )

#Clear text matching for list
def listmatch_plain(tar_list, sus_list):
    """Compare every person ID against every suspect ID in clear text.

    Each (person, suspect) pair is passed to ``match_plain``; the truthy
    results are returned as a list, in person-major order.
    """
    # Lazily evaluate all pairings, outer loop over people, inner over suspects.
    outcomes = (
        match_plain(person_id, suspect_id)
        for person_id in tar_list
        for suspect_id in sus_list
    )
    # Non-matching pairs yield a falsy result and are dropped.
    return [message for message in outcomes if message]

#Sample sizes for the demo run
num_people = 100
num_suspects = 10

#Pre-draw the IDs that both data sets will have in common (10% of the suspects)
shared_ids = [
    np.random.randint(1000000, 9999999)
    for _ in range(int(0.1 * num_suspects))
]

#Build both data sets; the customer generator receives its own copy so the
#list handed to the suspect generator is left untouched
customer_data = create_person_data(num_people, list(shared_ids))
suspect_data = create_suspect_data(num_suspects, shared_ids)

#Pull the ID columns out as plain Python lists for the matcher
customer_ids = customer_data["ID"].tolist()
suspect_ids = suspect_data["ID"].tolist()

#Run the clear-text comparison and report every hit, one per line
plain_matches = listmatch_plain(customer_ids, suspect_ids)
print("Plain Text Matches:")
for match in plain_matches:
    print(match)


Plain Text Matches:
Person ID 3757234 has matched with suspect ID: 3757234
