In [4]:
import pandas as pd
import random
from faker import Faker

# Shared Faker generator used by every data-generating helper below.
# The 'en_CA' locale is selected so generated values look Canadian.
fake = Faker('en_CA')

# Two-letter codes for the Canadian provinces and territories, in alphabetical order
provinces = [
    "AB",  # Alberta
    "BC",  # British Columbia
    "MB",  # Manitoba
    "NB",  # New Brunswick
    "NL",  # Newfoundland and Labrador
    "NS",  # Nova Scotia
    "NT",  # Northwest Territories
    "NU",  # Nunavut
    "ON",  # Ontario
    "PE",  # Prince Edward Island
    "QC",  # Quebec
    "SK",  # Saskatchewan
    "YT",  # Yukon
]

# Function to generate realistic business data
def generate_realistic_data(num_records: int) -> pd.DataFrame:
    """Build a DataFrame of ``num_records`` synthetic business records.

    Every record takes its company name, postal code and city from the
    module-level ``fake`` generator and a province code picked at random from
    ``provinces``. The province is drawn independently of the other fields.

    Args:
        num_records: Number of rows to generate.

    Returns:
        DataFrame with the columns ``business_name``, ``postal_code``,
        ``city`` and ``province``.
    """
    column_names = ['business_name', 'postal_code', 'city', 'province']
    # The list literal is evaluated left to right, so the generators are
    # consumed in the order: company, postal code, city, province.
    records = [
        [fake.company(), fake.postalcode(), fake.city(), random.choice(provinces)]
        for _ in range(num_records)
    ]
    return pd.DataFrame(records, columns=column_names)

# Function to introduce controlled variations (true matches)
def introduce_variations(df: pd.DataFrame, variation_rate: float = 0.1) -> pd.DataFrame:
    """Return a copy of ``df`` in which a random subset of rows is perturbed.

    For each row one uniform random number is drawn; when it falls below
    ``variation_rate`` the ``business_name``, ``postal_code`` and ``city``
    values are passed through ``vary_string`` (with its default error rate)
    while ``province`` is carried over untouched. All other rows are copied
    as they are. Row order is preserved and the result gets a fresh default
    index.

    Args:
        df: Source records; read via the four column names listed below.
        variation_rate: Per-row threshold for applying the perturbation.

    Returns:
        New DataFrame with the columns ``business_name``, ``postal_code``,
        ``city`` and ``province`` and one row per input row.
    """
    output_columns = ['business_name', 'postal_code', 'city', 'province']
    rows_out = []
    for _, record in df.iterrows():
        # Guard clause: rows that are not selected pass through unchanged.
        if not (random.random() < variation_rate):
            rows_out.append(record.tolist())
            continue
        rows_out.append([
            vary_string(record['business_name']),
            vary_string(record['postal_code']),
            vary_string(record['city']),
            record['province'],  # province is deliberately left as-is
        ])
    return pd.DataFrame(rows_out, columns=output_columns)

# Function to vary strings slightly (simulate typos, abbreviations, etc.)
def vary_string(s: str, error_rate: float = 0.1) -> str:
    """Return ``s`` with some characters swapped for random lowercase letters.

    Each position is considered independently: one uniform random number is
    drawn, and when it is below ``error_rate`` the character at that position
    (whatever it is, including digits, spaces and punctuation) is replaced by
    a letter chosen at random from ``a``-``z``. The length never changes, and
    the chosen letter may coincide with the original character.

    Args:
        s: Text to perturb.
        error_rate: Per-character threshold for replacement.

    Returns:
        The perturbed string, same length as ``s``.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # The condition is evaluated before the replacement is drawn, so the
    # random stream is consumed as: threshold draw, then (optionally) a letter.
    return ''.join(
        random.choice(letters) if random.random() < error_rate else original_char
        for original_char in s
    )

# Function to create false matches (completely different synthetic records)
def generate_false_matches(num_records: int) -> pd.DataFrame:
    """Return ``num_records`` freshly generated records to act as non-matches.

    This simply delegates to ``generate_realistic_data``; the rows are produced
    without reference to any existing dataset.

    Args:
        num_records: Number of rows to generate.

    Returns:
        DataFrame in the same layout as ``generate_realistic_data`` returns.
    """
    unrelated_records = generate_realistic_data(num_records)
    return unrelated_records

# Main function to generate the final dataset
def generate_synthetic_dataset(
    num_records: int,
    variation_rate: float = 0.1,
    false_match_ratio: float = 0.5,
    seed: "int | None" = None,
) -> pd.DataFrame:
    """Generate one shuffled dataset of perturbed and unrelated business records.

    The result is the concatenation of

    * ``num_records`` rows obtained by running ``introduce_variations`` over a
      freshly generated base dataset, and
    * ``int(num_records * false_match_ratio)`` additional, independently
      generated rows from ``generate_false_matches``,

    shuffled into random order with a fresh 0..n-1 index.

    NOTE: the unperturbed base records are not part of the output and no
    column marks which rows were perturbed or which are false matches.

    Args:
        num_records: Number of base records to generate.
        variation_rate: Per-row probability threshold passed to
            ``introduce_variations``.
        false_match_ratio: Number of unrelated rows to add, expressed as a
            fraction of ``num_records`` (truncated towards zero).
        seed: Optional seed for reproducible output. When given, it seeds the
            global ``random`` module, the module-level ``fake`` generator and
            the final shuffle; it must be an integer that pandas accepts as
            ``random_state`` (0 <= seed < 2**32). When ``None`` (default)
            nothing is seeded and every call yields different data.

    Returns:
        DataFrame with the columns ``business_name``, ``postal_code``,
        ``city`` and ``province``.
    """
    if seed is not None:
        # Seed every randomness source this pipeline touches so a
        # Restart-and-Run-All reproduces the same rows in the same order.
        random.seed(seed)
        fake.seed_instance(seed)

    # Generate base dataset
    base_df = generate_realistic_data(num_records)

    # Introduce variations for true matches
    true_matches_df = introduce_variations(base_df, variation_rate)

    # Generate false matches
    num_false_matches = int(num_records * false_match_ratio)
    false_matches_df = generate_false_matches(num_false_matches)

    # Combine true matches and false matches under a single fresh index
    combined_df = pd.concat([true_matches_df, false_matches_df], ignore_index=True)

    # Shuffle so that perturbed and unrelated rows are interleaved
    return combined_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Build the dataset: 1000 base records plus 50% additional unrelated records
synthetic_dataset = generate_synthetic_dataset(
    num_records=1000,
    variation_rate=0.2,
    false_match_ratio=0.5,
)

# Persist to CSV without the index column
synthetic_dataset.to_csv('synthetic_business_dataset.csv', index=False)

# Show the (truncated) frame in the cell output
print(synthetic_dataset)


                       business_name postal_code            city province
0                      Copeland-Cook     R1J 6V1  North Annmouth       NS
1                       Smith-Hoover     R9H 6P1  East Kaylafurt       BC
2     Campoell, Frankyol and Elliktt     PxY 2A5      Port Jakes       NS
3        Herrera, Castillo and Terry      P3N6Y4   West Michelle       BC
4                     Beard and Sons     RdS 9L7    cesz Daleton       BC
...                              ...         ...             ...      ...
1495      Powell, Barrett and Gamble      X9H4A6     Baileyhaven       PE
1496                  Nguyen-Jackson      S4J9L1   West Jennifer       PE
1497                      Miller Ltd      L4L3J7     Port Ashley       YT
1498                   Parker-Graves     G5X 8Y1   South Zachary       BC
1499                       Mason PLC      R2M8T4  Alexanderhaven       SK

[1500 rows x 4 columns]


In [None]:
Actually, I just realized this may not be the best way to compare word-embedding models to traditional methods: the advantage of word embeddings is that they understand context, and if we have

In [6]:
import pandas as pd
import random
from faker import Faker

# Shared Faker generator used by every data-generating helper below.
# The 'en_CA' locale is selected so generated values look Canadian.
fake = Faker('en_CA')

# Two-letter codes for the Canadian provinces and territories, in alphabetical order
provinces = [
    "AB",  # Alberta
    "BC",  # British Columbia
    "MB",  # Manitoba
    "NB",  # New Brunswick
    "NL",  # Newfoundland and Labrador
    "NS",  # Nova Scotia
    "NT",  # Northwest Territories
    "NU",  # Nunavut
    "ON",  # Ontario
    "PE",  # Prince Edward Island
    "QC",  # Quebec
    "SK",  # Saskatchewan
    "YT",  # Yukon
]

# Function to generate realistic business data
def generate_realistic_data(num_records: int) -> pd.DataFrame:
    """Build a DataFrame of ``num_records`` synthetic business records.

    Every record takes its company name, postal code and city from the
    module-level ``fake`` generator and a province code picked at random from
    ``provinces``. The province is drawn independently of the other fields.

    Args:
        num_records: Number of rows to generate.

    Returns:
        DataFrame with the columns ``business_name``, ``postal_code``,
        ``city`` and ``province``.
    """
    column_names = ['business_name', 'postal_code', 'city', 'province']
    # The list literal is evaluated left to right, so the generators are
    # consumed in the order: company, postal code, city, province.
    records = [
        [fake.company(), fake.postalcode(), fake.city(), random.choice(provinces)]
        for _ in range(num_records)
    ]
    return pd.DataFrame(records, columns=column_names)

# Function to introduce controlled variations (true matches)
def introduce_variations(df: pd.DataFrame, variation_rate: float = 0.1) -> pd.DataFrame:
    """Return a copy of ``df`` in which a random subset of rows is perturbed.

    For each row one uniform random number is drawn; when it falls below
    ``variation_rate`` the ``business_name``, ``postal_code`` and ``city``
    values are passed through ``vary_string`` (with its default error rate)
    while ``province`` is carried over untouched. All other rows are copied
    as they are. Row order is preserved and the result gets a fresh default
    index.

    Args:
        df: Source records; read via the four column names listed below.
        variation_rate: Per-row threshold for applying the perturbation.

    Returns:
        New DataFrame with the columns ``business_name``, ``postal_code``,
        ``city`` and ``province`` and one row per input row.
    """
    output_columns = ['business_name', 'postal_code', 'city', 'province']
    rows_out = []
    for _, record in df.iterrows():
        # Guard clause: rows that are not selected pass through unchanged.
        if not (random.random() < variation_rate):
            rows_out.append(record.tolist())
            continue
        rows_out.append([
            vary_string(record['business_name']),
            vary_string(record['postal_code']),
            vary_string(record['city']),
            record['province'],  # province is deliberately left as-is
        ])
    return pd.DataFrame(rows_out, columns=output_columns)

# Function to vary strings slightly (simulate typos, abbreviations, etc.)
def vary_string(s: str, error_rate: float = 0.1) -> str:
    """Return ``s`` with some characters swapped for random lowercase letters.

    Each position is considered independently: one uniform random number is
    drawn, and when it is below ``error_rate`` the character at that position
    (whatever it is, including digits, spaces and punctuation) is replaced by
    a letter chosen at random from ``a``-``z``. The length never changes, and
    the chosen letter may coincide with the original character.

    Args:
        s: Text to perturb.
        error_rate: Per-character threshold for replacement.

    Returns:
        The perturbed string, same length as ``s``.
    """
    letters = 'abcdefghijklmnopqrstuvwxyz'
    # The condition is evaluated before the replacement is drawn, so the
    # random stream is consumed as: threshold draw, then (optionally) a letter.
    return ''.join(
        random.choice(letters) if random.random() < error_rate else original_char
        for original_char in s
    )

# Function to create false matches (completely different synthetic records)
def generate_false_matches(num_records: int) -> pd.DataFrame:
    """Return ``num_records`` freshly generated records to act as non-matches.

    This simply delegates to ``generate_realistic_data``; the rows are produced
    without reference to any existing dataset.

    Args:
        num_records: Number of rows to generate.

    Returns:
        DataFrame in the same layout as ``generate_realistic_data`` returns.
    """
    unrelated_records = generate_realistic_data(num_records)
    return unrelated_records

# Main function to generate two datasets for record linkage
def generate_linkage_datasets(
    base_records: int,
    variation_rate: float = 0.1,
    false_match_ratio: float = 0.5,
    seed: "int | None" = None,
    id_column: "str | None" = None,
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Generate two independently perturbed, shuffled datasets for record linkage.

    Both datasets are derived from the same freshly generated base records:
    each gets its own pass of ``introduce_variations`` over the base frame,
    plus ``int(base_records * false_match_ratio)`` unrelated records of its
    own from ``generate_false_matches``. Each dataset is then shuffled
    independently and given a fresh 0..n-1 index.

    Args:
        base_records: Number of shared base records (the true matches).
        variation_rate: Per-row probability threshold passed to
            ``introduce_variations``.
        false_match_ratio: Number of unrelated rows added to each dataset,
            expressed as a fraction of ``base_records`` (truncated towards
            zero).
        seed: Optional seed for reproducible output. When given, it seeds the
            global ``random`` module, the module-level ``fake`` generator and
            both shuffles; it must be an integer such that ``seed`` and
            ``seed + 1`` are accepted by pandas as ``random_state``. When
            ``None`` (default) nothing is seeded.
        id_column: Optional name of a ground-truth id column to add. When
            given, rows that come from the same base record carry the same id
            in both datasets, and every unrelated row gets an id that occurs
            in only one dataset. When ``None`` (default) no column is added
            and the output schema is unchanged.

    Returns:
        ``(dataset1, dataset2)``, each with the columns ``business_name``,
        ``postal_code``, ``city`` and ``province`` (plus ``id_column`` when
        requested).
    """
    if seed is not None:
        # Seed every randomness source this pipeline touches so the notebook
        # reproduces the same rows on a fresh kernel.
        random.seed(seed)
        fake.seed_instance(seed)

    # Generate base dataset
    base_df = generate_realistic_data(base_records)

    # Introduce variations for true matches in both datasets
    true_matches_df1 = introduce_variations(base_df, variation_rate)
    true_matches_df2 = introduce_variations(base_df, variation_rate)

    # Generate false matches
    num_false_matches = int(base_records * false_match_ratio)
    false_matches_df1 = generate_false_matches(num_false_matches)
    false_matches_df2 = generate_false_matches(num_false_matches)

    if id_column is not None:
        # introduce_variations emits exactly one row per input row, in order,
        # so position i in both "true" frames refers to base record i. Tag the
        # rows now, because the link is unrecoverable after shuffling.
        n_true = len(base_df)
        n_false1 = len(false_matches_df1)
        n_false2 = len(false_matches_df2)
        true_matches_df1[id_column] = list(range(n_true))
        true_matches_df2[id_column] = list(range(n_true))
        # Disjoint id ranges guarantee unrelated rows never pair up by id.
        false_matches_df1[id_column] = list(range(n_true, n_true + n_false1))
        false_matches_df2[id_column] = list(
            range(n_true + n_false1, n_true + n_false1 + n_false2)
        )

    # Combine true matches and false matches for both datasets
    dataset1 = pd.concat([true_matches_df1, false_matches_df1], ignore_index=True)
    dataset2 = pd.concat([true_matches_df2, false_matches_df2], ignore_index=True)

    # Shuffle each dataset. The two shuffles use different seeds: both frames
    # have the same length, so an identical permutation would leave matching
    # rows at identical positions.
    second_shuffle_seed = None if seed is None else seed + 1
    dataset1 = dataset1.sample(frac=1, random_state=seed).reset_index(drop=True)
    dataset2 = dataset2.sample(frac=1, random_state=second_shuffle_seed).reset_index(drop=True)

    return dataset1, dataset2

# Build both sides of the linkage problem from 1000 shared base records
dataset1, dataset2 = generate_linkage_datasets(
    base_records=1000,
    variation_rate=0.2,
    false_match_ratio=0.5,
)

# Write each dataset to its own CSV file, without the index column
dataset1.to_csv('linkage_dataset1.csv', index=False)
dataset2.to_csv('linkage_dataset2.csv', index=False)

# Preview the first rows of each dataset (label, newline, then the frame)
print("Dataset 1:", dataset1.head(), sep="\n")
print("\nDataset 2:", dataset2.head(), sep="\n")


Dataset 1:
               business_name postal_code               city province
0            Collins-Simpson     A7L 1L6  Port Rebekahhaven       PE
1             mefersfn-Brswn      f4M9P8     sbw Beverlyttn       PE
2  Perez, Howard and Pearson     C9H 2C1       Matthewhaven       YT
3                Moss-Becker     K8B 4A9    North Jasonfurt       AB
4                Roman-Black     K1V 5R7         Mayborough       PE

Dataset 2:
              business_name postal_code             city province
0  Smith, Patton and Morgan     S5C 8E7       Medinabury       ON
1                   Cox LLC      J2Y1P3     North Robert       NU
2    Mcclure, Gill and Rose     N4J 6P6      Jenkinsbury       AB
3            Price and Sons     A3B 7M8  Lake Aaronmouth       SK
4                 Price Ltd      V1R3J2       Craigmouth       NT
