In [1]:
import pseudopeople as psp
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from fuzzywuzzy import fuzz

In [5]:
df = pd.read_parquet("decennial_census.parquet")

In [6]:
df['address'] = df['street_number'].astype(str) + ' ' + df['street_name'].astype(str) + df['unit_number'].astype(str).fillna('').apply(lambda x: ' Unit ' + x if x else '')
# Improved regex to remove "nan" and preceding " Unit "
df['address'] = df['address'].str.replace(r'\s*Unit\s*nan\b', '', case=False, regex=True).str.strip()

# Combine first and last names
df['full_name'] = df['first_name'].astype(str) + ' ' + df['last_name'].astype(str)
df.drop(['first_name', 'last_name', 'street_number', 'street_name', 'unit_number'], axis=1, inplace=True) # Drop individual name columns and address


# Convert categorical features to numerical using Label Encoding
# le = LabelEncoder()
# for col in ['sex', 'state', 'race_ethnicity', 'full_name']:
#     df[col] = df[col].astype(str)
#     df[col] = le.fit_transform(df[col])

df.dropna(inplace=True)  # Remove rows with NaNs
# print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

In [7]:
df.full_name

31              Clara Amaya
34       Elan Alonso Tellez
36          Kaylee Castillo
40          Caleb Rodriguez
41          Henry Rodriguez
                ...        
33257         Iliana Buskey
33277          Amare Harper
33308        Xavier Atchley
33313          Jace Gaspard
33318       Nolan Hernandez
Name: full_name, Length: 4279, dtype: object

In [8]:
# Convert date of birth to datetime and then to a numerical representation (e.g., timestamp)
df['date_of_birth'] = pd.to_datetime(df['date_of_birth'])
# df['date_of_birth'] = df['date_of_birth'].astype(np.int64) // 10**9 # Convert to Unix timestamp

In [11]:
pairs_data = []
for i in range(len(df)):
    for j in range(i + 1, len(df)):  # Avoid duplicate pairs and self-comparisons
        record1 = df.iloc[i]
        record2 = df.iloc[j]

        # Calculate Fuzzy Matching Scores
        name_similarity = fuzz.ratio(record1['full_name'], record2['full_name'])
        address_similarity = fuzz.ratio(record1['address'], record2['address'])
        city_similarity = fuzz.ratio(record1['city'], record2['city'])

        # Determine Match Label (replace with your actual logic)
        # Example: If all similarities are above a threshold, consider it a match.
        threshold = 80  # Example threshold
        if name_similarity > threshold and address_similarity > threshold and city_similarity > threshold and record1['date_of_birth'] == record2['date_of_birth']:
            match = 1
        else:
            match = 0


        pairs_data.append({
            'simulant_id_1': record1['simulant_id'],
            'simulant_id_2': record2['simulant_id'],
            'full_name_1': record1['full_name'],
            'full_name_2': record2['full_name'],
            'address_1': record1['address'],
            'address_2': record2['address'],
            'city_1': record1['city'],
            'city_2': record2['city'],
            'dob_1': record1['date_of_birth'],
            'dob_2': record2['date_of_birth'],
            'sex_1': record1['sex'],
            'sex_2': record2['sex'],
            'state_1': record1['state'],
            'state_2': record2['state'],
            'race_1': record1['race_ethnicity'],
            'race_2': record2['race_ethnicity'],
            'name_similarity': name_similarity,
            'address_similarity': address_similarity,
            'city_similarity': city_similarity,
            'match': match
        })

pairs_df = pd.DataFrame(pairs_data)

print(pairs_df.head().to_markdown(index=False))

|   simulant_id_1 |   simulant_id_2 | full_name_1   | full_name_2        | address_1         | address_2            | city_1   | city_2   | dob_1               | dob_2               | sex_1   | sex_2   | state_1   | state_2   | race_1   | race_2   |   name_similarity |   address_similarity |   city_similarity |   match |
|----------------:|----------------:|:--------------|:-------------------|:------------------|:---------------------|:---------|:---------|:--------------------|:--------------------|:--------|:--------|:----------|:----------|:---------|:---------|------------------:|---------------------:|------------------:|--------:|
|         0_14147 |             0_6 | Clara Amaya   | Elan Alonso Tellez | 6877 prospect ave | 1501 interlake ave n | Anytown  | Anytown  | 2002-05-20 00:00:00 | 2013-07-30 00:00:00 | Female  | Male    | WA        | WA        | Latino   | Latino   |                28 |                   38 |               100 |       0 |
|         0_14147 |          0_

In [14]:
print('original shape', df.shape)
print('new shape', pairs_df.shape)
print('this took 70min')

original shape (4279, 19)
new shape (9152781, 20)
this took 70min


In [16]:
pairs_df.to_parquet('decennial_census_pairs.parquet')