In [1]:
import pandas as pd
from fuzzywuzzy import fuzz
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
from imblearn.over_sampling import SMOTE

In [2]:
df = pd.read_parquet('decennial_census_pairs.parquet')

In [3]:
df.head(1)

Unnamed: 0,simulant_id_1,simulant_id_2,full_name_1,full_name_2,address_1,address_2,city_1,city_2,dob_1,dob_2,sex_1,sex_2,state_1,state_2,race_1,race_2,name_similarity,address_similarity,city_similarity,match
0,0_14147,0_6,Clara Amaya,Elan Alonso Tellez,6877 prospect ave,1501 interlake ave n,Anytown,Anytown,2002-05-20,2013-07-30,Female,Male,WA,WA,Latino,Latino,28,38,100,0


In [4]:

df['dob_1'] = df['dob_1'].astype(str)
df['dob_2'] = df['dob_2'].astype(str)
df['dob_similarity'] = df.apply(lambda row: fuzz.ratio(row['dob_1'], row['dob_2']) / 100, axis=1) # Normalize to 0-1 range
df['state_match'] = np.where(df['state_1'] == df['state_2'], 1, 0)
df['sex_match'] = np.where(df['sex_1'] == df['sex_2'], 1, 0)
df['race_match'] = np.where(df['race_1'] == df['race_2'], 1, 0)
df['match'] = np.where(df['simulant_id_1'] == df['simulant_id_2'], 'M', 'NM')

In [5]:
df['city_similarity'] = df['city_similarity']/100

In [6]:
df.head(1)

Unnamed: 0,simulant_id_1,simulant_id_2,full_name_1,full_name_2,address_1,address_2,city_1,city_2,dob_1,dob_2,...,race_1,race_2,name_similarity,address_similarity,city_similarity,match,dob_similarity,state_match,sex_match,race_match
0,0_14147,0_6,Clara Amaya,Elan Alonso Tellez,6877 prospect ave,1501 interlake ave n,Anytown,Anytown,2002-05-20,2013-07-30,...,Latino,Latino,28,38,1.0,NM,0.6,1,0,1


In [7]:
df.match.value_counts()

match
NM    9150921
M        1860
Name: count, dtype: int64

In [8]:
# Define features (X) and target (y)
features = ['race_match', 'state_match', 'dob_similarity', 'sex_match', 'city_similarity', 'address_similarity', 'name_similarity']
X = df[features]
y = df['match']


In [9]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # Adjust test size as needed

In [10]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [15]:
# Train a Random Forest Classifier
model = RandomForestClassifier(random_state=42, class_weight='balanced', n_estimators=100)  # You can tune hyperparameters here
model.fit(X_train, y_train)

In [16]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           M       1.00      1.00      1.00       366
          NM       1.00      1.00      1.00   1830191

    accuracy                           1.00   1830557
   macro avg       1.00      1.00      1.00   1830557
weighted avg       1.00      1.00      1.00   1830557

Accuracy: 1.0


In [13]:
# Example of making predictions on new data
new_data = pd.DataFrame({
    'race_match': [1, 1],
    'state_match': [1, 1],
    'dob_similarity': [0.95, 0.95],
    'sex_match': [1, 1],
    'city_similarity': [0.9, 0.1],
    'address_similarity': [0.85, 0.05],
    'name_similarity': [0.98, 0.1]
})
predictions = model.predict(new_data)
print("Predictions on new data:", predictions)

Predictions on new data: ['NM' 'NM']
