In [50]:
import dspy
from datasets import load_dataset, Dataset
from dspy.datasets import DataLoader
from src.logger import logger
import json
import random
from dotenv import load_dotenv, find_dotenv
import os
import openai
import jellyfish
from itertools import combinations

In [36]:
_ = load_dotenv(find_dotenv())
os.environ['OPENAI_API_KEY'] = os.environ['OPENAI_API_KEY']
openai.api_key = os.environ['OPENAI_API_KEY']

In [39]:
lm = dspy.LM('openai/gpt-4o-mini', api_key=openai.api_key)

In [3]:
dataset = DataLoader().from_csv(file_path='../resources/NC_Voters_5M/**')

In [4]:
random.seed(42)
random.shuffle(dataset)

n = len(dataset)
train_end = int(0.8 * n)
dev_end = train_end + int(0.1 * n)

trainset = dataset[:train_end]
devset = dataset[train_end:dev_end]
testset = dataset[dev_end:]

print(f"Train size: {len(trainset)}")
print(f"Dev size: {len(devset)}")
print(f"Test size: {len(testset)}")

Train size: 4000000
Dev size: 500000
Test size: 500000


In [5]:
example = trainset[0]

In [6]:
print(example.keys())

['recid', 'givenname', 'surname', 'suburb', 'postcode']


In [7]:
print(f'Given name: {example.givenname}')
print(f'Surname: {example.surname}')
print(f'Suburb: {example.suburb}')

Given name: aysha
Surname: mohammed
Suburb: matthews


## Candidate Record Pairs (Blocking)

In [53]:
from collections import defaultdict
from difflib import SequenceMatcher

def jaro_winkler_name_similarity(a, b):
    name1 = f"{a['givenname']} {a['surname']}".lower()
    name2 = f"{b['givenname']} {b['surname']}".lower()
    return jellyfish.jaro_similarity(name1, name2)

def generate_similar_pairs(records, threshold=0.9):
    candidate_pairs = []
    for r1, r2 in combinations(records, 2):
        score = jaro_winkler_name_similarity(r1, r2)
        if score >= threshold:
            candidate_pairs.append((r1, r2, score))
    return candidate_pairs

In [54]:
pairs = generate_similar_pairs(trainset, threshold=0.88)

KeyboardInterrupt: 

In [19]:
# Get a small sample
from itertools import islice
blocks_sm = dict(islice(blocks.items(), 5))
print(len(blocks_sm))

5


In [20]:
candidate_pairs = generate_candidate_pairs(blocks_sm)

In [26]:
candidate_pairs_sm = candidate_pairs[:100]
print(len(candidate_pairs_sm))

100


## Define DSPy Signature
This will be a classification task as we will give two records and it will predict "yes" if they are the same person and "no" otherwise.

In [43]:
from typing import Literal

class MatchVoters(dspy.Signature):
    """Decide whether two voter records are the same person"""
    record_1: str = dspy.InputField()
    record_2: str = dspy.InputField()
    is_match: Literal["yes", "no"] = dspy.OutputField()

In [None]:
# Build DSPy Module

In [41]:
dspy.configure(lm=lm)

In [46]:
predictor = dspy.ChainOfThought(MatchVoters)
a, b = candidate_pairs_sm[1]
print(predictor(record_1=str(a), record_2=str(b)))

Prediction(
    reasoning="The two records represent different individuals. The first record has the given name 'aysha' and surname 'mohammed', while the second record has the given name 'paul' and surname 'slabaugh'. Although both records share the same suburb 'matthews' and postcode '28105', the differing names indicate that they are not the same person.",
    is_match='no'
)
