In [21]:
import copy
import glob
import io
import json
import pickle
import time
from collections import Counter

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from tqdm import tqdm

# Matching

In [2]:
# Read one name per line from the Westlaw list and then the traffic-incident
# list; each line is stripped and capitalized before being collected.
names = []
for names_path in ('./westlaw/westlaw_names.txt', './traffic/incident_names.txt'):
    with open(names_path, 'r', encoding='utf-8') as f:
        names.extend(line.strip().capitalize() for line in f)

# Node table loaded from the processed-data directory.
kyc = pd.read_parquet('../data/processed/nodes.parquet')

In [7]:
def find_best_matches(name, ref_db, scorer, limit=10):
    """Return the entries of ``ref_db`` that tie for the top score against ``name``.

    Args:
        name: Query string to look up.
        ref_db: Candidate strings, handed to fuzzywuzzy as the choices.
        scorer: fuzzywuzzy scoring function, e.g. ``fuzz.ratio``.
        limit: Maximum number of tied matches to return (default 10);
            ``None`` returns every tie.

    Returns:
        A list of fuzzywuzzy result tuples whose second element is the score,
        all sharing the highest score found, in the order the candidates
        appear in ``ref_db``. Empty when fuzzywuzzy yields no candidates.
    """
    # Score every candidate exactly once; calling extractOne followed by
    # extractBests would run the scorer over the whole reference list twice.
    scored = list(process.extractWithoutOrder(name, ref_db, scorer=scorer))
    if not scored:
        # extractOne would return None here and indexing it would raise.
        return []
    best = max(match[1] for match in scored)
    return [match for match in scored if match[1] >= best][:limit]

In [8]:
# Lowercase both name lists before comparing them.
internet_names = [raw.lower() for raw in names]
kyc_names = [raw.lower() for raw in kyc.name]


def _best_matches_by_name(queries, ref_db, scorer):
    """Map every best-matching reference name to the set ``{query, score}``.

    When several queries hit the same reference name, the last query wins.
    """
    found = {}
    for query in tqdm(queries):
        for match in find_best_matches(query, ref_db, scorer):
            # Deliberately a set holding the query string and its score.
            found[match[0]] = {query, match[1]}
    return found


# Finding matches
matches = _best_matches_by_name(internet_names, kyc_names, fuzz.ratio)

matches_sort_ratio = _best_matches_by_name(internet_names, kyc_names, fuzz.token_sort_ratio)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1252/1252 [44:52<00:00,  2.15s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 1252/1252 [1:22:22<00:00,  3.95s/it]


In [31]:
def set_to_dict(s: set):
    """Turn a ``{query, score}`` set into a ``{'query': ..., 'score': ...}`` dict.

    Integer members are stored under ``'score'``; every other member is stored
    under ``'query'``. If several members map to the same key, the one
    iterated last is kept.
    """
    return {
        ('score' if isinstance(member, int) else 'query'): member
        for member in s
    }

In [32]:
# Convert each {query, score} set into a {'query': ..., 'score': ...} dict.
# No deepcopy is needed: each comprehension builds a brand-new dict and
# set_to_dict only reads its argument, so `matches` and
# `matches_sort_ratio` are left untouched and this cell stays re-runnable.
m = {n: set_to_dict(s) for n, s in matches.items()}

msort = {n: set_to_dict(s) for n, s in matches_sort_ratio.items()}

In [33]:
# Write both converted match dicts to temporary JSON files.
for out_path, payload in (('m_temp.json', m), ('msort_temp.json', msort)):
    with open(out_path, 'w') as f:
        json.dump(payload, f)

# Cleaning

In [41]:
# Lowercase names to drop from the match results.
names_to_remove = [
    'jeffrey wood',
    'edward walker',
    'gabrielle harper',
    'james hayes',
    'christy ford',
    'jeffery richards',
    'kara rodriguez',
    'joseph nicholson',
    'phillip lang',
    'frank johnson',
    'dr.keith fleming',
    'steven thomas',
    'joshua harvey',
    'carlos rodriguez',
    'robert brewer',
    'heather navarro',
    'christy miller',
    'christopher hall',
    'david paré',
    'pedro ramirez',
    'brandon stephenson',
    'kevin oneal',
    'lisa peterson',
    'victoria cuevos',
    'rose jackson',
]

In [48]:
SCORE_THRESHOLD = 90  # keep only matches scoring strictly above this value

# Keep the high-scoring matches from each scorer, then merge the two dicts;
# on a shared key the entry from `msort` replaces the one from `m`.
m = {n: v for n, v in m.items() if v['score'] > SCORE_THRESHOLD}
msort = {n: v for n, v in msort.items() if v['score'] > SCORE_THRESHOLD}
merged = m | msort

# Set membership avoids rescanning the whole removal list for every entry.
excluded_queries = set(names_to_remove)
merged = {n: v for n, v in merged.items() if v['query'] not in excluded_queries}
merged = dict(sorted(merged.items(), key=lambda item: item[1]['query']))

# I only want to keep queries with multiple matches
queries = [v['query'] for v in merged.values()]
# Count once up front instead of calling queries.count() per element (O(n^2)).
query_counts = Counter(queries)
duplicates = [q for q in queries if query_counts[q] > 1]

# Filter and export the json
duplicate_queries = set(duplicates)
merged = {k: v for k, v in merged.items() if v['query'] in duplicate_queries}
with open('multimatch.json', 'w') as f:
    json.dump(merged, f)

In [30]:
# NOTE(review): this looks like a leftover scratch cell — confirm it can be deleted.
# If `m` already holds {'query', 'score'} dicts when this runs, set_to_dict
# iterates over their keys rather than over {query, score} sets.
m = {n: set_to_dict(s) for n,s in m.items()}
# Rebinds `m` to an empty list, discarding the dict built on the previous line.
m = []

{'andrea francisca rangel verduzco': {'query': 'andrew stanley paul randy larry chickite curtis michael blaney ralph edward wilson',
  'score': 44},
 'john hooper': {'score': 75, 'query': 'john a. hofer'},
 'john hoover': {'score': 75, 'query': 'john a. hofer'},
 'paul francis': {'score': 73, 'query': 'paul francis smallboy'},
 'dr.vincent castillo': {'query': 'irvine scalplock', 'score': 63},
 'barbara payne phd': {'score': 59, 'query': 'bear paw pawn ltd.'},
 'armaan lad': {'score': 59, 'query': 'bear paw pawn ltd.'},
 'louis chartrand': {'query': 'gloria chartrand', 'score': 84},
 'shawn lawrence md': {'score': 80, 'query': 'shawn lawrence lamouche'},
 'jason gallagher': {'query': 'jan alsager', 'score': 77},
 'edwin johnson': {'score': 65, 'query': 'edwin j. naytowhow'},
 'anthony snow': {'query': 'anthony naytowhow', 'score': 76},
 'anthony nash': {'query': 'anthony naytowhow', 'score': 76},
 'dr.alexander camacho': {'query': 'lloyd alexander mcmahon', 'score': 79},
 'william blan

In [None]:
CONTEXT_CHARS = 100  # characters of context kept on each side of a name mention

scraped = pd.read_csv('./traffic/incidents_scraped.csv')

# NOTE(review): the loop below needs `names` to be a DataFrame with 'name',
# 'match' and 'score' columns, but the loading cell near the top of the
# notebook binds `names` to a plain list, which has no .iterrows(). Confirm
# where that DataFrame is meant to be built before relying on Run All.

# check articles
results = {}
for index, row in names.iterrows():
    case_name = row['name']
    # regex=False: the name is literal text, so characters such as '.' must
    # not be interpreted as regex metacharacters.
    hits = scraped[
        scraped['scrape_text'].str.contains(case_name, case=False, na=False, regex=False)
    ]
    if hits.empty:
        continue

    first_text = hits.iloc[0]['scrape_text']

    # context
    # The haystack is lowercased, so the needle must be lowercased as well;
    # a failed find() (-1) is clamped to 0 so the slice stays valid.
    start_index = max(first_text.lower().find(case_name.lower()), 0)
    start = max(start_index - CONTEXT_CHARS, 0)
    end = min(start_index + len(case_name) + CONTEXT_CHARS, len(first_text))

    results[row['match']] = {
        'case_name': case_name,
        'case_name_score': row['score'],
        'case_name_context': first_text[start:end],
        'sources': hits['Primary Source'].to_list(),
    }

In [None]:
# Save the collected per-match metadata as indented JSON.
with open('names_metadata.json', 'w') as out_file:
    json.dump(results, fp=out_file, indent=4)

# Removing LE (law enforcement) names

In [None]:
# The 'case_name_context' key in results gives a peek at the 100 characters before and the 100 characters after the point where the name was mentioned.
# We went through these to make sure that the names were not those of attorneys general, officers, etc.
# Though we tried to do this when sanitizing the LLM output, some names still slipped through.

# DON'T RUN THIS CELL IF YOU DON'T WANT TO REMOVE THESE NAMES!


# Build a filtered copy of `results` without the manually reviewed names.
# Names that are not present in `results` are simply skipped, so no
# try/except is needed (a bare `try:` with no handler is a SyntaxError).
removed = set(names_to_remove)
results_copy = {name: meta for name, meta in results.items() if name not in removed}

# Overwrites the metadata file with the filtered version.
with open('names_metadata.json', 'w') as file:
    json.dump(results_copy, file, indent=4)