In [2]:
import pandas as pd
import json

In [3]:
# Load the name/match pairs, keep only those scoring at least 90,
# and collapse repeated (name, match) combinations to a single row.
names = (
    pd.read_parquet('matches.parquet')
    .loc[lambda frame: frame['score'] >= 90]
    .drop_duplicates(['name', 'match'])
)

In [14]:
# display the filtered (score >= 90), de-duplicated name/match rows
names

Unnamed: 0,name,match,score,ratio,source
28,terry daniels,daniel terry,96,fuzz.ratio,
34,james lewis,james lewis,100,fuzz.ratio,
39,richard austin.,richard austin,100,fuzz.ratio,
54,jeffrey h. wood,dr.jeffrey wood,90,fuzz.ratio,
55,edward o. walker,edward walker,93,fuzz.ratio,
...,...,...,...,...,...
1188,gérald thériault,gérard thériault,93,fuzz.ratio,
1196,kiem tran,kim tran,94,fuzz.ratio,
1200,liang chen,chen liang,100,fuzz.ratio,
1236,brad stephenson,brandon stephenson,91,fuzz.ratio,


In [4]:
# scraped incident articles; later cells read the 'scrape_text' and 'Primary Source' columns
scraped = pd.read_csv('incidents_scraped.csv')

In [39]:
# check articles
# For every row in `names`, look for a literal, case-insensitive mention of
# row['name'] in the scraped article text. When at least one article mentions
# it, record (keyed by row['match']):
#   case_name          - the name that was searched for
#   case_name_score    - the fuzzy-match score of the name/match pair
#   case_name_context  - a snippet of the first matching article around the mention
#   sources            - the 'Primary Source' values of all matching articles
# NOTE: results is keyed by row['match'], so when several names share the same
# match, the last one processed overwrites the earlier entries.

CONTEXT_CHARS = 100  # characters of context kept on each side of the mention

# Lower-case the article text once instead of on every loop iteration.
# Missing (NaN) texts stay NaN and are excluded below via na=False.
scrape_text_lower = scraped['scrape_text'].str.lower()

results = {}
for _, row in names.iterrows():
    name = row['name']
    if not isinstance(name, str) or not name.strip():
        # a missing name would raise, and a blank one would match almost every article
        continue
    needle = name.lower()

    # regex=False: names such as 'jeffrey h. wood' contain '.', which would
    # otherwise act as a regex wildcard (and characters like '(' would raise).
    mentions = scrape_text_lower.str.contains(needle, regex=False, na=False)
    hits = scraped[mentions]
    if hits.empty:
        continue

    # context
    first_text = hits.iloc[0]['scrape_text']
    # Search the same lower-cased text with the same lower-cased needle that
    # produced the hit, so the offset is always found (never -1).
    # Caveat: the offset is applied to the original text; the rare characters
    # whose lower-case form has a different length could shift it slightly.
    start_index = first_text.lower().find(needle)
    start = max(start_index - CONTEXT_CHARS, 0)
    end = min(start_index + len(needle) + CONTEXT_CHARS, len(first_text))

    results[row['match']] = {
        'case_name': name,
        'case_name_score': row['score'],
        'case_name_context': first_text[start:end],
        'sources': hits['Primary Source'].to_list(),
    }

In [40]:
# display the collected metadata, keyed by the matched name
results

{'daniel terry': {'case_name': 'terry daniels',
  'case_name_score': 96,
  'case_name_context': '. Eagle parts trafficking case hears sentencing arguments Visit CBC Aboriginal for more top stories Terry Daniels and her brother Harlin were fined a combined $8,500 — $7,000 and $1,500, respectively — for illegal',
  'sources': ['https://www.cbc.ca/news/canada/calgary/eagle-parts-trafficking-case-nets-stoney-nakoda-siblings-8-5k-fine-1.2666372']},
 'james lewis': {'case_name': 'james lewis',
  'case_name_score': 100,
  'case_name_context': '18 Share Facebook X LinkedIn Email For Immediate Release Office of Public Affairs Joseph Kelley and James Lewis were each indicted in Newark, New Jersey, with crimes related to illegally trafficking juvenile Ame',
  'sources': ['https://www.justice.gov/opa/pr/two-men-indicted-illegally-trafficking-american-eels']},
 'richard austin': {'case_name': 'richard d. austin',
  'case_name_score': 93,
  'case_name_context': 'ay, October 26, 2017 Share Facebook X

In [41]:
# write the collected metadata to disk as indented JSON
with open('names_metadata.json', 'w') as out_file:
    out_file.write(json.dumps(results, indent=4))

In [42]:
# Manual review step: the context key in results gives a peek at the 100 chars
# before and the 100 chars after the place where the name was mentioned.
# We read through these snippets to make sure the names did not belong to
# attorneys general, officers, etc. We also tried to filter those out when
# sanitizing the LLM output, but some names still slipped through.

# DON'T RUN THIS CELL IF YOU DON'T WANT TO REMOVE THESE NAMES!

names_to_remove = [
    'dr.jeffrey wood',
    'edward walker',
    'gabrielle harper',
    'james hayes',
    'christy ford',
    'jeffery richards',
    'kara rodriguez',
    'jerome nicholson',
    'phillip lang',
    'frank johnson',
    'dr.keith fleming',
    'steven thomas',
    'joshua harvey',
    'carlos rodriguez',
    'robert brewer',
    'heather navarro',
    'christy miller',
    'christopher hale',
    'david paré',
    'pedro ramirez',
    'brandon stephenson',
]

# A shallow copy is enough: only top-level keys are dropped, so `results`
# itself keeps every entry.
results_copy = dict(results)
for rejected in names_to_remove:
    # pop() without a default raises KeyError when the name is not in results
    results_copy.pop(rejected)

# Overwrites names_metadata.json with the filtered entries.
with open('names_metadata.json', 'w') as out_file:
    out_file.write(json.dumps(results_copy, indent=4))