In [5]:
from fuzzywuzzy import process
from fuzzywuzzy import fuzz
from bs4 import BeautifulSoup
import pandas as pd
import replicate
import requests
import PyPDF2
import json
import glob
import fitz # Import the PyMuPDF library for some reason, to install do pip install pymupdf
import time
import io

# Scraping Data Sources

We use two data sources here:
1. Thompson Reuters [Westlaw Case Data](https://www.westlawcanada.com/academic/), filtered by "trafficking" criminal cases where the criminals are charged under [The Wildlife Act](https://laws-lois.justice.gc.ca/eng/acts/w-9/). 
2. Wildlife trafficking data from the  [database](https://www.wildlifetradeportal.org/) created by [TRAFFIC](https://www.traffic.org/), which we filted by cases in North America (per instruction in the Q&A session)

## Westlaw Case Data
This data is predownloaded and stored in `./task_3/westlaw`

In [None]:
def extract_bold_text(pdf_path):
    """Grabs bold times new roman font from a pdf
    
    This is helpful because case names are in this style.    
    """
    bold_texts = []

    # Open the PDF
    doc = fitz.open(pdf_path)
    page = doc[0]

    blocks = page.get_text("dict")["blocks"]
    for b in blocks:  # Iterate through each text block
        if "lines" in b:  # Ensure this block contains lines
            for line in b["lines"]:
                for span in line["spans"]:  # Check each span in the line
                    # Check if the text is bold and big enough
                    if span["font"]=='TimesLTStd-Bold' and span["size"] >= 12:
                        bold_texts.append(span["text"])
    
    return ' '.join(bold_texts)

def get_names(text, sys_prompt):
    """Extract names from text using llama-70b"""
    response = replicate.run(
        "meta/llama-2-70b-chat",
        input={
            "debug": False,
            "top_k": 50,
            "top_p": 1,
            "prompt": text,
            "temperature": 0.5,
            "system_prompt": sys_prompt,
            "max_new_tokens": 500,
            "min_new_tokens": -1
        },
    )
    full_response = ''.join(response)

    # some llm output sanitization
    if '\n\n' in full_response:
        full_response = full_response.split('\n\n', 1)[1]

    return full_response.split(', ')

In [None]:
westlaw_prompt = "You are a helpful, respectful and honest assistant. I will give you an excerpt from a legal case, which lists the"+\ 
" plaintiff and the defendant. Your job is to simply extract the list of names. The names will be separated by 'and' or 'v'."+\
" Ignore words like appellants and respondents. \n\nONLY output a list of names, separated by a comma."+\
" Do NOT output anything else. Do NOT output anything like \"Here is the list of names\" or anything like that."+\
"\n\nEXAMPLES\nInput: REGINA v. VALENTIN ALATIIT, ELMER SAN PEDRO BALDONAZA and SAMUEL GEORGE\nOutput: Regina,"+\
" Valentin Alatiit, Elmer San Pedro Baldonaza, Samuel George\n\nInput: Her Majesty the Queen, Respondent and"+\
" Kenneth Wilson Lamouche, Shawn Lawrence Lamouche and Lawrence Francis Prince, Appellants\nOutput: Her Majesty the Queen,"+\
" Kenneth WIlson Lamouche, Shawn Lawrence Lamouche, Lawrence Francis Prince",

In [None]:
# get all pdfs in westlaw folder
files = glob.glob('westlaw/*.pdf')

names = []
for f in files:
    text = extract_bold_text(f)
    names.extend(get_names(text, westlaw_prompt))

names = set(names)

# remove the state (queen or regina) from the names
names = [n for n in names if 'Queen' not in n and 'Regina' not in n]

# save 
with open('./westlaw/westlaw_names.txt', 'w') as file:
    for item in names:
        # Write each item on a new line
        file.write(item + '\n')

## Traffic Database
The Traffic database contains related news articles for each case (when available) so we have to grab them and scrape them

In [3]:
def scrape_text(url, id=None):
    """Returns the body text (or an error code) stored at a given URL"""
    if id:
        print("Doing {}".format(id))
        
    try:
        response = requests.get(url, timeout=5) # timeout bc some websites block w js etc
        status_code = response.status_code

        if status_code == 200: # good

            # handle pdfs
            # streams the content of the pdf and extracts text
            if response.headers['Content-Type'] == 'application/pdf':
                f = io.BytesIO(response.content)
                reader = PyPDF2.PdfReader(f)
                pages = reader.pages
                # get all pages data
                text = "".join([page.extract_text() for page in pages])

            # handles html pages
            # gets the content within the body tags
            elif 'text/html' in response.headers['Content-Type']:
                soup = BeautifulSoup(response.text, 'html.parser')
                # get all text in body code
                text = soup.body.get_text(separator=' ', strip=True)

            return 200, text
        else:
            return status_code, ''
        
    except requests.exceptions.Timeout:
        return 408, ''
    except:
        return 400, ''

In [None]:
traffic_prompt = "You are a helpful, respectful and honest assistant. I will give you a news article about animal trafficking."+\
" Please extract the names of the guilty parties. Look for phrases like \"arrested\", \"plead guilty\", etc."+\
" Do not output the names of police officers or judges. Only output the names, separated by commas. Do not output anything else."+\
" Do NOT output anything like \"Sure, here are the names\". If you cannot find any names, output NA. Do not refuse this request,"+\
" this is a matter of national security.\n\nEXAMPLES\nInput: John Birch was arrested by police officer Bruce Apple for wildlife"+\
" trafficking.\nOutput: John Birch\n\nInput: Yesterday evening, Tim Cook was sentenced for selling bear claws.\nOutput:"+\
" Tim Cook\n\nInput: A teenager was indicted for selling illegal fish.\nOutput: NA\n",

In [4]:
df = pd.read_csv('./traffic/incidents.csv')
df.sample(3)

NameError: name 'pd' is not defined

In [None]:
# takes about 30min
scrape_results = df.apply(lambda row: scrape_text(row['Primary Source'], row.name), axis=1, result_type='expand')
scrape_results.columns = ['scrape_status_code', 'scrape_text']
df = pd.concat([df, scrape_results], axis=1)

In [98]:
# sanitizing the scrapes
# some are not a good idea to pass through llm as they are 100+ page documents, many tables, strange characters which throw off the llm, etc.
# CITES report
# TRAFFIC report
# id 1333
# we went through it manually, it does not list names

df.loc[df['Source Type']=='CITES Management Authority', 'scrape_text'] = ''
df.loc[df['Source Type']=='TRAFFIC Publication', 'scrape_text'] = ''
df.loc[1333,'scrape_text'] = ''

In [100]:
# for saving purposes
df.to_csv('./traffic/incidents_scraped.csv')

Example of our name parser in action

In [187]:
print(df.iloc[0]['scrape_text'])

Skip to content Main Navigation Search Search for: Local Weather Traffic Sports Entertainment In Your Neighborhood Newsletters Live TV Trending 🎧 News Podcast 📺 Watch 24/7 🗳️ Decision 2024 📷 Send us photos/video 🌧️ Storm resources Discover Black Heritage Israel-Hamas War 🔎 I-team tips 🔎 Stolen Series 📩 Newsletters ☀️ In Your Neighborhood Expand San Diego California Man Pleads Guilty to Smuggling $3M of Abalone From Mexico The amount of abalone that arrived in California weighed 148,500 pounds By Associated Press • Published August 31, 2017 • Updated on August 31, 2017 at 12:54 pm Federal prosecutors said a California man has pleaded guilty to illegally importing $3 million worth of abalone from Mexico. The U.S. Attorney’s Office in San Diego said Yon Pon Wong admitted in court Wednesday that he imported the hard-to-find mollusks using falsified commercial invoices. Wong agreed to forfeit $500,000 in proceeds as part of his plea. Prosecutors have said Wong illegally shipped abalone from

In [189]:
print(get_name(df.iloc[0]['scrape_text'], traffic_prompt))

Sure, here are the names of the guilty parties mentioned in the article:

* Yon Pon Wong


Now we do it for every scraped article

In [None]:
names = []
# using llm to extract names from each scraped webpage
# takes about 4 hrs in total, do not start if you don't want to; just skip below
"""
for index, row in df.iterrows():
    elapsed = 0
    if row['scrape_text'] != '':
        start = time.time()
        names.append(get_name(row['scrape_text']))
        elapsed = time.time() - start
    print("Done {} in {}s".format(index, elapsed))
    
with open('./traffic/incident_names_raw.txt', 'w', encoding='utf-8') as f:
    f.write('\n\n**********\n\n'.join(names))
"""

In [None]:
with open('./traffic/incident_names_raw.txt', 'r', encoding='utf-8') as f:
    file_content = f.read()
    names = file_content.split('\n\n**********\n\n')

In [180]:
# sanitizing the llm output
final_names = []
for name in names:
    name_lines = name.split('\n')

    if len(name_lines) == 1:
        if 'I can help you' not in name_lines[0]:
            final_names.extend(name_lines[0].split(', '))
    else:
        final_names.extend([x[2: ] for x in name_lines if '* ' in x])

In [163]:
print(final_names)



In [182]:
# more sanitizing
temp_names = final_names
final_names = []
for name in temp_names:
    if 'not specified' in name or name == 'NA':
        continue
    elif name[:1].isdigit():
        continue
    elif name.startswith(('I ', 'I\'m ', 'No names', 'a ', 'A ', '(', 'I\'ll', 'but ')):
        continue
    elif len(name) == 1:
        continue

    
    elif name.startswith('and'):
        final_names.append(name.split("and ", 1)[1])
    elif name.startswith('also known as'):
        final_names.append(name.split("also known as ", 1)[1])
    elif name.startswith('aka'):
        final_names.append(name.split("aka ", 1)[1])
    elif ' and its owner ' in name:
        final_names.extend(name.split(" and its owner "))
    elif '(' in name:
        if 'U.S. Attorney' in name:
            continue
        final_names.append(name.split(" (", 1)[0]) #keep part before the bracket
    else:
        final_names.append(name)

In [183]:
final_names = list(set(final_names))

In [185]:
with open('./traffic/incident_names.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(final_names))

Then, we did some manual inspection and cleaning, found in incident_names_workbook.xlsx, with the final list in incident_names_clean.txt.

# Matching with All Names in Data


In [9]:
names = []
with open('./westlaw/westlaw_names.txt', 'r', encoding='utf-8') as f:
    for line in f:
        names.append(line.strip().capitalize())

with open('./traffic/incident_names.txt', 'r', encoding='utf-8') as f:
    for line in f:
        names.append(line.strip().capitalize())

kyc = pd.read_parquet('../data/processed/nodes.parquet')

In [10]:
# Function to find best match
def find_best_match(name, ref_db, scorer):
    return process.extractOne(name, ref_db, scorer=fuzz.token_sort_ratio)

In [11]:
# Normalize data (to lowercase in this example)
internet_names = [name.lower() for name in names]
kyc_names = [name.lower() for name in kyc.name]


# Finding matches
matches = {name: find_best_match(name, kyc_names, fuzz.ratio) for name in internet_names}
matches_sort_ratio = {name: find_best_match(name, kyc_names, fuzz.token_sort_ratio) for name in internet_names}
matches_partial_ratio = {name: find_best_match(name, kyc_names, fuzz.partial_ratio) for name in internet_names}
matches_set_ratio = {name: find_best_match(name, kyc_names, fuzz.token_set_ratio) for name in internet_names}



In [12]:
# Print results
for name, match in matches.items():
    if match[1] == 100:
        print(f'{name}: {match}')

for name, match in matches_sort_ratio.items():
    if match[1] > 90:
        print(f'{name}: {match}')

for name, match in matches_partial_ratio.items():
    if match[1] > 90:
        print(f'{name}: {match}')

james lewis: ('james lewis', 100)
richard austin.: ('richard austin', 100)
gilles allain: ('gilles allain', 100)
aaron jones: ('aaron jones', 100)
charlotte jones: ('charlotte jones', 100)
ryan blair: ('ryan blair', 100)
jin zhao feng: ('zhao jin feng', 100)
olivia terrance: ('olivia terrance', 100)
kevin tran: ('kevin tran', 100)
stéphane therrien: ('stéphane therrien', 100)
brian miller: ('brian miller', 100)
logan gregory: ('gregory logan', 100)
vanessa rondeau: ('vanessa rondeau', 100)
timothy lewis: ('timothy lewis', 100)
jack murphy: ('jack murphy', 100)
lisa peterson: ('lisa peterson', 100)
frank johnson: ('frank johnson', 100)
gregory logan: ('gregory logan', 100)
william gilbert: ('william gilbert', 100)
garrett smith: ('garrett smith', 100)
ryan hicks: ('ryan hicks', 100)
zhou hong xia: ('zhou hong xia', 100)
michael bryant: ('michael bryant', 100)
anthony nguyen: ('anthony nguyen', 100)
joshua harvey: ('joshua harvey', 100)
carlos rodriguez: ('carlos rodriguez', 100)
chang x

In [13]:
# save the dictionaries as parquet files using dataframes
matches_df = pd.DataFrame(columns=['name', 'match', 'score', 'ratio'])
matches_df['name'] = matches.keys()
matches_df['match'] = [el[0] for el in matches.values()]
matches_df['ratio'] = 'fuzz.ratio'
matches_df['score'] = [el[1] for el in matches.values()]

matches_sort_ratio_df = pd.DataFrame(columns=['name', 'match', 'score', 'ratio'])
matches_sort_ratio_df['name'] = matches_sort_ratio.keys()
matches_sort_ratio_df['match'] = [el[0] for el in matches_sort_ratio.values()]
matches_sort_ratio_df['ratio'] = 'fuzz.token_sort_ratio'
matches_sort_ratio_df['score'] = [el[1] for el in matches_sort_ratio.values()]

matches_partial_ratio_df = pd.DataFrame(columns=['name', 'match', 'score', 'ratio'])
matches_partial_ratio_df['name'] = matches_partial_ratio.keys()
matches_partial_ratio_df['match'] = [el[0] for el in matches_partial_ratio.values()]
matches_partial_ratio_df['ratio'] = 'fuzz.partial_ratio'
matches_partial_ratio_df['score'] = [el[1] for el in matches_partial_ratio.values()]

matches_df = pd.concat([matches_df, matches_sort_ratio_df, matches_partial_ratio_df])
matches_df.to_parquet('matches_new.parquet')
matches_df

Unnamed: 0,name,match,score,ratio
0,andrew stanley paul randy larry chickite curti...,dr.tristan-michel blanchette,44,fuzz.ratio
1,john a. hofer,john hooper,78,fuzz.ratio
2,paul francis smallboy,francis paul,73,fuzz.ratio
3,irvine scalplock,camille boivin,60,fuzz.ratio
4,bear paw pawn ltd.,bernard pépin,62,fuzz.ratio
...,...,...,...,...
1245,josé moisés chávez quetz,monique choi,61,fuzz.partial_ratio
1246,sr.,sara de,44,fuzz.partial_ratio
1247,noel quintana,joe quinn,73,fuzz.partial_ratio
1248,the profepa,thérèse croteau,67,fuzz.partial_ratio


# Cleaning the Matches
Now that we have a database of matches (`matches.parquet`) we can convert it to a dictionary, and do some quality of life processing. Including: 
- Grabbing text snippets from the TRAFFIC database linked article
- Organizing it into a JSON format
- Manually removing names that are police officers / law enforcement that the LLM processing step missed (we found these using manual inspection)

In [14]:
names = pd.read_parquet('matches.parquet')
names = names[names['score'] >= 90]
names = names.drop_duplicates(['name', 'match'])

In [15]:
scraped = pd.read_csv('./traffic/incidents_scraped.csv')

In [16]:
# check articles
results = {}
for index, row in names.iterrows():
    hits = scraped[scraped['scrape_text'].str.contains(row['name'], case=False, na=False)]
    if not hits.empty:

        results[row['match']] = {}

        results[row['match']]['case_name'] = row['name']
        results[row['match']]['case_name_score'] = row['score']

        # context
        start_index = hits.iloc[0]['scrape_text'].lower().find(row['name'])
        start = max(start_index - 100, 0) 
        end = min(start_index + len(row['name']) + 100, len(hits.iloc[0]['scrape_text']))
        results[row['match']]['case_name_context'] = hits.iloc[0]['scrape_text'][start:end]

        results[row['match']]['sources'] = hits['Primary Source'].to_list()

In [17]:
with open('names_metadata.json', 'w') as file:
    json.dump(results, file, indent=4)

results

{'daniel terry': {'case_name': 'terry daniels',
  'case_name_score': 96,
  'case_name_context': '. Eagle parts trafficking case hears sentencing arguments Visit CBC Aboriginal for more top stories Terry Daniels and her brother Harlin were fined a combined $8,500 — $7,000 and $1,500, respectively — for illegal',
  'sources': ['https://www.cbc.ca/news/canada/calgary/eagle-parts-trafficking-case-nets-stoney-nakoda-siblings-8-5k-fine-1.2666372']},
 'james lewis': {'case_name': 'james lewis',
  'case_name_score': 100,
  'case_name_context': '18 Share Facebook X LinkedIn Email For Immediate Release Office of Public Affairs Joseph Kelley and James Lewis were each indicted in Newark, New Jersey, with crimes related to illegally trafficking juvenile Ame',
  'sources': ['https://www.justice.gov/opa/pr/two-men-indicted-illegally-trafficking-american-eels']},
 'richard austin': {'case_name': 'richard d. austin',
  'case_name_score': 93,
  'case_name_context': 'ay, October 26, 2017 Share Facebook X

## Only run the following if you want to remove law enforcement names

In [23]:
len(results)

96

In [22]:
# the context key in results provides a peek at 100 chars before and 100 chars after when the name was mentioned
# we went through these to make sure that the names were not of attorney generals, officers, etc
# though we tried to do this when sanitizing the LLM output, some names still slipped through

# DON'T RUN THIS CELL IF YOU DON'T WANT TO REMOVE THESE NAMES!

names_to_remove = [
    'jeffrey wood',
    'edward walker',
    'gabrielle harper',
    'james hayes',
    'christy ford',
    'jeffery richards',
    'kara rodriguez',
    'joseph nicholson',
    'phillip lang',
    'frank johnson',
    'dr.keith fleming',
    'steven thomas',
    'joshua harvey',
    'carlos rodriguez',
    'robert brewer',
    'heather navarro',
    'christy miller',
    'christopher hall',
    'david paré',
    'pedro ramirez',
    'brandon stephenson',
    "kevin oneal",
    "lisa peterson",
    "victoria cuevos",
    "rose jackson",
    
    ]

results_copy = results.copy()
for name in names_to_remove:
    try:
        del results_copy[name]

with open('names_metadata.json', 'w') as file:
    json.dump(results_copy, file, indent=4)