### imports

In [1]:
import requests

from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import spacy
from spacy_cleaner import processing, Cleaner

### functions

In [2]:
def clean_text(df, col_name):
    
    # Instaiate spacy model
    model = spacy.load('en_core_web_sm')
    
    # Instaiate spacy cleaner
    cleaner = Cleaner( 
        model,
        processing.remove_stopword_token,
        processing.remove_punctuation_token,
        processing.remove_email_token,
        processing.remove_url_token,
        processing.mutate_lemma_token
    )
    
    cleaned_text_list = []
    
    for text in df[col_name]:
        
        if isinstance(text, str):
            text = text.lower()
            text = [text]
        
        if isinstance(text, list):
            cleaned_text = cleaner.clean(text)            
            cleaned_text = ''.join(cleaned_text)
            cleaned_text_list.append(cleaned_text)
        else:
            text = str(text)
            cleaned_text = cleaner.clean(text)
            cleaned_text = ''.join(cleaned_text)
            cleaned_text_list.append(cleaned_text)
            
    
    return cleaned_text_list

### webscrape DOJ press releases by date

In [3]:
# Classification model train/test data
start_date = '04/30/2023'
end_date = '04/30/2024'

base_url = f'https://www.justice.gov/news/press-releases?search_api_fulltext=+&start_date={start_date}&end_date={end_date}&sort_by=field_date'

#### get total pages

In [4]:
website = requests.get(base_url)
soup = BeautifulSoup(website.content, 'html.parser')

pagination = soup.find('ul', {'class': 'usa-pagination__list js-pager__items'})
pages = pagination.findChildren(recursive=False)

max_page_num = 0
# iterate <li> tags to get the max number of pages returned by date values
for i, page in enumerate(pages):
    if i == len(pages) - 1:
        a = page.find('a')['href']
        idx = a.index('page=')
        max_page_num = a[idx:].replace('page=', '')
        max_page_num = int(max_page_num)

#### scrape article summaries

In [5]:
feed_data = []

In [28]:
%%time
for i in range(max_page_num + 1):
    page_url = base_url + '&page=' + str(i)
    page = requests.get(page_url)
    
    soup = BeautifulSoup(page.content, 'html.parser')

    articles = soup.find('div', {'class': 'rows-wrapper'})
    articles = articles.findChildren(recursive=False)

    print(f'Parsing content for {"page " + str(i + 1)} of {max_page_num + 1}')
    for article in articles:
        title = article.find('a').text.strip()

        summary = article.find('p')
        if summary:
            summary = summary.text.strip()
        else:
            summary = np.nan
        
        url = 'https://www.justice.gov/' + article.find('a')['href']
        date = pd.to_datetime(article.find('time')['datetime']).date()

        # Append feed data objects
        feed_data.append({
            'article_title': title,
            'article_summary': summary,
            'article_url': url,
            'date_published': date
        })

In [7]:
df = pd.DataFrame(feed_data)
print(f'dataframe row count: {len(df)}')
df.head()

dataframe row count: 1504


Unnamed: 0,article_title,article_summary,article_url,date_published
0,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29
1,Readout of Pardon Attorney’s Outreach Efforts ...,"During the month of April, which is recognized...",https://www.justice.gov//opa/pr/readout-pardon...,2024-04-29
2,Two Former Missouri Health Care Charity Execut...,"Two former executives of a Springfield, Missou...",https://www.justice.gov//opa/pr/two-former-mis...,2024-04-29
3,Laboratory Marketer and North Carolina Physici...,"Laboratory marketer Thomas Anthony Carnaggio, ...",https://www.justice.gov//opa/pr/laboratory-mar...,2024-04-29
4,Former NSA Employee Sentenced to Over 21 Years...,"Jareh Sebastian Dalke, 32, of Colorado Springs...",https://www.justice.gov//opa/pr/former-nsa-emp...,2024-04-29


In [8]:
df['cleaned_title'] = clean_text(df, 'article_title')

Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 112.99it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 233.71it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 122.27it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 189.26it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 179.05it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 206.74it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 212.47it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 156.64it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 223.21it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 150.26it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 227.89it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 142.73it/s]
Cleaning Progress: 100%|████

In [9]:
df['cleaned_summary'] = clean_text(df, 'article_summary')

Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 123.23it/s]
Cleaning Progress: 100%|██████████████████████████| 1/1 [00:00<00:00, 55.08it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 102.62it/s]
Cleaning Progress: 100%|██████████████████████████| 1/1 [00:00<00:00, 60.06it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 119.19it/s]
Cleaning Progress: 100%|██████████████████████████| 1/1 [00:00<00:00, 81.73it/s]
Cleaning Progress: 100%|██████████████████████████| 1/1 [00:00<00:00, 90.25it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 123.67it/s]
Cleaning Progress: 100%|█████████████████████████| 3/3 [00:00<00:00, 821.71it/s]
Cleaning Progress: 100%|██████████████████████████| 1/1 [00:00<00:00, 95.44it/s]
Cleaning Progress: 100%|██████████████████████████| 1/1 [00:00<00:00, 12.76it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 120.49it/s]
Cleaning Progress: 100%|████

In [11]:
df['cleaned_title_summary'] = df['cleaned_title'] + " " + df['cleaned_summary']

In [12]:
df.drop(columns=['cleaned_title', 'cleaned_summary'], inplace=True)

df.head()

Unnamed: 0,article_title,article_summary,article_url,date_published,cleaned_title_summary
0,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,justice department support competition low pri...
1,Readout of Pardon Attorney’s Outreach Efforts ...,"During the month of April, which is recognized...",https://www.justice.gov//opa/pr/readout-pardon...,2024-04-29,readout pardon attorney outreach effort second...
2,Two Former Missouri Health Care Charity Execut...,"Two former executives of a Springfield, Missou...",https://www.justice.gov//opa/pr/two-former-mis...,2024-04-29,missouri health care charity executive sentenc...
3,Laboratory Marketer and North Carolina Physici...,"Laboratory marketer Thomas Anthony Carnaggio, ...",https://www.justice.gov//opa/pr/laboratory-mar...,2024-04-29,laboratory marketer north carolina physician a...
4,Former NSA Employee Sentenced to Over 21 Years...,"Jareh Sebastian Dalke, 32, of Colorado Springs...",https://www.justice.gov//opa/pr/former-nsa-emp...,2024-04-29,nsa employee sentence 21 year prison attempt e...


### annote training/test dataset

In [14]:
compliance_df = pd.read_csv('compliance words corpus.csv')

In [15]:
compliance_df['cleaned_terms'] = clean_text(compliance_df, 'terms')

Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 374.09it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 401.75it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 288.21it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 324.76it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 258.52it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 345.78it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 338.99it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 332.78it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 339.24it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 301.03it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 293.27it/s]
Cleaning Progress: 100%|█████████████████████████| 1/1 [00:00<00:00, 364.69it/s]
Cleaning Progress: 100%|████

In [27]:
compliance_terms = compliance_df['terms'].tolist()
cleaned_compliance_terms = compliance_df['cleaned_terms'].tolist()

print('Compliance terms sample:')
print(compliance_terms[:5])
print()
print('Compliance (cleaned) terms sample:')
print(cleaned_compliance_terms[:5])

Compliance terms sample:
['embezzlement', 'fraud', 'insider trading', 'money laundering', 'ponzi scheme']

Compliance (cleaned) terms sample:
['embezzlement', 'fraud', 'insider trading', 'money laundering', 'ponzi scheme']


In [18]:
compliance_terms = compliance_terms + cleaned_compliance_terms

In [19]:
df['label'] = df['cleaned_title_summary'].str.contains('|'.join(compliance_terms))

In [20]:
df['label'].value_counts()

label
False    856
True     648
Name: count, dtype: int64

In [21]:
df[df['label'] == True].head()

Unnamed: 0,article_title,article_summary,article_url,date_published,cleaned_title_summary,label
2,Two Former Missouri Health Care Charity Execut...,"Two former executives of a Springfield, Missou...",https://www.justice.gov//opa/pr/two-former-mis...,2024-04-29,missouri health care charity executive sentenc...,True
3,Laboratory Marketer and North Carolina Physici...,"Laboratory marketer Thomas Anthony Carnaggio, ...",https://www.justice.gov//opa/pr/laboratory-mar...,2024-04-29,laboratory marketer north carolina physician a...,True
4,Former NSA Employee Sentenced to Over 21 Years...,"Jareh Sebastian Dalke, 32, of Colorado Springs...",https://www.justice.gov//opa/pr/former-nsa-emp...,2024-04-29,nsa employee sentence 21 year prison attempt e...,True
5,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,justice department recover fraudulent transfer...,True
8,Doctor Convicted for $5.4M Medicare Fraud Scheme,,https://www.justice.gov//opa/pr/doctor-convict...,2024-04-26,doctor convict $ 5.4 m medicare fraud scheme nn,True


In [22]:
df[df['label'] == False].head()

Unnamed: 0,article_title,article_summary,article_url,date_published,cleaned_title_summary,label
0,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,justice department support competition low pri...,False
1,Readout of Pardon Attorney’s Outreach Efforts ...,"During the month of April, which is recognized...",https://www.justice.gov//opa/pr/readout-pardon...,2024-04-29,readout pardon attorney outreach effort second...,False
6,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,justice department secure agreement resolve se...,False
7,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,attorney general merrick b. garland statement ...,False
9,Williams-Sonoma Ordered to Pay Record Civil Pe...,"The Justice Department, together with the Fede...",https://www.justice.gov//opa/pr/williams-sonom...,2024-04-26,williams sonoma order pay record civil penalty...,False


In [24]:
# df.to_csv('doj_data.csv', index=False)
df.to_csv('doj_data.csv', index=False)