### Imports

In [1]:
import regex as re
import unicodedata

import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
english_stopwords = set(stopwords.words('english'))

from tqdm.notebook import trange, tqdm

[nltk_data] Downloading package wordnet to /Users/wes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/wes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/wes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Functions

#### scrape DOJ press releases

In [2]:
def doj_base_url(start_date, end_date) -> str:
    base_url = f'https://www.justice.gov/news/press-releases?search_api_fulltext=+&start_date={start_date}&end_date={end_date}&sort_by=field_date'
    
    return base_url

In [3]:
def doj_pagination_counter(base_url):
    
    website = requests.get(base_url)
    soup = BeautifulSoup(website.content, 'html.parser')
    
    pagination = soup.find('ul', {'class': 'usa-pagination__list js-pager__items'})
    pages = pagination.findChildren(recursive=False)
    
    max_page_num = 0
    # iterate <li> tags to get the max number of pages returned by date values
    for i, page in enumerate(pages):
        if i == len(pages) - 1:
            a = page.find('a')['href']
            idx = a.index('page=')
            max_page_num = a[idx:].replace('page=', '')
            max_page_num = int(max_page_num)

    return max_page_num

In [4]:
def get_doj_press_releases(base_url, max_page_num):

    # Intialize list
    feed_data = []

    # Iterate press release pages
    for i in trange(max_page_num + 1, desc='retrieving summaries'):
        page_url = base_url + '&page=' + str(i)
        page = requests.get(page_url)

        # Instantiate bs4 object
        soup = BeautifulSoup(page.content, 'html.parser')
        
        # find summary tag(s)
        articles = soup.find('div', {'class': 'rows-wrapper'})
        articles = articles.findChildren(recursive=False)

        # Get article title and summaries
        for article in articles:

            title = article.find('a').text.strip()    
            summary = article.find('p')
            
            if summary:
                summary = summary.text.strip()
            else:
                summary = np.nan
            
            url = 'https://www.justice.gov/' + article.find('a')['href']
            date = pd.to_datetime(article.find('time')['datetime']).date()
    
            # Append feed data objects
            feed_data.append({
                'article_title': title,
                'article_summary': summary,
                'article_url': url,
                'date_published': date
            })

    df = pd.DataFrame(feed_data)
    print(f'Number of press releases: {len(df)}')
    
    return df

#### clean text

In [5]:
def clean_text(doc):
    
    # normalize Text
    doc = doc.lower()

    # remove unnecessary whitespaces
    doc = re.sub('\s+', ' ', doc)
    doc = doc.strip()
    
    # remove html tags
    doc = re.sub('<.*?>', '', doc)
    
    # remove email addresses
    doc = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', '', doc)
    
    # remove url
    doc = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', doc)
    
    # remove accented characters
    doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove special symbols/punctuation
    doc = re.sub(r'[^\w ]+', ' ', doc)

    # remove stopwords
    doc = ' '.join([word.strip() for word in doc.split() if word not in english_stopwords])

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    doc = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(doc)])

    return doc

### DOJ press releases

### scrape dataset

In [6]:
# Declare scraping dates
start_date = '01/01/2020'
end_date = '04/30/2024'

In [7]:
# Function calls to get DOJ press release scrape parameters
base_url = doj_base_url(start_date, end_date)
max_page_num = doj_pagination_counter(base_url)

In [8]:
# Function call to scrape DOj press releases
df = get_doj_press_releases(base_url, max_page_num)
df.head()

retrieving summaries:   0%|          | 0/505 [00:00<?, ?it/s]

Number of press releases: 6052


Unnamed: 0,article_title,article_summary,article_url,date_published
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29
3,Former NSA Employee Sentenced to Over 21 Years...,"Jareh Sebastian Dalke, 32, of Colorado Springs...",https://www.justice.gov//opa/pr/former-nsa-emp...,2024-04-29
4,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29


### Working dataframe

In [70]:
train_df = df.copy()

In [71]:
# Combine title and summary
train_df['title_summary'] = train_df['article_title'].astype(str) + " " + train_df['article_summary'].astype(str)

# Function call to clean itle and summary text
train_df['cleaned_title_summary'] = train_df['title_summary'].apply(clean_text)
train_df.head()

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recovers fraudulent transfe...
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secures agreement resolve s...
3,Former NSA Employee Sentenced to Over 21 Years...,"Jareh Sebastian Dalke, 32, of Colorado Springs...",https://www.justice.gov//opa/pr/former-nsa-emp...,2024-04-29,Former NSA Employee Sentenced to Over 21 Years...,former nsa employee sentenced 21 year prison a...
4,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,The Justice Department Supports More Competiti...,justice department support competition lower p...


### Annote training/test dataset

In [72]:
# Read compliance corpus csv
compliance_df = pd.read_csv('compliance words/complinace_corpus_less.csv')

# Clearn compliance corpus terms
compliance_df['compliance_terms'] = compliance_df['compliance_terms'].apply(clean_text)
compliance_df.head()

Unnamed: 0,compliance_terms
0,accounting
1,accounting fraud
2,antitrust
3,asset misappropriation
4,asset stripping


In [73]:
# Convert compliance terms to list
compliance_terms = compliance_df['compliance_terms'].tolist()

In [74]:
# Check cleaned DOJ press release text for compliance terms
train_df['label'] = train_df['cleaned_title_summary'].str.contains('|'.join(compliance_terms))

# Count labels 
train_df['label'].value_counts()

label
True     3121
False    2931
Name: count, dtype: int64

In [75]:
# Get non compliance corpus terms
not_compliance_df = pd.read_csv('compliance words/not_complinace_corpus.csv')
not_compliance_df['not_compliance_terms'] = not_compliance_df['not_compliance_terms'].apply(clean_text)

not_compliance_terms = not_compliance_df['not_compliance_terms'].tolist()

print(not_compliance_terms)

['traffic', 'trafficking']


In [76]:
# Label non compliance data
train_df['modify_label'] = train_df['cleaned_title_summary'].str.contains('|'.join(not_compliance_terms))

train_df.loc[train_df['modify_label'] == True, 'label'] = False

# train_df['label'] = train_df.apply(lambda row: False if row['modify_label'] == True else row['label'], axis=1)
train_df.head()

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label,modify_label
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...,False,False
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recovers fraudulent transfe...,True,False
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secures agreement resolve s...,False,False
3,Former NSA Employee Sentenced to Over 21 Years...,"Jareh Sebastian Dalke, 32, of Colorado Springs...",https://www.justice.gov//opa/pr/former-nsa-emp...,2024-04-29,Former NSA Employee Sentenced to Over 21 Years...,former nsa employee sentenced 21 year prison a...,False,False
4,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,The Justice Department Supports More Competiti...,justice department support competition lower p...,True,False


In [77]:
# Count labels 
train_df['label'].value_counts()

label
True     3038
False    3014
Name: count, dtype: int64

In [78]:
# Drop label modifier column
train_df.drop(columns=['modify_label'], inplace=True)

train_df.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...,False
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recovers fraudulent transfe...,True
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secures agreement resolve s...,False


#### Training Data

In [79]:
# Export train/test data to csv
data_file = 'data/doj_data.csv'

train_df.to_csv(data_file, index=False)

#### Inference Data

In [80]:
# # Export train/test data to csv
# data_file = ''

# train_df.to_csv(data_file, index=False)