#### imports

In [24]:
import time

import pandas as pd
import numpy as np

import requests
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import spacy
from spacy_cleaner import processing, Cleaner

import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split

import torchtext
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

print(f'PyTorch version: {torch.__version__}')

PyTorch version: 2.3.0


In [2]:
if torch.backends.mps.is_available():
    dtype = torch.float
    device = 'cpu'
    
    if torch.backends.mps.is_available():
        if torch.backends.mps.is_built():
            device = torch.device('mps')      
            print("Device set to 'mps'")
            device = 'cpu'
        
    elif torch.backends.cuda.is_available():
        if torch.backends.cuda.is_built():
            device = torch.device('cuda')
            print("Device set to 'cuda'")
    else:
        print ("No gpu device found")

Device set to 'mps'


#### functions

In [3]:
def clean_text(text):
    
    # Instaiate spacy model
    model = spacy.load('en_core_web_sm')
    
    # Instaiate spacy cleaner
    cleaner = Cleaner( 
        model,
        processing.remove_stopword_token,
        processing.remove_punctuation_token,
        processing.remove_email_token,
        processing.remove_url_token,
        processing.mutate_lemma_token
    )
    
    cleaned_text_list = []
        
    if isinstance(text, str):
        text = text.lower()
        text = [text]
    
    if isinstance(text, list):
        cleaned_text = cleaner.clean(text)            
        cleaned_text = ''.join(cleaned_text)
        cleaned_text_list.append(cleaned_text)
    else:
        text = str(text)
        cleaned_text = cleaner.clean(text)
        cleaned_text = ''.join(cleaned_text)
        cleaned_text_list.append(cleaned_text)
            
    
    return cleaned_text

#### dataset

In [4]:
data = pd.read_csv('doj_data.csv')

In [5]:
data.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,cleaned_title_summary,label
0,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,justice department support competition low pri...,False
1,Readout of Pardon Attorney’s Outreach Efforts ...,"During the month of April, which is recognized...",https://www.justice.gov//opa/pr/readout-pardon...,2024-04-29,readout pardon attorney outreach effort second...,False
2,Two Former Missouri Health Care Charity Execut...,"Two former executives of a Springfield, Missou...",https://www.justice.gov//opa/pr/two-former-mis...,2024-04-29,missouri health care charity executive sentenc...,True


In [6]:
change_labels = lambda x: 1 if x==True else 0
data['label'] = data['label'].apply(change_labels)

In [7]:
data.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,cleaned_title_summary,label
0,The Justice Department Supports More Competiti...,The Justice Department’s Antitrust Division to...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,justice department support competition low pri...,0
1,Readout of Pardon Attorney’s Outreach Efforts ...,"During the month of April, which is recognized...",https://www.justice.gov//opa/pr/readout-pardon...,2024-04-29,readout pardon attorney outreach effort second...,0
2,Two Former Missouri Health Care Charity Execut...,"Two former executives of a Springfield, Missou...",https://www.justice.gov//opa/pr/two-former-mis...,2024-04-29,missouri health care charity executive sentenc...,1


# torchtext model

#### train/test/validate data sets

In [8]:
# Create train/test sets
# X_train, X_test, Y_train, Y_test = train_test_split(data['article_summary'].astype(str).tolist(),
X_train, X_test, Y_train, Y_test = train_test_split(data['cleaned_title_summary'].tolist(),
                                                     data['label'].tolist(),
                                                     test_size=0.2,
                                                     stratify=data['label'].tolist(),
                                                     random_state=42)

# Create validation set
X_train, X_valid, Y_train, Y_valid = train_test_split(X_train, Y_train, test_size = 0.1)

In [9]:
train_data = list(zip(Y_train, X_train))
valid_data = list(zip(Y_valid, X_valid))
test_data = list(zip(Y_test, X_test))

#### train data vocabulary

In [10]:
tokenizer = get_tokenizer('basic_english')
train_iter = train_data

def yield_tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

#### text and label preprocessing pipleines

In [11]:
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x)

In [12]:
text_pipeline('justice department support competition')

[2, 1, 128, 460]

#### batch collation function

In [13]:
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))
    
    label_list = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
    text_list = torch.cat(text_list)
    
    return label_list.to(device), text_list.to(device), offsets.to(device)


#### classification model

In [14]:
class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.fc = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, text, offsets):
        embedded = self.embedding(text, offsets)
        return self.fc(embedded)

In [15]:
train_iter = train_data
num_class = len(set([label for (label, text) in train_iter]))
print(num_class)

2


In [16]:
vocab_size = len(vocab)
embedding_size = 128
model = TextClassificationModel(vocab_size, embedding_size, num_class).to(device)

#### train and evaluate methods

In [17]:
def train(dataloader):
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 25
    start_time = time.time()

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        predited_label = model(text, offsets)
        loss = criterion(predited_label, label)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predited_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader), total_acc/total_count))
            total_acc, total_count = 0, 0
            start_time = time.time()

def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text, offsets) in enumerate(dataloader):
            predited_label = model(text, offsets)
            loss = criterion(predited_label, label)
            total_acc += (predited_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
            
    return total_acc/total_count

#### train dataloaders

In [18]:
# Hyperparameters
EPOCHS = 10 # epoch
LR = 10  # learning rate
BATCH_SIZE = 16 # batch size for training

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accuracy = None

train_iter = train_data
test_iter = test_data
valid_iter = valid_data

train_dataloader = DataLoader(train_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(valid_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_iter, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accuracy_val = evaluate(valid_dataloader)
    
    if total_accuracy is not None and total_accuracy > accuracy_val:
      scheduler.step()
    else:
       total_accuracy = accuracy_val
        
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch, time.time() - epoch_start_time, accuracy_val))
    
    print('-' * 59)

| epoch   1 |    25/   68 batches | accuracy    0.663
| epoch   1 |    50/   68 batches | accuracy    0.818
-----------------------------------------------------------
| end of epoch   1 | time:  0.14s | valid accuracy    0.860 
-----------------------------------------------------------
| epoch   2 |    25/   68 batches | accuracy    0.928
| epoch   2 |    50/   68 batches | accuracy    0.895
-----------------------------------------------------------
| end of epoch   2 | time:  0.10s | valid accuracy    0.884 
-----------------------------------------------------------
| epoch   3 |    25/   68 batches | accuracy    0.957
| epoch   3 |    50/   68 batches | accuracy    0.958
-----------------------------------------------------------
| end of epoch   3 | time:  0.10s | valid accuracy    0.901 
-----------------------------------------------------------
| epoch   4 |    25/   68 batches | accuracy    0.978
| epoch   4 |    50/   68 batches | accuracy    0.980
-------------------------

In [19]:
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.910


#### evaluate model

In [20]:
def predict(text, text_pipeline):
    with torch.no_grad():
        text = torch.tensor(text_pipeline(text))
        output = model(text, torch.tensor([0]))
        return output.argmax(1).item()

In [21]:
compliance_label = {0: 'not compliance related',
                    1: 'is compliance related'}

text_str = data['cleaned_title_summary'][2]
prediction = compliance_label[predict(text_str, text_pipeline)]

print(f'{text_str[:100]}...{prediction}')

missouri health care charity executive sentence role multimillion dollar bribery embezzlement scheme...is compliance related


### classify article summaries

In [57]:
# DOJ article summary range
start_date = '05/01/2024'
end_date = '05/28/2024'

# Set base DOJ url
base_url = f'https://www.justice.gov/news/press-releases?search_api_fulltext=+&start_date={start_date}&end_date={end_date}&sort_by=field_date'

#### get total pages

In [58]:
website = requests.get(base_url)
soup = BeautifulSoup(website.content, 'html.parser')

pagination = soup.find('ul', {'class': 'usa-pagination__list js-pager__items'})
pages = pagination.findChildren(recursive=False)

max_page_num = 0
# iterate <li> tags to get the max number of pages returned by date values
for i, page in enumerate(pages):
    if i == len(pages) - 1:
        a = page.find('a')['href']
        idx = a.index('page=')
        max_page_num = a[idx:].replace('page=', '')
        max_page_num = int(max_page_num)

#### get article summaries

In [59]:
feed_data = []

In [60]:
%%time
for i in range(max_page_num + 1):
    page_url = base_url + '&page=' + str(i)
    page = requests.get(page_url)
    
    soup = BeautifulSoup(page.content, 'html.parser')

    articles = soup.find('div', {'class': 'rows-wrapper'})
    articles = articles.findChildren(recursive=False)

    print(f'Parsing content for {"page " + str(i + 1)} of {max_page_num + 1}')
    for article in articles:
        title = article.find('a').text.strip()

        summary = article.find('p')
        if summary:
            summary = summary.text.strip()
        else:
            summary = np.nan
        
        url = 'https://www.justice.gov/' + article.find('a')['href']
        date = pd.to_datetime(article.find('time')['datetime']).date()

        # Append feed data objects
        feed_data.append({
            'article_title': title,
            'article_summary': summary,
            'article_url': url,
            'date_published': date
        })

Parsing content for page 1 of 11
Parsing content for page 2 of 11
Parsing content for page 3 of 11
Parsing content for page 4 of 11
Parsing content for page 5 of 11
Parsing content for page 6 of 11
Parsing content for page 7 of 11
Parsing content for page 8 of 11
Parsing content for page 9 of 11
Parsing content for page 10 of 11
Parsing content for page 11 of 11
CPU times: user 766 ms, sys: 28.9 ms, total: 795 ms
Wall time: 9.93 s


In [61]:
df = pd.DataFrame(feed_data)
print(f'dataframe row count: {len(df)}')
df.head()

dataframe row count: 129


Unnamed: 0,article_title,article_summary,article_url,date_published
0,Attorney General Merrick B. Garland Statement ...,The Justice Department issued the following st...,https://www.justice.gov//opa/pr/attorney-gener...,2024-05-25
1,Doctor Convicted of $70M Medicare Fraud Scheme,A federal jury convicted a Texas doctor today ...,https://www.justice.gov//opa/pr/doctor-convict...,2024-05-24
2,Owner of Arkansas Tree Service Business Pleads...,An Arkansas man pleaded guilty to filing a fal...,https://www.justice.gov//opa/pr/owner-arkansas...,2024-05-24
3,Florida Businessman Daniel Hurt to Pay Over $2...,"Daniel Hurt, who owned and/or operated Fountai...",https://www.justice.gov//opa/pr/florida-busine...,2024-05-24
4,Former CIA Officer Pleads Guilty to Conspiracy...,"Alexander Yuk Ching Ma, 71, of Honolulu, a for...",https://www.justice.gov//opa/pr/former-cia-off...,2024-05-24


In [64]:
# Set label output values
compliance_label = {0: 'False',
                    1: 'True'}

# Iterate dataframe
for idx, row in df.iterrows():
    title = str(row['article_title'])
    summary = str(row['article_summary'])
    text_str = title + " " + summary
    # cleaned_text_str = clean_text(text_str)
    prediction = compliance_label[predict(text_str, text_pipeline)]

    df.loc[idx, "compliance_related"] = prediction

df.head()

Unnamed: 0,article_title,article_summary,article_url,date_published,compliance_related
0,Attorney General Merrick B. Garland Statement ...,The Justice Department issued the following st...,https://www.justice.gov//opa/pr/attorney-gener...,2024-05-25,False
1,Doctor Convicted of $70M Medicare Fraud Scheme,A federal jury convicted a Texas doctor today ...,https://www.justice.gov//opa/pr/doctor-convict...,2024-05-24,True
2,Owner of Arkansas Tree Service Business Pleads...,An Arkansas man pleaded guilty to filing a fal...,https://www.justice.gov//opa/pr/owner-arkansas...,2024-05-24,True
3,Florida Businessman Daniel Hurt to Pay Over $2...,"Daniel Hurt, who owned and/or operated Fountai...",https://www.justice.gov//opa/pr/florida-busine...,2024-05-24,True
4,Former CIA Officer Pleads Guilty to Conspiracy...,"Alexander Yuk Ching Ma, 71, of Honolulu, a for...",https://www.justice.gov//opa/pr/former-cia-off...,2024-05-24,True


In [65]:
print(f'Number of predicted articles: {len(df)}')

Number of predicted articles: 129


In [67]:
df.to_csv('predicted doj articles.xlsx', index=False)