### Imports

In [2]:
import pandas as pd
import numpy as np

import re
import unicodedata
import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))
import spacy
nlp = spacy.load("en_core_web_sm")

from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch

### Functions

#### clean text

In [3]:
def clean_text(doc):
    """Normalize raw text into a lowercase, lemmatized, space-separated string.

    Steps, in order: lowercase, collapse whitespace, remove HTML tags, e-mail
    addresses and URLs, decompose accented characters and drop whatever is
    still non-ASCII, remove punctuation, remove English stopwords (module-level
    ``english_stopwords``) and lemmatize with the module-level spaCy pipeline
    ``nlp``.

    Parameters
    ----------
    doc : str
        Text to clean. ``None`` and float NaN (a missing pandas cell) are
        treated as empty text; any other non-string value is converted with
        ``str()``.

    Returns
    -------
    str
        The lemmas of the remaining tokens joined by single spaces; ``''`` for
        missing input.
    """
    # Missing cells arrive as None/NaN when this function is used through
    # Series.apply; calling .lower() on them would raise AttributeError.
    if doc is None or (isinstance(doc, float) and doc != doc):
        return ''

    # normalize text
    doc = str(doc).lower()

    # collapse runs of whitespace (raw string: '\s' in a plain literal is an
    # invalid escape sequence and triggers a warning on recent Python versions)
    doc = re.sub(r'\s+', ' ', doc).strip()

    # remove html tags
    doc = re.sub(r'<.*?>', '', doc)

    # remove email addresses
    doc = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', '', doc)

    # remove urls
    doc = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', doc)

    # remove accented characters: NFKD splits base letter and accent, the
    # ASCII round-trip then discards every non-ASCII code point
    doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove special symbols/punctuation (everything except word chars and spaces)
    doc = re.sub(r'[^\w ]+', '', doc)

    # remove stopwords
    doc = ' '.join(word for word in doc.split() if word not in english_stopwords)

    # lemmatization
    return ' '.join(tok.lemma_ for tok in nlp(doc))

### Inference

In [4]:
# Read the DOJ press-release data used for inference and preview the first rows
inference_csv = 'data/doj_data_May2024.csv'
df_infer = pd.read_csv(inference_csv)
df_infer.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Justice Department and State Coalition Restore...,"Today, the Justice Department filed a proposed...",https://www.justice.gov//opa/pr/justice-depart...,2024-05-30,Justice Department and State Coalition Restore...,justice department state coalition restore com...,0
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,Final Texas Defendants Plead Guilty to Conspir...,final texas defendant plead guilty conspiracy ...,1
2,Former Investment Banker and Registered Broker...,"A former investment banker, who was formerly a...",https://www.justice.gov//opa/pr/former-investm...,2024-05-30,Former Investment Banker and Registered Broker...,former investment banker register broker sente...,1


In [5]:
# Combine title and summary into one text field.
# Missing values are replaced with '' first: astype(str) on its own turns NaN
# into the literal string 'nan', which would then be cleaned and classified
# as if it were article text. The final strip() removes the separator that is
# left dangling when one of the two sides is empty.
df_infer['title_summary'] = (
    df_infer['article_title'].fillna('').astype(str)
    + " "
    + df_infer['article_summary'].fillna('').astype(str)
).str.strip()

# Function call to clean title and summary text
df_infer['cleaned_title_summary'] = df_infer['title_summary'].apply(clean_text)

df_infer.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Justice Department and State Coalition Restore...,"Today, the Justice Department filed a proposed...",https://www.justice.gov//opa/pr/justice-depart...,2024-05-30,Justice Department and State Coalition Restore...,justice department state coalition restore com...,0
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,Final Texas Defendants Plead Guilty to Conspir...,final texas defendant plead guilty conspiracy ...,1
2,Former Investment Banker and Registered Broker...,"A former investment banker, who was formerly a...",https://www.justice.gov//opa/pr/former-investm...,2024-05-30,Former Investment Banker and Registered Broker...,former investment banker register broker sente...,1


#### via pipeline

In [6]:
# from transformers import pipeline

# classifier = pipeline('sentiment-analysis', model=finetuned_model)
# classified_texts = classifier(texts)

# # print(classified_texts[:3])
# # print()
# print('classified summaries done.')

#### via pure torch (replicated pipeline)

In [7]:
def model_infer(checkpoint, text_list) -> list:
    """Classify each text with a fine-tuned sequence-classification checkpoint.

    Parameters
    ----------
    checkpoint : str
        Value passed to ``from_pretrained`` for both the tokenizer and the model.
    text_list : iterable of str
        Texts to classify; each text is tokenized and run through the model
        on its own (truncated to the tokenizer's limit).

    Returns
    -------
    list of dict
        One ``{'label': ..., 'score': ...}`` entry per input text, in input
        order. ``label`` is looked up in the model config's ``id2label``
        mapping; ``score`` is the softmax probability of the predicted class,
        rounded to 5 decimals.
    """
    # Load pretrained tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    finetuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # Inference only: make sure layers such as dropout run in evaluation mode
    finetuned_model.eval()

    # Holds one {'label', 'score'} dict per text
    classifications = []

    # Without no_grad() every forward pass records an autograd graph that is
    # never used, which only costs memory and time during inference.
    with torch.no_grad():
        for text in text_list:
            inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

            # Single text per call, so row 0 holds the only set of class logits
            logits = finetuned_model(**inputs).logits[0]

            predicted_class_id = logits.argmax(dim=-1).item()
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            score = np.round(probabilities[predicted_class_id].item(), 5)
            label = finetuned_model.config.id2label[predicted_class_id]

            classifications.append({'label': label, 'score': score})

    print(f'Number of classifications: {len(classifications)}')

    return classifications

In [8]:
# Toggle: True classifies the cleaned/lemmatized text column, False the raw title + summary column
use_cleaned = False

In [9]:
# Pick the cleaned or the raw title/summary column and turn it into a list
text_column = 'cleaned_title_summary' if use_cleaned else 'title_summary'
titles_summaries = df_infer[text_column].tolist()

In [10]:
# Make sure every list item is a string
texts = list(map(str, titles_summaries))

In [15]:
def make_classifiction_list(classifications) -> list:
    """Extract the labels from classification dicts and report class counts.

    Parameters
    ----------
    classifications : iterable of dict
        Each item must provide a ``'label'`` key.

    Returns
    -------
    list
        The ``'label'`` values in input order. As a side effect, prints how
        many labels equal ``'TRUE'`` (positive) and how many do not (negative).
    """
    classification_list = [entry['label'] for entry in classifications]

    # Everything that is not exactly 'TRUE' counts as negative
    pos_class = sum(1 for label in classification_list if label == 'TRUE')
    neg_class = len(classification_list) - pos_class

    print(f'Number of positive classes: {pos_class}')
    print(f'Number of negative classes: {neg_class}')

    return classification_list

#### Predictions

##### BERT

In [11]:
# Location of the fine-tuned BERT checkpoint (a path string, not a loaded model)
finetuned_model = 'finetuned_bert_model/'

In [12]:
# Run the checkpoint over all texts, then reduce the results to a plain list of labels
classifications = model_infer(finetuned_model, texts)
classify_list = make_classifiction_list(classifications)

Number of classifications: 144
Number of positive classes: 71
Number of negative classes: 73


In [13]:
# Store the predicted labels as a new column (one label per row, same order as the texts)
df_infer['bert_preds'] = classify_list

In [14]:
# Preview the first rows, including the new prediction column
df_infer.head(2)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label,bert_preds
0,Former Investment Banker and Registered Broker...,"A former investment banker, who was formerly a...",https://www.justice.gov//opa/pr/former-investm...,2024-05-30,Former Investment Banker and Registered Broker...,former investment banker register broker sente...,True,True
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,Final Texas Defendants Plead Guilty to Conspir...,final texas defendant plead guilty conspiracy ...,True,True


##### DistilBERT

In [16]:
# Location of the fine-tuned DistilBERT checkpoint; rebinds the name used for the BERT checkpoint
finetuned_model = 'finetuned_distilbert_model/'

In [17]:
# Run the checkpoint over all texts, then reduce the results to a plain list of labels
classifications = model_infer(finetuned_model, texts)
classify_list = make_classifiction_list(classifications)

Number of classifications: 144
Number of positive classes: 73
Number of negative classes: 71


In [18]:
# Store the predicted labels as a new column (one label per row, same order as the texts)
df_infer['distilbert_preds'] = classify_list

In [24]:
# Convert the string labels to 1/0.
# astype(bool) must not be used here: every non-empty string is truthy, so the
# negative label would become True as well and every prediction would end up 1.
# Only 'TRUE' counts as positive, mirroring the check in make_classifiction_list;
# '1' is accepted too so that re-running this cell keeps already-converted values.
df_infer['distilbert_preds'] = (
    df_infer['distilbert_preds']
    .astype(str)
    .str.upper()
    .isin(['TRUE', '1'])
    .astype(int)
)

In [25]:
# Preview the first rows, including the converted prediction column
df_infer.head(2)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label,distilbert_preds
0,Justice Department and State Coalition Restore...,"Today, the Justice Department filed a proposed...",https://www.justice.gov//opa/pr/justice-depart...,2024-05-30,Justice Department and State Coalition Restore...,justice department state coalition restore com...,0,1
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,Final Texas Defendants Plead Guilty to Conspir...,final texas defendant plead guilty conspiracy ...,1,1


In [27]:
# Export frame: df_infer without the intermediate text columns.
# drop() returns a new frame, so df_infer itself is left untouched.
df_out = df_infer.drop(columns=['title_summary', 'cleaned_title_summary'])
df_out.head(2)

Unnamed: 0,article_title,article_summary,article_url,date_published,label,distilbert_preds
0,Justice Department and State Coalition Restore...,"Today, the Justice Department filed a proposed...",https://www.justice.gov//opa/pr/justice-depart...,2024-05-30,0,1
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,1,1


In [28]:
# Write the predictions to disk; index=False keeps the row index out of the file
df_out.to_csv('data/doj_preds.csv', index=False)

In [21]:
# df_infer.iloc[85]['article_summary']

# TEST (KEYWORD DETECTION)

In [38]:
# Load the compliance keyword list and run each term through clean_text,
# the same cleaning/lemmatization applied to the article text
df_keywords = pd.read_csv('compliance words/compliance_keywords.csv')
df_keywords['lemmas'] = df_keywords['compliance_terms'].apply(clean_text)

Unnamed: 0,compliance_terms,lemmas
0,accounting,account
1,accounting practices,accounting practice
2,accounting fraud,accounting fraud
3,advance fee fraud,advance fee fraud
4,antitrust violations,antitrust violation
5,asset,asset
6,misappropriation,misappropriation
7,stripping,strip
8,bait and switch,bait switch
9,bearer shares,bearer share


In [45]:
# df_out.head()

# Keep only the rows whose BERT prediction is the string label 'TRUE'
is_bert_positive = df_out['bert_preds'].eq('TRUE')
filtered_df = df_out.loc[is_bert_positive]
# keywords_list = list(df_keywords['lemmas'])
# filtered_df = df_out[df_out['bert_preds'] == 'TRUE'].isin(keywords_list)

filtered_df
# filtered_df = df_keywords[df_keywords['Name'].isin(list_of_strings)]

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label,bert_preds,distilbert_preds
0,Former Investment Banker and Registered Broker...,"A former investment banker, who was formerly a...",https://www.justice.gov//opa/pr/former-investm...,2024-05-30,Former Investment Banker and Registered Broker...,former investment banker register broker sente...,True,TRUE,TRUE
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,Final Texas Defendants Plead Guilty to Conspir...,final texas defendant plead guilty conspiracy ...,True,TRUE,TRUE
2,Two Estonian Nationals Extradited from Estonia...,Two Estonian nationals will make their initial...,https://www.justice.gov//opa/pr/two-estonian-n...,2024-05-30,Two Estonian Nationals Extradited from Estonia...,two estonian national extradite estonia united...,True,TRUE,TRUE
6,Disbarred Attorney Pleads Guilty to Promoting ...,A disbarred California attorney pleaded guilty...,https://www.justice.gov//opa/pr/disbarred-atto...,2024-05-29,Disbarred Attorney Pleads Guilty to Promoting ...,disbar attorney plead guilty promote 95 m cryp...,True,TRUE,TRUE
7,911 S5 Botnet Dismantled and Its Administrator...,A court-authorized international law enforceme...,https://www.justice.gov//opa/pr/911-s5-botnet-...,2024-05-29,911 S5 Botnet Dismantled and Its Administrator...,911 s5 botnet dismantle administrator arrest c...,True,TRUE,TRUE
...,...,...,...,...,...,...,...,...,...
136,Extradited Nigerian National Convicted of Busi...,"A federal jury in New Haven, Connecticut, conv...",https://www.justice.gov//opa/pr/extradited-nig...,2024-05-01,Extradited Nigerian National Convicted of Busi...,extradite nigerian national convict business e...,True,TRUE,TRUE
139,Sodinokibi/REvil Affiliate Sentenced for Role ...,A Ukrainian national was sentenced today to 13...,https://www.justice.gov//opa/pr/sodinokibirevi...,2024-05-01,Sodinokibi/REvil Affiliate Sentenced for Role ...,sodinokibirevil affiliate sentence role 700 m ...,True,TRUE,TRUE
140,Staffing Company to Pay $2.7M for Alleged Fail...,"Insight Global LLC, headquartered in Atlanta h...",https://www.justice.gov//opa/pr/staffing-compa...,2024-05-01,Staffing Company to Pay $2.7M for Alleged Fail...,staff company pay 27 m allege failure provide ...,True,TRUE,TRUE
142,Elara Caring Agrees to Pay $4.2 Million to Set...,"Elara Caring, and its wholly owned subsidiarie...",https://www.justice.gov//opa/pr/elara-caring-a...,2024-05-01,Elara Caring Agrees to Pay $4.2 Million to Set...,elara caring agree pay 42 million settle false...,True,TRUE,TRUE


# Test (SUMMARIZATION)

In [22]:
# from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
# checkpoint = 'facebook/bart-large-cnn'
# summary_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [23]:
from transformers import pipeline

# Build a summarization pipeline backed by the facebook/bart-large-cnn checkpoint
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")



In [24]:
# Show the second title/summary text, used as the summarization example
texts[1]

'Final Texas Defendants Plead Guilty to Conspiracy to Commit Mail and Wire Fraud and Aggravated Identity Theft A Texas man pleaded guilty on Friday to conspiracy to commit mail and wire fraud and aggravated identity theft, and a Texas women pleaded guilty yesterday to conspiracy to commit mail and wire fraud both in connection with a wide-ranging scheme to defraud the IRS. In all, seven defendants have now pleaded guilty to this scheme that sought over $111 million in fraudulent tax refunds.'

In [25]:
# Upper and lower bounds for the generated summary length.
# Note: a higher maximum length results in slower run time.
max_len, min_len = 20, 5

In [26]:
# do_sample=True draws tokens at random, so seed torch's global RNG first to
# make the printed summary reproducible from one run to the next.
torch.manual_seed(42)
print(summarizer(texts[1], max_length=max_len, min_length=min_len, do_sample=True))

[{'summary_text': 'A Texas man pleaded guilty on Friday to conspiracy to commit mail and wire fraud and aggravated'}]


In [27]:
# Summarize those of the first ten texts that were classified as 'TRUE'.
# The `if` needs a real statement as its body: with both lines commented out
# the cell does not parse, so the summarizer call is restored.
for i, classification in enumerate(classifications[:10]):
    if classification['label'] == 'TRUE':
        # print(texts[i])
        print(summarizer(texts[i], max_length=max_len, min_length=min_len, do_sample=True))

[{'summary_text': 'A former investment banker and registered broker was sentenced to three years and five months in prison'}]
[{'summary_text': 'Two Texas defendants plead guilty to conspiracy to commit mail and wire fraud and aggravated identity theft'}]
[{'summary_text': 'Two Estonian nationals will make their initial appearance in the U.S. District Court'}]
[{'summary_text': 'Disbarred California attorney pleads guilty to conspiring to operate a cryptocurrency Ponzi'}]
[{'summary_text': '911 S5 Botnet Dismantled and Its Administrator Arrested in Coordinated International'}]
[{'summary_text': 'Two top executives agreed to pay a total of $12 million to resolve allegations that they'}]
