### Imports

In [3]:
import re
import unicodedata

import pandas as pd
import numpy as np
from collections import Counter

from datetime import datetime

import nltk
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words('english'))
import spacy
nlp = spacy.load("en_core_web_sm")

### Functions

#### clean text

In [4]:
def clean_text(doc):
    
    # normalize Text
    doc = doc.lower()

    # remove unnecessary whitespaces
    doc = re.sub('\s+', ' ', doc)
    doc = doc.strip()
    
    # remove html tags
    doc = re.sub('<.*?>', '', doc)
    
    # remove email addresses
    doc = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', '', doc)
    
    # remove url
    doc = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', doc)
    
    # remove accented characters
    doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove special symbols/punctuation
    doc = re.sub(r'[^\w ]+', '', doc)

    # remove stopwords
    doc = ' '.join([word for word in doc.split() if word not in english_stopwords])

    # lemmatization
    text = []
    for tok in nlp(doc):
        text.append(tok.lemma_)    
    
    doc = ' '.join(text)

    # lemmatizer = WordNetLemmatizer()
    # doc = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(doc)])

    return doc

#### predictions

In [5]:
def get_true_preds(df, col, val):
    mask = df[col] == val
    temp_df = df[mask].copy()
    
    temp_df.reset_index(drop=True, inplace=True)
    
    return temp_df

### Data

#### predictions

In [22]:
df = pd.read_csv('data/doj_preds.csv')
df.head(2)

Unnamed: 0,article_title,article_summary,article_url,date_published,label,distilbert_preds
0,Justice Department and State Coalition Restore...,"Today, the Justice Department filed a proposed...",https://www.justice.gov//opa/pr/justice-depart...,2024-05-30,0,1
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,1,1


In [23]:
print(df.dtypes)

df['formated_date'] = pd.to_datetime(df['date_published'])
df['formated_date'] = df['formated_date'].dt.strftime('%m/%d/%Y')

min_date = df['date_published'].min()
min_date = df['date_published'].max()
min_date

article_title       object
article_summary     object
article_url         object
date_published      object
label                int64
distilbert_preds     int64
dtype: object


'2024-05-30'

In [6]:
# Function call to filter true predictions on Bert model
filtered_df = get_true_preds(df, 'bert_preds', True)

print(f'Number of true predicaed articles: {len(filtered_df)}')
print()
filtered_df.head()

Number of true predicaed articles: 71



Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label,bert_preds,distilbert_preds
0,Former Investment Banker and Registered Broker...,"A former investment banker, who was formerly a...",https://www.justice.gov//opa/pr/former-investm...,2024-05-30,Former Investment Banker and Registered Broker...,former investment banker register broker sente...,True,True,True
1,Final Texas Defendants Plead Guilty to Conspir...,A Texas man pleaded guilty on Friday to conspi...,https://www.justice.gov//opa/pr/final-texas-de...,2024-05-30,Final Texas Defendants Plead Guilty to Conspir...,final texas defendant plead guilty conspiracy ...,True,True,True
2,Two Estonian Nationals Extradited from Estonia...,Two Estonian nationals will make their initial...,https://www.justice.gov//opa/pr/two-estonian-n...,2024-05-30,Two Estonian Nationals Extradited from Estonia...,two estonian national extradite estonia united...,True,True,True
3,Disbarred Attorney Pleads Guilty to Promoting ...,A disbarred California attorney pleaded guilty...,https://www.justice.gov//opa/pr/disbarred-atto...,2024-05-29,Disbarred Attorney Pleads Guilty to Promoting ...,disbar attorney plead guilty promote 95 m cryp...,True,True,True
4,911 S5 Botnet Dismantled and Its Administrator...,A court-authorized international law enforceme...,https://www.justice.gov//opa/pr/911-s5-botnet-...,2024-05-29,911 S5 Botnet Dismantled and Its Administrator...,911 s5 botnet dismantle administrator arrest c...,True,True,True


#### Keywords

In [7]:
df_keywords = pd.read_csv('compliance words/compliance_keywords.csv')
df_keywords['term_lemmas'] = df_keywords['compliance_terms'].apply(clean_text)

df_keywords.drop_duplicates(inplace=True)

# df_keywords.head()

In [8]:
df_keywords['counter'] = 0
df_keywords['row_indexer'] = [[] for _ in range(df_keywords.shape[0])]

In [9]:
print(f'Length of keywords dataframe: {len(df_keywords)}')
print()
df_keywords.head()

Length of keywords dataframe: 144



Unnamed: 0,compliance_terms,term_lemmas,counter,row_indexer
0,accounting,account,0,[]
1,accounting practices,accounting practice,0,[]
2,accounting fraud,accounting fraud,0,[]
3,advance fee fraud,advance fee fraud,0,[]
4,antitrust violations,antitrust violation,0,[]


In [10]:
for t, text_row in filtered_df.iterrows():
    text = text_row['cleaned_title_summary']
    
    c = Counter(text.split())
    text_words_list = list(c.keys())
    
    for k, keyword_row in df_keywords.iterrows():
        keyword = keyword_row['term_lemmas']
        
        if keyword in text_words_list:            
            df_keywords.loc[k, 'counter'] += 1
            df_keywords.loc[k, 'row_indexer'].append(t)

In [11]:
df_keywords[df_keywords['counter'] > 0]

Unnamed: 0,compliance_terms,term_lemmas,counter,row_indexer
0,accounting,account,2,"[20, 32]"
10,bidding,bid,2,"[39, 46]"
11,rigging,rig,2,"[39, 46]"
14,bribery,bribery,4,"[13, 28, 32, 59]"
19,collusion,collusion,1,[43]
20,companies,company,17,"[9, 16, 18, 19, 22, 24, 25, 30, 32, 36, 38, 46..."
21,company,company,17,"[9, 16, 18, 19, 22, 24, 25, 30, 32, 36, 38, 46..."
28,corporate,corporate,2,"[19, 65]"
34,counterfeit,counterfeit,1,[63]
35,counterfeiting,counterfeit,1,[63]


In [12]:
df_mactched_keywords = df_keywords[df_keywords['counter'] > 0].copy()
df_mactched_keywords.reset_index(drop=True, inplace=True)

df_mactched_keywords.head(3)

Unnamed: 0,compliance_terms,term_lemmas,counter,row_indexer,min_published,max_published
0,accounting,account,2,"[20, 32]",2024-05-01,2024-05-30
1,bidding,bid,2,"[39, 46]",2024-05-01,2024-05-30
2,rigging,rig,2,"[39, 46]",2024-05-01,2024-05-30


In [13]:
df_mactched_keywords.to_csv('data/keywords_found.csv')

In [14]:
counter = {}
date_freq = {}
date_keys = filtered_df['date_published'].unique()

for k in reversed(date_keys):
    str_date = str(k)
    date_freq[str_date] = 0

In [15]:
for i, df1 in df_mactched_keywords.iterrows():
    
    row_list = df1['row_indexer']
    term = df1['compliance_terms']
    dates = date_freq.copy()
    counter[term] = dates

    for x in range(len(row_list)):
        ref_idx = row_list[x]

        d0 = filtered_df['date_published'][ref_idx]
        date_format = '%Y-%m-%d'
        date_obj = datetime.strptime(d0, date_format).date()
        str_date = str(date_obj)
        
        date_idx = list(date_freq.keys()).index(str_date)
        counter[term][str_date] += 1
         
# counter['accounting']
# counter

In [16]:
filtered_df.iloc[32]

article_title            Chairman of Multinational Investment Company a...
article_summary          A federal jury in Charlotte, North Carolina, c...
article_url              https://www.justice.gov//opa/pr/chairman-multi...
date_published                                                  2024-05-16
title_summary            Chairman of Multinational Investment Company a...
cleaned_title_summary    chairman multinational investment company comp...
label                                                                 True
bert_preds                                                            True
distilbert_preds                                                      True
Name: 32, dtype: object

In [17]:
df_plot = pd.DataFrame(counter)
df_plot = df_plot.loc[(df_plot!=0).any(axis=1)]

In [19]:
df_plot

Unnamed: 0,accounting,bidding,rigging,bribery,collusion,companies,company,corporate,counterfeit,counterfeiting,financial,fraud,fraudulently,kickbacks,racketeering,scam,scheme,skimming,theft
2024-05-01,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,3,0,0
2024-05-02,0,0,0,0,0,2,2,0,1,1,0,2,0,0,0,0,3,0,0
2024-05-03,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0
2024-05-06,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0
2024-05-07,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2,0,0
2024-05-08,0,1,1,0,0,1,1,0,0,0,0,1,0,0,1,0,0,1,0
2024-05-09,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2024-05-10,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,3,0,0
2024-05-13,0,1,1,0,0,1,1,0,0,0,1,1,0,0,0,0,1,0,0
2024-05-15,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,1,0,0


In [21]:
df_plot.index

Index(['2024-05-01', '2024-05-02', '2024-05-03', '2024-05-06', '2024-05-07',
       '2024-05-08', '2024-05-09', '2024-05-10', '2024-05-13', '2024-05-15',
       '2024-05-16', '2024-05-17', '2024-05-20', '2024-05-21', '2024-05-22',
       '2024-05-23', '2024-05-24', '2024-05-29', '2024-05-30'],
      dtype='object')

In [22]:
df_plot.columns

Index(['accounting', 'bidding', 'rigging', 'bribery', 'collusion', 'companies',
       'company', 'corporate', 'counterfeit', 'counterfeiting', 'financial',
       'fraud', 'fraudulently', 'kickbacks', 'racketeering', 'scam', 'scheme',
       'skimming', 'theft'],
      dtype='object')