### Imports

In [1]:
import re
import unicodedata
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
english_stopwords = set(stopwords.words('english'))


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset

import torch

[nltk_data] Downloading package wordnet to /Users/wes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/wes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/wes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Training data

#### Read data

In [2]:
data = pd.read_csv('data/doj_data.csv')
data.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...,False
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recovers fraudulent transfe...,True
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secures agreement resolve s...,False


In [3]:
change_labels = lambda x: 1 if x == True else 0

data['label'] = data['label'].apply(change_labels)
data.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...,0
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recovers fraudulent transfe...,1
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secures agreement resolve s...,0


In [4]:
print(f'Dataframe records: {len(data)}')

Dataframe records: 6052


#### Train/Test split

In [5]:
# Create train/test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(data['cleaned_title_summary'].tolist(),
                                                                      data['label'].tolist(),
                                                                      test_size=0.2,
                                                                      stratify=data['label'].tolist(),
                                                                      random_state=42)

#### Create DatasetDict

In [6]:
# Cast all items in list to string
train_texts = [str(element) for element in train_texts]
test_texts = [str(element) for element in test_texts]

In [7]:
# Set training and evaluation dataframes
train_df = pd.DataFrame({
    'label' : train_labels,
    'text' : train_texts
})

test_df = pd.DataFrame({
    'label' : test_labels,
    'text' : test_texts
})

In [8]:
# Create dataset(s) from dataframe(s)
train_data = Dataset.from_dict(train_df)
test_data = Dataset.from_dict(test_df)

# Create datasets dictionary
dataset_dict = datasets.DatasetDict({'train': train_data, 
                                     'test': test_data})

In [9]:
# Display dataset dictionary details
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 4841
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1211
    })
})

### Finetune pretrained models

#### Setup

In [10]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import evaluate

In [11]:
accuracy = evaluate.load('accuracy')

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [13]:
id2label = {0: 'FALSE', 1: 'TRUE'}
label2id = {'FALSE': 0, 'TRUE': 1}

In [14]:
# Tokenizer function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

#### Bert

In [15]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')



In [16]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# Tokenize the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/4841 [00:00<?, ? examples/s]

Map:   0%|          | 0/1211 [00:00<?, ? examples/s]

In [18]:
# Set training and eval datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
eval_dataset = tokenized_datasets['test'].shuffle(seed=42)

In [19]:
# Instantiate bert model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', 
                                                           num_labels=2,
                                                           id2label=id2label, 
                                                           label2id=label2id
                                                          )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
training_args = TrainingArguments(
    output_dir='finetuned_bert_model',
    num_train_epochs=2,
    
    # learning_rate=2e-5,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,

    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=0.1,
    load_best_model_at_end=True,
    save_total_limit=1
)

In [21]:
# Set training and eval datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
eval_dataset = tokenized_datasets['test'].shuffle(seed=42)

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [23]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1823,0.069025,0.985962
2,0.0667,0.06467,0.985962


TrainOutput(global_step=1212, training_loss=0.1413930638788557, metrics={'train_runtime': 1466.5862, 'train_samples_per_second': 6.602, 'train_steps_per_second': 0.826, 'total_flos': 2547441237995520.0, 'train_loss': 0.1413930638788557, 'epoch': 2.0})

#### Distilbert

In [24]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')



In [25]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [26]:
# Tokenize the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/4841 [00:00<?, ? examples/s]

Map:   0%|          | 0/1211 [00:00<?, ? examples/s]

In [27]:
# Set training and eval datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
eval_dataset = tokenized_datasets['test'].shuffle(seed=42)

In [28]:
# Instantiate distilbert model
model = AutoModelForSequenceClassification.from_pretrained('distilbert/distilbert-base-uncased', 
                                                           num_labels=2, 
                                                           id2label=id2label, 
                                                           label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
training_args = TrainingArguments(
    output_dir='finetuned_distilbert_model',
    num_train_epochs=2,
    
    # learning_rate=2e-5,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,
    
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=0.1,
    load_best_model_at_end=True,
    save_total_limit=1
)

In [30]:
# Set training and eval datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
eval_dataset = tokenized_datasets['test'].shuffle(seed=42)

In [31]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [32]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1039,0.064406,0.988439
2,0.0649,0.04169,0.992568


TrainOutput(global_step=1212, training_loss=0.12198464154410284, metrics={'train_runtime': 827.71, 'train_samples_per_second': 11.697, 'train_steps_per_second': 1.464, 'total_flos': 1282549353787392.0, 'train_loss': 0.12198464154410284, 'epoch': 2.0})

### Functions

#### clean text

In [33]:
def clean_text(doc):
    
    # normalize Text
    doc = doc.lower()

    # remove unnecessary whitespaces
    doc = re.sub('\s+', ' ', doc)
    doc = doc.strip()
    
    # remove html tags
    doc = re.sub('<.*?>', '', doc)
    
    # remove email addresses
    doc = re.sub(r'([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)', '', doc)
    
    # remove url
    doc = re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', doc)
    
    # remove accented characters
    doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # remove special symbols/punctuation
    doc = re.sub(r'[^\w ]+', '', doc)

    # remove stopwords
    doc = ' '.join([word for word in doc.split() if word not in english_stopwords])

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    doc = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(doc)])

    return doc

### Inference

In [57]:
df_infer = pd.read_csv('data/predicted doj articles.csv')
df_infer.drop(columns=['compliance_related'], inplace=True)
df_infer.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published
0,Attorney General Merrick B. Garland Statement ...,The Justice Department issued the following st...,https://www.justice.gov//opa/pr/attorney-gener...,2024-05-25
1,Doctor Convicted of $70M Medicare Fraud Scheme,A federal jury convicted a Texas doctor today ...,https://www.justice.gov//opa/pr/doctor-convict...,2024-05-24
2,Owner of Arkansas Tree Service Business Pleads...,An Arkansas man pleaded guilty to filing a fal...,https://www.justice.gov//opa/pr/owner-arkansas...,2024-05-24


In [58]:
# Combine title and summary
df_infer['cleaned_title_summary'] = df_infer['article_title'].astype(str) + " " + df_infer['article_summary'].astype(str)

# Function call to clean itle and summary text
df_infer['cleaned_title_summary'] = df_infer['cleaned_title_summary'].apply(clean_text)

df_infer.head(3)


Unnamed: 0,article_title,article_summary,article_url,date_published,cleaned_title_summary
0,Attorney General Merrick B. Garland Statement ...,The Justice Department issued the following st...,https://www.justice.gov//opa/pr/attorney-gener...,2024-05-25,attorney general merrick b garland statement e...
1,Doctor Convicted of $70M Medicare Fraud Scheme,A federal jury convicted a Texas doctor today ...,https://www.justice.gov//opa/pr/doctor-convict...,2024-05-24,doctor convicted 70m medicare fraud scheme fed...
2,Owner of Arkansas Tree Service Business Pleads...,An Arkansas man pleaded guilty to filing a fal...,https://www.justice.gov//opa/pr/owner-arkansas...,2024-05-24,owner arkansas tree service business pleads gu...


In [59]:
# Set title/summary column to list
titles_summaries = df_infer['cleaned_title_summary'].tolist()

In [60]:
# Cast list items to strings
texts = [str(summary) for summary in titles_summaries]

In [61]:
finetuned_model = 'finetuned_bert_model/'

#### via pipeline

In [62]:
from transformers import pipeline

classifier = pipeline('sentiment-analysis', model=finetuned_model)
classified_texts = classifier(texts)

# print(classified_texts[:3)
# print()
print('classified summaries done.')

classified summaries done.


#### via pure torch (replicated pipeline)

In [63]:
# Load pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained(finetuned_model)
 
# Load pretrained model
dilbert_finetuned_model = AutoModelForSequenceClassification.from_pretrained(finetuned_model)

# List to hold classifed text label booleans and score probabilties 
classifications = []

# Iterate title/summary texts
for text in texts:
   
    # inputs = tokenizer(text, return_tensors='pt')
    inputs = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # print(inputs)
    
    # with torch.no_grad():
    outputs = dilbert_finetuned_model(**inputs)
    # print(outputs)

    predicted_class_id = outputs.logits.argmax().item()
    # print(predicted_class_id)
    
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
    score = np.round(predictions[predicted_class_id].item(), 5)
    # print(score)

    label = dilbert_finetuned_model.config.id2label[predicted_class_id]
    # print(label)
    
    classification = {'label': label, 'score': score}
    classifications.append(classification)

print(f'Number of classifications: {len(classifications)}')

Number of classifications: 129


In [64]:
for classfication in classifications[:3]:
    print(classfication)

{'label': 'FALSE', 'score': 0.99806}
{'label': 'TRUE', 'score': 0.99928}
{'label': 'TRUE', 'score': 0.99924}


In [65]:
out_df = pd.DataFrame(classifications)
# type(classifications)

In [66]:
out_df.to_csv('data/preds.csv', index=False)

In [67]:
# df_infer.iloc[85]['article_summary']