### Imports

In [1]:
import re
import unicodedata
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
english_stopwords = set(stopwords.words('english'))


import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import datasets
from datasets import Dataset

import torch

[nltk_data] Downloading package wordnet to /Users/wes/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/wes/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/wes/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Training data

#### Read data

In [2]:
data = pd.read_csv('data/doj_data.csv')
data.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...,False
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recover fraudulent transfer...,True
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secure agreement resolve se...,False


In [3]:
change_labels = lambda x: 1 if x == True else 0

data['label'] = data['label'].apply(change_labels)
data.head(3)

Unnamed: 0,article_title,article_summary,article_url,date_published,title_summary,cleaned_title_summary,label
0,Attorney General Merrick B. Garland Statement ...,"This afternoon, a Deputy U.S. Marshal and two ...",https://www.justice.gov//opa/pr/attorney-gener...,2024-04-29,Attorney General Merrick B. Garland Statement ...,attorney general merrick b garland statement s...,0
1,Justice Department Recovers Fraudulent Transfe...,The Justice Department announced today that it...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Recovers Fraudulent Transfe...,justice department recover fraudulent transfer...,1
2,Justice Department Secures Agreement to Resolv...,The Justice Department announced today that Ir...,https://www.justice.gov//opa/pr/justice-depart...,2024-04-29,Justice Department Secures Agreement to Resolv...,justice department secure agreement resolve se...,0


In [4]:
print(f'Dataframe records: {len(data)}')

Dataframe records: 6052


#### Train/Test split

In [5]:
use_cleaned = False

In [6]:
if use_cleaned:
    title_summary = data['cleaned_title_summary'].tolist()
else:
    title_summary = data['title_summary'].tolist()
    
# Create train/test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(title_summary,
                                                                      data['label'].tolist(),
                                                                      test_size=0.2,
                                                                      stratify=data['label'].tolist(),
                                                                      random_state=42)

#### Create DatasetDict

In [7]:
# Cast all items in list to string
train_texts = [str(element) for element in train_texts]
test_texts = [str(element) for element in test_texts]

In [8]:
# Set training and evaluation dataframes
train_df = pd.DataFrame({
    'label' : train_labels,
    'text' : train_texts
})

test_df = pd.DataFrame({
    'label' : test_labels,
    'text' : test_texts
})

In [9]:
# Create dataset(s) from dataframe(s)
train_data = Dataset.from_dict(train_df)
test_data = Dataset.from_dict(test_df)

# Create datasets dictionary
dataset_dict = datasets.DatasetDict({'train': train_data, 
                                     'test': test_data})

In [10]:
# Display dataset dictionary details
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 4841
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 1211
    })
})

### Finetune pretrained models

#### Setup

In [11]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer

import evaluate

In [12]:
accuracy = evaluate.load('accuracy')

In [13]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [14]:
id2label = {0: 'FALSE', 1: 'TRUE'}
label2id = {'FALSE': 0, 'TRUE': 1}

In [15]:
# Tokenizer function
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

#### Bert

##### Tokenizer

In [16]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')



In [17]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [18]:
# Tokenize the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/4841 [00:00<?, ? examples/s]

Map:   0%|          | 0/1211 [00:00<?, ? examples/s]

In [19]:
# Set training and eval datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
eval_dataset = tokenized_datasets['test'].shuffle(seed=42)

##### Model

In [20]:
checkpoint = 'bert-base-cased'

In [21]:
# Instantiate bert model
model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased', 
                                                           num_labels=2,
                                                           id2label=id2label, 
                                                           label2id=label2id
                                                          )

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Train

In [22]:
training_args = TrainingArguments(
    output_dir='finetuned_bert_model',
    num_train_epochs=2,
    
    # learning_rate=2e-5,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,

    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=0.1,
    load_best_model_at_end=True,
    save_total_limit=1
)

In [23]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.2378,0.150144,0.962015
2,0.0676,0.133477,0.971924


TrainOutput(global_step=1212, training_loss=0.2071151867164637, metrics={'train_runtime': 1464.3832, 'train_samples_per_second': 6.612, 'train_steps_per_second': 0.828, 'total_flos': 2547441237995520.0, 'train_loss': 0.2071151867164637, 'epoch': 2.0})

#### Distilbert

##### Tokenizer

In [25]:
tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased')



In [26]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [27]:
# Tokenize the dataset
tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map:   0%|          | 0/4841 [00:00<?, ? examples/s]

Map:   0%|          | 0/1211 [00:00<?, ? examples/s]

In [28]:
# Set training and eval datasets
train_dataset = tokenized_datasets['train'].shuffle(seed=42)
eval_dataset = tokenized_datasets['test'].shuffle(seed=42)

##### Model

In [29]:
checkpoint = 'distilbert/distilbert-base-uncased'

In [30]:
# Instantiate distilbert model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, 
                                                           num_labels=2, 
                                                           id2label=id2label, 
                                                           label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


##### Train

In [31]:
training_args = TrainingArguments(
    output_dir='finetuned_distilbert_model',
    num_train_epochs=2,
    
    # learning_rate=2e-5,
    # per_device_train_batch_size=16,
    # per_device_eval_batch_size=16,
    
    weight_decay=0.01,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=0.1,
    load_best_model_at_end=True,
    save_total_limit=1
)

In [32]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [33]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1437,0.113251,0.977704
2,0.0475,0.07411,0.98431


TrainOutput(global_step=1212, training_loss=0.12165717165855684, metrics={'train_runtime': 812.9891, 'train_samples_per_second': 11.909, 'train_steps_per_second': 1.491, 'total_flos': 1282549353787392.0, 'train_loss': 0.12165717165855684, 'epoch': 2.0})