Commit
Showing 6 changed files with 66,712 additions and 483 deletions.
@@ -1,4 +1,3 @@
pdfs
source
data
*.csv
@@ -0,0 +1,4 @@
ftf-all-filings.tsv is the crowdsourced data entered by volunteers in 2012.
The dc-slug can be used to get a URL for the original PDF; see download-pdfs.py

Originally from https://www.propublica.org/datastore/dataset/free-the-files-filing-data
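For illustration, a minimal sketch of how a dc-slug might be turned into a PDF URL, assuming DocumentCloud's usual asset-URL layout (both the pattern and the helper name are assumptions here; the canonical logic lives in download-pdfs.py):

# Hypothetical helper; assumes a slug of the form '<id>-<name>' and the
# asset pattern https://assets.documentcloud.org/documents/<id>/<name>.pdf
def pdf_url(dc_slug):
    doc_id, _, name = dc_slug.partition('-')
    return f'https://assets.documentcloud.org/documents/{doc_id}/{name}.pdf'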
Large diffs are not rendered by default.
@@ -0,0 +1,236 @@
from numpy import array
import keras as K
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.engine.input_layer import Input
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Lambda, Conv2DTranspose, concatenate
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.backend import expand_dims, squeeze
import tensorflow as tf
import pandas as pd
import numpy as np
import csv
import re
import pdfplumber
from decimal import Decimal

import wandb
from wandb.keras import WandbCallback

run = wandb.init()
config = run.config
config.epochs = 25

# Thanks, StackOverflow. This "undoes" a 1D convolution, by combining upsampling plus convolution.
def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same'):
    x = Lambda(lambda x: expand_dims(x, axis=2))(input_tensor)
    x = Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1), padding=padding)(x)
    x = Lambda(lambda x: squeeze(x, axis=2))(x)
    return x
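# Shape sketch (illustrative, not part of the original diff): an input of
# (batch, 256, 64) is expanded to (batch, 256, 1, 64), upsampled by the
# Conv2DTranspose with strides=(2, 1) to (batch, 512, 1, filters), then
# squeezed back to (batch, 512, filters).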

# Configuration
read_docs = 10000  # how many docs to load, at most
max_doc_length = 4096
vocab_size = 5000
target_thresh = 0.9
augment_dims = 4  # number of features per token, other than token type

wandb.log({'algorithm': 'U-net with position and dollar marker'})

# Generator that reads all our training data.
# For each document, yields a list of dicts, one per token.
def input_docs(max_docs=None):
    incsv = csv.DictReader(open('data/training.csv', mode='r'))

    # Reconstruct documents by concatenating all rows with the same slug
    active_slug = None
    doc_rows = []
    num_docs = 0

    for row in incsv:
        # throw out tokens that are too short; they won't help us
        token = row['token']
        if len(token) < 3:
            continue

        if row['slug'] != active_slug:
            if active_slug:
                yield doc_rows
                num_docs += 1
                if max_docs and num_docs >= max_docs:
                    return
            doc_rows = [row]
            active_slug = row['slug']
        else:
            doc_rows.append(row)

    yield doc_rows


# --- additional features ----
def is_dollar_amount(s):
    return re.search(r'\$?\d[\d,]+(\.\d\d)?', s) is not None
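# e.g. this matches '$1,234.56', '1,234' and '1234'; one- and two-character
# amounts like '$5' never reach it, since input_docs drops tokens shorter
# than 3 characters.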

def augment_row(row):
    return [float(row['page']),
            float(row['x0']),
            float(row['y0']),
            float(is_dollar_amount(row['token']))]
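# e.g. a hypothetical input row {'page': '0.5', 'x0': '72.0', 'y0': '540.3',
# 'token': '$12,340.00'} maps to the feature vector [0.5, 72.0, 540.3, 1.0]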

def pad_list(x, maxlen, padv):
    n = len(x)
    if n > maxlen:
        return x[:maxlen]
    elif n < maxlen:
        return x + [padv] * (maxlen - n)
    else:
        return x
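# e.g. pad_list([1, 2], 4, 0) returns [1, 2, 0, 0], while
# pad_list([1, 2, 3], 2, 0) truncates to [1, 2]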


# --- Create training data ---
print('Loading training data...')
docs = []
labels = []
augment = []
for docrows in input_docs(max_docs=read_docs):
    # reconstruct document text (will be tokenized again below, huh)
    docs.append(' '.join([row['token'] for row in docrows]))

    # threshold the fuzzy-match score against our target field to get binary labels
    labels.append([(0 if float(row['gross_amount']) < target_thresh else 1) for row in docrows])

    augment.append(pad_list([augment_row(row) for row in docrows],
                            max_doc_length,
                            [0] * augment_dims))

print(f'Loaded {len(docs)}')
max_length = max([len(x) for x in labels])
print(f'Max document size {max_length}')
avg_length = sum([len(x) for x in labels]) / len(labels)
print(f'Average document size {avg_length}')

# integer-encode the documents, pad/truncate to max_doc_length
encoded_docs = [one_hot(d, vocab_size) for d in docs]
x = pad_sequences(encoded_docs, maxlen=max_doc_length, dtype=np.float32, padding='post', truncating='post')
x = np.expand_dims(x, axis=2)
a = np.array(augment, dtype=np.float32)
x = np.concatenate([x, a], axis=2)
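# At this point x has shape (num_docs, max_doc_length, 1 + augment_dims):
# one word-id channel plus the four position/dollar features per token.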

# Pad/truncate labels to max_doc_length
y = pad_sequences(labels, maxlen=max_doc_length, padding='post', truncating='post')

# --- Specify network ---

# We use a U-net to handle long range dependencies between tokens
indata = Input((max_doc_length, augment_dims + 1))
tok_word = Lambda(lambda x: squeeze(K.backend.slice(x, (0, 0, 0), (-1, -1, 1)), axis=2))(indata)
tok_feature = Lambda(lambda x: K.backend.slice(x, (0, 0, 1), (-1, -1, -1)))(indata)
embed = Embedding(vocab_size, 32)(tok_word)
embed = concatenate([embed, tok_feature], axis=2)

c1 = Conv1D(filters=8, kernel_size=5, padding='same')(embed)  # 4096
p1 = MaxPooling1D()(c1)
c2 = Conv1D(filters=16, kernel_size=5, padding='same')(p1)  # 2048
p2 = MaxPooling1D()(c2)
c3 = Conv1D(filters=32, kernel_size=5, padding='same')(p2)  # 1024
p3 = MaxPooling1D()(c3)
c4 = Conv1D(filters=64, kernel_size=5, padding='same')(p3)  # 512
p4 = MaxPooling1D()(c4)  # 256

c5 = Conv1D(filters=64, kernel_size=5, padding='same')(p4)  # 256

c6 = Conv1DTranspose(c5, filters=64, kernel_size=5, padding='same')  # 512
u6 = concatenate([c4, c6], axis=2)  # 512 x 128

c7 = Conv1DTranspose(u6, filters=32, kernel_size=5, padding='same')  # 1024
u7 = concatenate([c3, c7], axis=2)  # 1024 x 64

c8 = Conv1DTranspose(u7, filters=16, kernel_size=5, padding='same')  # 2048
u8 = concatenate([c2, c8], axis=2)  # 2048 x 32

c9 = Conv1DTranspose(u8, filters=8, kernel_size=5, padding='same')  # 4096
u9 = concatenate([c1, c9], axis=2)  # 4096 x 16

# This last convolution produces the target token scores; a sigmoid keeps
# them in [0, 1] for the binary cross-entropy loss below
c10 = Conv1D(filters=1, kernel_size=10, padding='same', activation='sigmoid')(u9)  # 4096 x 1
f = Flatten()(c10)

model = Model(inputs=[indata], outputs=[f])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())


# --- Go! ----

model.fit(
    x=x,
    y=y,
    epochs=config.epochs,
    validation_split=0.2,
    callbacks=[WandbCallback()])

# --- Log output PDF images ---

# convert a single row of document data (one token) to the bbox format needed for drawing
def docrow_to_bbox(t):
    return [Decimal(t['x0']), Decimal(t['y0']), Decimal(t['x1']), Decimal(t['y1'])]
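# pdfplumber's draw_rect/draw_rects accept (x0, top, x1, bottom) boxes, so
# this assumes y0/y1 in the training data are top/bottom page coordinates.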

cnt = 0
for doc_idx, doc_rows in enumerate(input_docs(max_docs=read_docs)):
    slug = doc_rows[0]['slug']
    doc_rows = doc_rows[:max_doc_length]
    fname = 'pdfs/' + slug + '.pdf'
    try:
        pdf = pdfplumber.open(fname)
    except Exception:
        # If the file's not there, that's fine -- whatever PDFs are available locally define what we render
        continue

    print('Rendering output for ' + fname)

    # Get the correct answers: find the indices of the token(s) labelled 1
    target_idx = [idx for (idx, val) in enumerate(y[doc_idx]) if val == 1]

    z = np.array([x[doc_idx]])
    predict = model.predict(z)
    predict = predict.squeeze(axis=0)

    # print our best guess for each document
    answer_idx = np.argmax(y[doc_idx])
    print(f"Correct answer: {doc_rows[answer_idx]['token']} with score {y[doc_idx][answer_idx]}")
    output_idx = np.argmax(predict)
    print(f"Best output: {doc_rows[output_idx]['token']} with score {predict[output_idx]}")

    # Draw the machine output: get a score for each token
    page_images = []
    for pagenum, page in enumerate(pdf.pages):
        im = page.to_image(resolution=300)
        num_pages = len(pdf.pages)
        if num_pages > 1:
            current_page = pagenum / float(num_pages - 1)  # training data has 0..1 for page range
        else:
            current_page = 0.0
        current_page = str(current_page)

        # Draw target tokens
        target_toks = [doc_rows[i] for i in target_idx if doc_rows[i]['page'] == current_page]
        rects = [docrow_to_bbox(t) for t in target_toks]
        im.draw_rects(rects, stroke='blue', stroke_width=3, fill=None)

        # Draw guesses
        for idx, tok in enumerate(doc_rows):
            if predict[idx] > 0.1 and tok['page'] == current_page:
                c = int(255 * predict[idx])
                im.draw_rect(docrow_to_bbox(tok),
                             stroke=(255, 255 - c, 255 - c),
                             fill=None)

        page_images.append(wandb.Image(im.annotated, caption='page ' + str(pagenum)))

    wandb.log({slug: page_images})

    cnt += 1
    if cnt == 10:
        break