
Commit

Directory structure cleanups
jstray committed Jun 13, 2019
1 parent 7c0cfe7 commit fa2f8f8
Showing 6 changed files with 66,712 additions and 483 deletions.
1 change: 0 additions & 1 deletion .gitignore
@@ -1,4 +1,3 @@
pdfs
source
data
*.csv
4 changes: 4 additions & 0 deletions source/README.md
@@ -0,0 +1,4 @@
ftf-all-filings.tsv is the crowdsourced data entered by volunteers in 2012.
The dc-slug can be used to get a URL for the original PDF; see download-pdfs.py

Originally from https://www.propublica.org/datastore/dataset/free-the-files-filing-data
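
A minimal sketch of how a dc-slug might be turned into a PDF URL when reading this TSV; the dc_slug column name and the DocumentCloud-style URL template are assumptions, and download-pdfs.py remains the authoritative version:

# Sketch only: the dc_slug column name and URL template are assumptions;
# see download-pdfs.py for the actual download logic.
import csv

def pdf_url(dc_slug):
    # Assumed DocumentCloud asset layout
    return 'https://assets.documentcloud.org/documents/' + dc_slug + '.pdf'

with open('source/ftf-all-filings.tsv') as tsvfile:
    for row in csv.DictReader(tsvfile, delimiter='\t'):
        print(pdf_url(row['dc_slug']))
        break  # just show the first filing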
66,226 changes: 66,226 additions & 0 deletions source/ftf-all-filings.tsv


236 changes: 236 additions & 0 deletions train-unet.py
@@ -0,0 +1,236 @@
from numpy import array
import keras as K
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.engine.input_layer import Input
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Lambda, Conv2DTranspose, concatenate
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.backend import expand_dims, squeeze
import tensorflow as tf
import pandas as pd
import numpy as np
import csv
import re
import pdfplumber
from decimal import Decimal

import wandb
from wandb.keras import WandbCallback

run = wandb.init()
config = run.config
config.epochs = 25

# Thanks, StackOverflow. This "undoes" a 1D convolution by combining upsampling with a convolution.
def Conv1DTranspose(input_tensor, filters, kernel_size, strides=2, padding='same'):
    x = Lambda(lambda x: expand_dims(x, axis=2))(input_tensor)
    x = Conv2DTranspose(filters=filters, kernel_size=(kernel_size, 1), strides=(strides, 1), padding=padding)(x)
    x = Lambda(lambda x: squeeze(x, axis=2))(x)
    return x

# Configuration
read_docs = 10000 # how many docs to load, at most
max_doc_length = 4096
vocab_size = 5000
target_thresh = 0.9
augment_dims = 4 # number of features per token, other than token type

wandb.log({'algorithm':'U-net with position and dollar marker'})

# Generator that reads all our training data
# For each document, yields an array of dictionaries, each of which is a token
def input_docs(max_docs=None):
    incsv = csv.DictReader(open('data/training.csv', mode='r'))

    # Reconstruct documents by concatenating all rows with the same slug
    active_slug = None
    doc_rows = []
    num_docs = 0

    for row in incsv:
        # throw out tokens that are too short, they won't help us
        token = row['token']
        if len(token) < 3:
            continue

        if row['slug'] != active_slug:
            if active_slug:
                yield doc_rows
                num_docs += 1
                if max_docs and num_docs >= max_docs:
                    return
            doc_rows = [row]
            active_slug = row['slug']
        else:
            doc_rows.append(row)

    yield doc_rows


# --- additional features ----
def is_dollar_amount(s):
    return re.search(r'\$?\d[\d,]+(\.\d\d)?', s) is not None

def augment_row(row):
    return [float(row['page']),
            float(row['x0']),
            float(row['y0']),
            float(is_dollar_amount(row['token']))]

def pad_list(x, maxlen, padv):
    n = len(x)
    if n > maxlen:
        return x[:maxlen]
    elif n < maxlen:
        return x + [padv]*(maxlen - n)
    else:
        return x


# --- Create training data ---
print('Loading training data...')
docs = []
labels = []
augment = []
for docrows in input_docs(max_docs=read_docs):
    # reconstruct document text (will be tokenized again below, huh)
    docs.append(' '.join([row['token'] for row in docrows]))

    # threshold fuzzy matching score with our target field, to get binary labels
    labels.append([(0 if float(row['gross_amount']) < target_thresh else 1) for row in docrows])

    augment.append(pad_list([augment_row(row) for row in docrows],
                            max_doc_length,
                            [0]*augment_dims))

print(f'Loaded {len(docs)}')
max_length = max([len(x) for x in labels])
print(f'Max document size {max_length}')
avg_length = sum([len(x) for x in labels])/len(labels)
print(f'Average document size {avg_length}')

# integer encode the documents, truncate to max_doc_length
encoded_docs = [one_hot(d, vocab_size) for d in docs]
x = pad_sequences(encoded_docs, maxlen=max_doc_length, dtype=np.float32, padding='post', truncating='post')
x = np.expand_dims(x, axis=2)
a = np.array(augment, dtype=np.float32)
x = np.concatenate([x, a], axis=2)

# Truncate to max_doc_length
y = pad_sequences(labels, maxlen=max_doc_length, padding='post', truncating='post')

# --- Specify network ---

# We use a U-net to handle long range dependencies between tokens
indata = Input((max_doc_length, augment_dims+1))
tok_word = Lambda( lambda x: squeeze(K.backend.slice(x, (0,0,0), (-1,-1,1)),axis=2))(indata)
tok_feature = Lambda( lambda x: K.backend.slice(x, (0,0,1), (-1,-1,-1)))(indata)
embed = Embedding(vocab_size, 32)(tok_word)
embed = concatenate([embed, tok_feature], axis=2)

c1 = Conv1D(filters=8, kernel_size=5, padding='same')(embed) # 4096
p1 = MaxPooling1D()(c1)
c2 = Conv1D(filters=16, kernel_size=5, padding='same')(p1) # 2048
p2 = MaxPooling1D()(c2)
c3 = Conv1D(filters=32, kernel_size=5, padding='same')(p2) # 1024
p3 = MaxPooling1D()(c3)
c4 = Conv1D(filters=64, kernel_size=5, padding='same')(p3) # 512
p4 = MaxPooling1D()(c4) # 256

c5 = Conv1D(filters=64, kernel_size=5, padding='same')(p4) # 256

c6 = Conv1DTranspose(c5, filters=64, kernel_size=5, padding='same') # 512
u6 = concatenate([c4,c6], axis=2) # 512 x 128

c7 = Conv1DTranspose(u6, filters=32, kernel_size=5, padding='same') # 1024
u7 = concatenate([c3,c7], axis=2) # 1024 x 64

c8 = Conv1DTranspose(u7, filters=16, kernel_size=5, padding='same') # 2048
u8 = concatenate([c2,c8], axis=2) # 2048 x 32

c9 = Conv1DTranspose(u8, filters=8, kernel_size=5, padding='same') # 4096
u9 = concatenate([c1,c9], axis=2) # 4096 x 16

# This last convolution produces the target token scores
c10 = Conv1D(filters=1, kernel_size=10, padding='same', activation='relu')(c9) # 4096 x 1
f = Flatten()(c10)

model = Model(inputs=[indata], outputs=[f])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())


# --- Go! ----

model.fit(
    x=x,
    y=y,
    epochs=10,
    validation_split=0.2,
    callbacks=[WandbCallback()])

# --- Log output PDF images ---

# convert a single row of document data (one token) to bbox format needed for drawing
def docrow_to_bbox(t):
    return [Decimal(t['x0']), Decimal(t['y0']), Decimal(t['x1']), Decimal(t['y1'])]

cnt = 0
for doc_idx, doc_rows in enumerate(input_docs(max_docs=read_docs)):
    slug = doc_rows[0]['slug']
    doc_rows = doc_rows[:max_doc_length]
    fname = 'pdfs/' + slug + '.pdf'
    try:
        pdf = pdfplumber.open(fname)
    except Exception as e:
        # If the file's not there, that's fine -- we only render output for the PDFs we have
        continue

    print('Rendering output for ' + fname)

    # Get the correct answers: find the indices of the token(s) labelled 1
    target_idx = [idx for (idx, val) in enumerate(y[doc_idx]) if val == 1]

    z = np.array([x[doc_idx]])
    predict = model.predict(z)
    predict = predict.squeeze(axis=0)

    # print the correct answer and our best guess for each document
    answer_idx = np.argmax(y[doc_idx])
    print(f"Correct answer: {doc_rows[answer_idx]['token']} with score {y[doc_idx][answer_idx]}")
    output_idx = np.argmax(predict)
    print(f"Best output: {doc_rows[output_idx]['token']} with score {predict[output_idx]}")

    # Draw the machine output: get a score for each token
    page_images = []
    for pagenum, page in enumerate(pdf.pages):
        im = page.to_image(resolution=300)
        num_pages = len(pdf.pages)
        if num_pages > 1:
            current_page = pagenum/float(num_pages - 1)  # training data has 0..1 for page range
        else:
            current_page = 0.0
        current_page = str(current_page)

        # Draw target tokens
        target_toks = [doc_rows[i] for i in target_idx if doc_rows[i]['page'] == current_page]
        rects = [docrow_to_bbox(t) for t in target_toks]
        im.draw_rects(rects, stroke='blue', stroke_width=3, fill=None)

        # Draw guesses
        for idx, tok in enumerate(doc_rows):
            if predict[idx] > 0.1 and tok['page'] == current_page:
                c = int(255*predict[idx])
                im.draw_rect(docrow_to_bbox(tok),
                             stroke=(255, 255-c, 255-c),
                             fill=None)

        page_images.append(wandb.Image(im.annotated, caption='page ' + str(pagenum)))

    wandb.log({slug: page_images})

    cnt += 1
    if cnt == 10:
        break
