# Extracting Text, Tables, and Images from a PDF

Credits
- http://www.unixuser.org/~euske/python/pdfminer/programming.html
- https://github.com/dpapathanasiou/pdfminer-layout-scanner/blob/master/layout_scanner.py

**Requires** PDFMiner (https://github.com/euske/pdfminer)

**NOTE**

Because of a problem in the current PDFMiner release, after installing PDFMiner you have to roll it back to an earlier version with:

`pip install --upgrade --ignore-installed slate==0.3 pdfminer==20131113`

## Extract Text from a PDF Using PDFMiner

In [1]:
# System imports
import sys
import os
from binascii import b2a_hex

In [9]:
# PDFMiner imports
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument, PDFNoOutlines
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure, LTImage, LTChar
from pdfminer.pdfdevice import PDFDevice
# For conversion to html
from pdfminer.converter import HTMLConverter
from pdfminer.converter import TextConverter
from cStringIO import StringIO
import re
import csv
import time

In [3]:
# Directory for the extracted image files.
# Built with os.path.join so the path separator is right on every platform.
image_dir = os.path.join(os.getcwd(), 'PDF-Images')

In [57]:
# Build the PDFMiner layout-analysis pipeline at module level and run it over
# every page. The names created here (laparams, rsrcmgr, device, interpreter,
# layout) are globals that later cells reuse.
# NOTE(review): `document` is not defined in any cell visible in this notebook;
# presumably it is an already opened PDFDocument -- confirm before re-running.

# Set parameters for analysis.
laparams = LAParams()
# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()
# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# The interpreter processes page contents and hands the result to the device.
interpreter = PDFPageInterpreter(rsrcmgr, device)
for page in PDFPage.create_pages(document):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    # (rebound on every iteration, so only the last page's layout survives the loop)
    layout = device.get_result()

In [4]:
# See http://denis.papathanasiou.org/posts/2010.08.04.post.html for code and explanation

# Highest level utility function -- pass in other functions as the second argument
def with_pdf (pdf_doc, fn, pdf_pwd, *args):
    """Open the pdf document, and apply the function, returning the results

    pdf_doc -- path of the PDF file to open
    fn      -- callable invoked as fn(doc, *args) with the initialized PDFDocument
    pdf_pwd -- password used to initialize the document ('' if there is none)
    args    -- extra positional arguments forwarded to fn

    Returns whatever fn returns, or None when an IOError occurs (for example
    the file does not exist) or the document is not extractable."""
    result = None
    try:
        # the with-block closes the file even when parsing or fn raises,
        # so the handle can no longer leak on an error path
        with open(pdf_doc, 'rb') as fp:
            # create a parser object associated with the file object
            parser = PDFParser(fp)
            # create a PDFDocument object that stores the document structure
            doc = PDFDocument(parser)
            # connect the parser and document objects
            parser.set_document(doc)
            # supply the password for initialization
            doc.initialize(pdf_pwd)

            if doc.is_extractable:
                # apply the function and return the result
                result = fn(doc, *args)
    except IOError:
        # the file doesn't exist or similar problem: deliberately best-effort,
        # the caller simply gets None back
        pass
    return result


### 
### Table of Contents
### 

def _parse_toc (doc):
    """With an open PDFDocument object, get the table of contents (toc) data
    [this is a higher-order function to be passed to with_pdf()]"""
    toc = []
    try:
        outlines = doc.get_outlines()
        for (level,title,dest,a,se) in outlines:
            toc.append( (level, title) )
    except PDFNoOutlines:
        pass
    return toc

def get_toc (pdf_doc, pdf_pwd=''):
    """Open the pdf file at pdf_doc (using pdf_pwd as its password) and return
    its table of contents as produced by _parse_toc, via with_pdf()"""
    toc = with_pdf(pdf_doc, _parse_toc, pdf_pwd)
    return toc

###
### Extracting Images
###

def write_file (folder, filename, filedata, flags='w'):
    """Write the file data to the folder and filename combination
    (flags: 'w' for write text, 'wb' for write binary, use 'a' instead of 'w' for append)

    Returns True when the data was written, False when the folder does not
    exist or an IOError occurred while opening or writing the file."""
    result = False
    if os.path.isdir(folder):
        try:
            # the with-block guarantees the handle is closed even if write() fails
            with open(os.path.join(folder, filename), flags) as file_obj:
                file_obj.write(filedata)
            result = True
        except IOError:
            # best-effort: the failure is reported through the False return value
            pass
    return result

def determine_image_type (stream_first_4_bytes):
    """Find out the image file type based on the magic number comparison of the first 4 (or 2) bytes

    stream_first_4_bytes -- the leading bytes of the image stream

    Returns '.jpeg', '.png', '.gif' or '.bmp', or None when the magic number
    is not recognised (including empty input)."""
    # b2a_hex returns bytes on Python 3; decode so the text comparisons below
    # behave the same on Python 2 and Python 3
    bytes_as_hex = b2a_hex(stream_first_4_bytes).decode('ascii')
    # (hex prefix, extension); prefix matching also tolerates callers that
    # pass more than the first four bytes
    known_signatures = (
        ('ffd8', '.jpeg'),
        ('89504e47', '.png'),
        ('47494638', '.gif'),
        ('424d', '.bmp'),
    )
    for hex_prefix, file_type in known_signatures:
        if bytes_as_hex.startswith(hex_prefix):
            return file_type
    return None

def save_image (lt_image, page_number, images_folder):
    """Write the raw stream of an LTImage into images_folder.

    The file is named '<page_number>_<image name><extension>', the extension
    being derived from the first four bytes of the stream. Returns the file
    name on success and None otherwise."""
    if not lt_image.stream:
        return None

    raw_bytes = lt_image.stream.get_rawdata()
    if not raw_bytes:
        return None

    extension = determine_image_type(raw_bytes[0:4])
    if not extension:
        # unknown magic number, nothing is written
        return None

    target_name = str(page_number) + '_' + lt_image.name + extension
    if write_file(images_folder, target_name, raw_bytes, flags='wb'):
        return target_name
    return None


###
### Extracting Text
###

def to_bytestring (s, enc='utf-8'):
    """Convert the given unicode string to a bytestring, using the standard encoding,
    unless it's already a bytestring

    s   -- the text to convert (may be None or empty)
    enc -- encoding applied to values that still need encoding

    Native strings and bytes are returned unchanged; anything else is encoded
    with enc. Empty or None input yields '' rather than None so the result can
    always be passed to str.join()."""
    if not s:
        # previously fell through and returned None, which broke ''.join(...) later
        return ''
    if isinstance(s, (bytes, str)):
        # already a byte/native string (bytes is an alias of str on Python 2)
        return s
    return s.encode(enc)

def update_page_text_hash (h, lt_obj, pct=0.2):
    """Group the text of lt_obj into h by horizontal extent.

    h maps (x0, x1) keys to lists of text. The object's text is appended to
    every existing entry whose x0 and x1 both lie within pct of the object's
    own bbox x0/x1; if no entry matches, a new one keyed by the object's
    (x0, x1) is created. The (mutated) hash is returned."""
    left = lt_obj.bbox[0]
    right = lt_obj.bbox[2]
    shrink = 1.0 - pct
    grow = 1.0 + pct

    matched = False
    for span, texts in h.items():
        ref_left = span[0]
        if not (ref_left * shrink <= left <= ref_left * grow):
            continue
        ref_right = span[1]
        if not (ref_right * shrink <= right <= ref_right * grow):
            continue
        # same horizontal extent as an earlier series of text, so it belongs
        # together with it; the list is mutated in place
        texts.append(to_bytestring(lt_obj.get_text()))
        matched = True

    if not matched:
        # based on its width this text starts a new series of its own
        h[(left, right)] = [to_bytestring(lt_obj.get_text())]

    return h

def parse_lt_objs (lt_objs, page_number, images_folder, text=None):
    """Iterate through the list of LT* objects and capture the text or image data contained in each

    lt_objs       -- iterable of LT* layout objects (an LTPage or an LTFigure)
    page_number   -- page number used when naming saved image files
    images_folder -- folder the extracted images are written to
    text          -- unused; kept only so existing callers keep working
                     (was a mutable default, now None)

    Returns one string: image markers and nested-figure content in the order
    encountered, followed by the text columns, all joined with newlines."""
    text_content = []

    page_text = {} # k=(x0, x1) of the bbox, v=list of text strings within that bbox width (physical column)
    for lt_obj in lt_objs:
        if isinstance(lt_obj, (LTTextBox, LTTextLine)):
            # text, so arrange it logically based on its column width
            page_text = update_page_text_hash(page_text, lt_obj)
        elif isinstance(lt_obj, LTImage):
            # an image, so save it to the designated folder, and note its place in the text
            saved_file = save_image(lt_obj, page_number, images_folder)
            if saved_file:
                # use html style <img /> tag to mark the position of the image within the text
                text_content.append('<img src="'+os.path.join(images_folder, saved_file)+'" />')
            else:
                # %r logs the object's repr; the old code logged the bound
                # __repr__ method itself. sys.stderr.write works on Python 2 and 3.
                sys.stderr.write("error saving image on page %s %r\n" % (page_number, lt_obj))
        elif isinstance(lt_obj, LTFigure):
            # LTFigure objects are containers for other LT* objects, so recurse through the children
            text_content.append(parse_lt_objs(lt_obj, page_number, images_folder))

    # emit the columns ordered by their (x0, x1) key, i.e. left to right;
    # strings inside a column stay in the order they were encountered
    for column_key in sorted(page_text):
        text_content.append(''.join(page_text[column_key]))

    return '\n'.join(text_content)


###
### Processing Pages
###

def _parse_pages (doc, images_folder):
    """Run layout analysis over every page of an open PDFDocument and return
    the list of per-page strings produced by parse_lt_objs()
    [higher-order helper meant to be handed to with_pdf()]"""
    resource_manager = PDFResourceManager()
    aggregator = PDFPageAggregator(resource_manager, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, aggregator)

    def _extract(page_number, page):
        # process the page, then fetch the LTPage object the aggregator built;
        # it may contain child objects like LTTextBox, LTFigure, LTImage, etc.
        page_interpreter.process_page(page)
        return parse_lt_objs(aggregator.get_result(), page_number, images_folder)

    # pages are numbered from 1
    return [_extract(number, page)
            for number, page in enumerate(PDFPage.create_pages(doc), 1)]

def get_pages (pdf_doc, pdf_pwd='', images_folder='/tmp'):
    """Open the pdf file at pdf_doc (password pdf_pwd) and return the result of
    _parse_pages: a list of strings, one per page, with any extracted images
    saved under images_folder"""
    pages = with_pdf(pdf_doc, _parse_pages, pdf_pwd, images_folder)
    return pages

In [13]:
# Code for HTML extraction
# From https://gist.github.com/zross/10298077
def convert_pdf_to_html(path):
    """Convert every page of the PDF at path to HTML with PDFMiner's HTMLConverter.

    path -- path of the PDF file

    Returns the generated HTML as a single string (utf-8 codec). The file and
    the converter device are closed even when a page fails to process."""
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = HTMLConverter(rsrcmgr, retstr, codec='utf-8', laparams=LAParams())
    try:
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # open() replaces the Python 2-only file() builtin
        with open(path, 'rb') as fp:
            # an empty page-number set and maxpages=0 select all pages
            for page in PDFPage.get_pages(fp, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
    finally:
        # the device must be closed before the buffer is read
        device.close()
    # named `html` so the builtin `str` is no longer shadowed
    html = retstr.getvalue()
    retstr.close()

    return html

## Choose the PDF file

In [5]:
# PDF files live in the ./PDFs directory
# list the files in that directory
import glob
print(glob.glob('./PDFs/*.pdf'))

['./PDFs/4612801 Supplier Spec.pdf', './PDFs/Aquamin Spec Sheet.pdf', './PDFs/Banko-and-Brill-Scaling-to-Large-Corpora.pdf', './PDFs/Booz Allen Field Guide to Data Science 2015.pdf', './PDFs/Building Machines that Think Like Humans - Survey of AI.pdf', './PDFs/Cognitive Reflection Test - Shane Frederick.pdf', './PDFs/data.pdf', './PDFs/OG GUAR 3500 F-D [PDS].pdf', './PDFs/the-new-artificial-intelligence-market.pdf', './PDFs/Unskilled and Unaware - Kruger and Dunning.pdf']


In [58]:
# Get the first file from the list
# PDF files used in the experiments below, all located in ./PDFs.
# The trailing '' makes os.path.join end the directory with a path separator,
# so the file names can simply be appended.
file_path = os.path.join(os.getcwd(), 'PDFs', '')
file_name1 = file_path + 'OG GUAR 3500 F-D [PDS].pdf'
file_name2 = file_path + '4612801 Supplier Spec.pdf'
file_name3 = file_path + 'Aquamin Spec Sheet.pdf'
file_name4 = file_path + 'TruMarine Spec Sheet.pdf'
file_name5 = file_path + 'Spec Sheet (4612800).pdf'
file_name6 = file_path + 'B6_Pyridoxine Hydrochloride_Specs and Testing.pdf'
file_name7 = file_path + 'B3_Niacinamide_Specs and Testing.pdf'


file_name8 = file_path + 'Cognitive Reflection Test - Shane Frederick.pdf'

## Test PDFMiner

In [69]:
# Get the table of contents of the selected PDF
get_toc(file_name5)

[]

In [70]:
# images_folder must be passed by keyword: the second positional parameter of
# get_pages() is pdf_pwd, so a positional image_dir would be used as the password
get_pages(file_name5, images_folder=image_dir)

error saving image on page 1 <bound method LTImage.__repr__ of <LTImage(img0) 270.000,720.000,336.000,780.000 (192, 169)>>


['\nPrinted:\n\n5/30/2013\n\nTests\nAPPEARANCE\nPARTICLE SIZE\nPARTICLE SIZE\nBULK DENSITY\nIDENTIFICATION\nIDENTIFICATION 2\nIDENTIFICATION 3\nSPECIFIC ROTATION\nSPECIFIC ROTATION\nSPECIFIC ROTATION\nLOSS ON DRYING\nRESIDUE ON IGNITION\nHEAVY METALS\nLEAD\nARSENIC\nCADMIUM\nLIMIT OF LUMIFLAVIN\nRESIDUAL SOLVENTS <467>\nASSAY (ON DRIED BASIS)\nMICROBIOLOGICAL LIMITS\nTOTAL PLATE COUNT\nMOLD & YEAST\nCOLIFORMS\nE. COLI (MPN/G)\nSALMONELLA (/25G)\nPSEUDOMONAS\nS. AUREUS\nTAPPED DENSITY\n\n"Sourcing & Supplying Quality Products Worldwide"\nTECHNICAL DATA SHEET\nRIBOFLAVIN USP (VITAMIN B-2)   \nAceto Product Code#:4612800\n\nAceto Corporation\n4 Tri Harbor Court\nPort Washington, NY 11050\nPhone: (516) 627-6000\nFax: (516) 627-6093\nWebsite: www.aceto.com\nTECHNICAL DATA SHEET\nRIBOFLAVIN USP (VITAMIN B-2)   \nAceto Product Code#:4612800\nCAS#:83-88-5\n\nSpecification\nORANGE YELLOW CRYSTALLINE PDR\nNLT 90% THROUGH AN 100 MESH\n100%  THROUGH AN 80 MESH\nCa 250 - 300 G/L\nPASS\nPASS (JP)\nP

## Test PDFMiner HTML Conversion

In [71]:
convert_pdf_to_html(file_name5)

'<html><head>\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n</head><body>\n<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>\n<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>\n<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:259px; top:120px; width:86px; height:31px;"><span style="font-family: AAAAAA+Arial; font-size:14px">Aceto Corporation\n<br></span><span style="font-family: AAAAAA+Arial; font-size:13px">4 Tri Harbor Court\n<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:241px; top:151px; width:122px; height:13px;"><span style="font-family: AAAAAA+Arial; font-size:13px">Port Washington, NY 11050\n<br></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:252px; top:164px; width:101px; height:25px;"><span style="font-family: AAAAAA+Arial; font-size:13px

### Summary

PDFMiner is quirky to install, but once set up it does extract the table of contents and the text from a PDF. The text is raw and needs further processing with regular expressions to pull out the required information.

I have not yet experimented with PDFMiner's layout-parsing capability; that is the next thing to explore.

## Test TabulaPy for Extracting Tables from PDFs

In [62]:
# Install TabulaPy (https://github.com/chezou/tabula-py)
from tabula import read_pdf

In [72]:
read_pdf(file_name5)

TabulaPy works well when the tables are explicit in the PDF, and not so well otherwise. Even when the tables are clearly laid out in the PDF, only some of them get extracted.

## Try pdftables

pdftables requires poppler and cairo -- packages that I haven't been able to install correctly on my system yet.

In [7]:
fileobj = open(file_name, 'rb')

In [8]:
from pdftables.pdf_document import PDFDocument
doc = PDFDocument.from_fileobj(fileobj)

ImportError: No module named pdftables.pdf_document

## Extract Tables Using BeautifulSoup

In [53]:
# From http://srome.github.io/Parsing-HTML-Tables-in-Python-with-BeautifulSoup-and-pandas/
import requests
import pandas as pd
from bs4 import BeautifulSoup
    
class HTMLTableParser:
    """Turn HTML <table> elements into pandas DataFrames."""

    def parse_url(self, url):
        """Download url and parse every <table> found in the page.

        Returns a list of (table id, DataFrame) tuples; the id is None for a
        table that has no id attribute."""
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        # table.get('id') instead of table['id']: the latter raises KeyError
        # for any table without an id attribute
        return [(table.get('id'), self.parse_html_table(table))
                for table in soup.find_all('table')]

    def parse_html_table(self, table):
        """Convert one parsed <table> element into a DataFrame.

        Rows containing <td> cells become data rows; the first row containing
        <th> cells supplies the column names (otherwise columns are numbered
        from 0). Columns whose values all convert to float are converted.

        Raises Exception when column titles were found but their count differs
        from the number of cells in the first data row."""
        n_columns = 0
        n_rows = 0
        column_names = []

        # First pass: count the data rows, take the column count from the
        # first data row, and pick up the column titles if there are any
        for row in table.find_all('tr'):
            td_tags = row.find_all('td')
            if len(td_tags) > 0:
                n_rows += 1
                if n_columns == 0:
                    # Set the number of columns for our table
                    n_columns = len(td_tags)

            # Handle column names if we find them (first header row only)
            th_tags = row.find_all('th')
            if len(th_tags) > 0 and len(column_names) == 0:
                for th in th_tags:
                    column_names.append(th.get_text())

        # Safeguard on Column Titles
        if len(column_names) > 0 and len(column_names) != n_columns:
            raise Exception("Column titles do not match the number of columns")

        columns = column_names if len(column_names) > 0 else range(0, n_columns)
        df = pd.DataFrame(columns=columns,
                          index=range(0, n_rows))

        # Second pass: copy the cell text into the pre-sized frame
        row_marker = 0
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            for column_marker, cell in enumerate(cells):
                df.iat[row_marker, column_marker] = cell.get_text()
            if len(cells) > 0:
                # only rows that hold data cells occupy a row in the frame
                row_marker += 1

        # Convert to float if possible
        for col in df:
            try:
                df[col] = df[col].astype(float)
            except ValueError:
                # non-numeric column: leave the text as it is
                pass

        return df

In [74]:
soup = BeautifulSoup(convert_pdf_to_html(file_name5), 'lxml')
soup

<html><head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
</head><body>
<span style="position:absolute; border: gray 1px solid; left:0px; top:50px; width:612px; height:792px;"></span>
<div style="position:absolute; top:50px;"><a name="1">Page 1</a></div>
<div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:259px; top:120px; width:86px; height:31px;"><span style="font-family: AAAAAA+Arial; font-size:14px">Aceto Corporation
<br/></span><span style="font-family: AAAAAA+Arial; font-size:13px">4 Tri Harbor Court
<br/></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:241px; top:151px; width:122px; height:13px;"><span style="font-family: AAAAAA+Arial; font-size:13px">Port Washington, NY 11050
<br/></span></div><div style="position:absolute; border: textbox 1px solid; writing-mode:lr-tb; left:252px; top:164px; width:101px; height:25px;"><span style="font-family: AAAAAA+Arial; font-size:13px">Pho

In [66]:
tables = soup.find_all('table')[0]

IndexError: list index out of range

In [65]:
tables

[]

In [68]:
# From https://github.com/liberit/scraptils/blob/master/scraptils/tools/pdf2csv.py

# converts a pdf into a csv file

#from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.layout import LAParams, LTRect
from pdfminer.converter import PDFPageAggregator
from itertools import islice
import sys, csv, cStringIO, codecs
from pbs import pdftotext

class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    src: http://docs.python.org/library/csv.html#writer-objects

    Each row is first written as UTF-8 into an in-memory queue, then decoded
    and re-encoded into the target encoding before it reaches the stream.
    NOTE: relies on Python 2 names (cStringIO, basestring, str.decode).
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        """Wrap the stream f; dialect and kwds are forwarded to csv.writer,
        encoding selects the encoding of the data written to f."""
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # incremental encoder instance for the target encoding
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; string cells are encoded to UTF-8 first, all other
        values are handed to csv.writer unchanged."""
        self.writer.writerow([s.encode("utf-8") if isinstance(s, basestring) else s
                              for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of the iterable rows via writerow()."""
        for row in rows:
            self.writerow(row)


def pdf2csv(pdf):
    """Write the table cells of the PDF at path pdf to stdout as CSV rows.

    For every page, the x and y coordinates of all LTRect objects are taken as
    grid lines; each grid cell wider and taller than 10 units is extracted
    with get_region() and its whitespace is collapsed. Pages without any
    rectangle produce no output.

    Uses the same PDFMiner calls as with_pdf() and the working cells of this
    notebook (PDFDocument(parser), PDFPage.create_pages) instead of the older
    PDFDocument() / set_parser() / doc.get_pages() API."""
    # the with-block closes the file even if a page fails to process
    with open(pdf, 'rb') as fp:
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        parser.set_document(doc)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        doc.initialize('')
        rsrcmgr = PDFResourceManager()
        # Set parameters for analysis.
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        writer = UnicodeWriter(sys.stdout)
        for pageno, page in enumerate(PDFPage.create_pages(doc)):
            interpreter.process_page(page)
            layout = device.get_result()

            # collect the rectangle edges: x positions and top-down y positions
            x_edges = []
            y_edges = []
            for item in layout:
                if not isinstance(item, LTRect):
                    continue
                x_edges.append(int(item.x0))
                x_edges.append(int(item.x1))
                y_edges.append(int(layout.height - item.y0))
                y_edges.append(int(layout.height - item.y1))
            if not x_edges:
                # no rectangles on this page, hence no grid to walk
                # (filterclose() used to fail on the empty list here)
                continue
            x_edges = filterclose(sorted(set(x_edges)))
            y_edges = filterclose(sorted(set(y_edges)))

            # walk the grid row by row, skipping bands that are 10 units or less
            for top, bottom in zip(y_edges, y_edges[1:]):
                if bottom - top <= 10:
                    continue
                row = []
                for left, right in zip(x_edges, x_edges[1:]):
                    if right - left <= 10:
                        continue
                    # shrink the cell by one unit on each side horizontally and
                    # collapse all whitespace in the extracted text
                    row.append(' '.join(get_region(pdf,
                                                   pageno+1,
                                                   left+1,
                                                   top,
                                                   right-1,
                                                   bottom).split()))
                writer.writerow(row)

def filterclose(lst):
    """Drop values that lie within 2 of the previously kept value.

    lst -- sequence of numbers in ascending order

    Returns a new list that starts with lst[0] and keeps each later element
    only if it exceeds the last kept element by more than 2. An empty input
    returns an empty list (it used to raise IndexError)."""
    if not lst:
        return []
    kept = [lst[0]]
    for elem in lst[1:]:
        if elem - 2 > kept[-1]:
            kept.append(elem)
    return kept

def get_region(pdf, page, x1,y1,x2,y2):
    """Return the output of pdftotext for one rectangular region of one page.

    The region starts at (x1, y1) and spans abs(x2-x1) by abs(y2-y1)."""
    # this is an extremely ugly hack. should be reimplemented with
    # some poppler like lib, which itself only supports getting
    # "selected" text, having some different logic than the
    # simple one used in pdftotext
    height = abs(y2-y1)
    width = abs(x2-x1)
    options = ['-nopgbrk',
               '-f', page,
               '-l', page,
               '-x', x1,
               '-y', y1,
               '-H', height,
               '-W', width]
    # input file followed by '-' as the output argument
    return pdftotext(*(options + [pdf, '-']))

ImportError: No module named pbs

In [76]:
# NOTE: create_pages() is handed a file path (a str) here, which is what
# produces the AttributeError recorded below; the working cells in this
# notebook pass a PDFDocument built from a PDFParser instead.
for page in PDFPage.create_pages(file_name1):
    interpreter.process_page(page)
    # receive the LTPage object for the page.
    layout = device.get_result()

AttributeError: 'str' object has no attribute 'catalog'

In [75]:
for lt_obj in layout:
    print(lt_obj.__class__.__name__)

NameError: name 'layout' is not defined

In [78]:
# Using the parsed object tree from PDFMiner
# From Matt Swain
# http://stackoverflow.com/questions/25248140/how-does-one-obtain-the-location-of-text-in-a-pdf-with-pdfminer

from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTFigure


def parse_layout(layout):
    """Walk the layout tree depth-first, printing each object's class name and
    bbox, plus the text of text boxes / text lines."""
    for element in layout:
        print(element.__class__.__name__)
        print(element.bbox)
        if isinstance(element, (LTTextBox, LTTextLine)):
            print(element.get_text())
            continue
        if isinstance(element, LTFigure):
            # figures are containers, so descend into their children
            parse_layout(element)


# Dump the layout tree of every page of file_name2.
# The with-block closes the file once all pages have been processed; the
# pages must be handled inside it because PDFMiner reads from the open file
# while parsing.
with open(file_name2, 'rb') as fp:
    parser = PDFParser(fp)
    doc = PDFDocument(parser)

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        # LTPage object holding the analysed layout of this page
        layout = device.get_result()
        parse_layout(layout)

LTTextBoxHorizontal
(264.6, 689.9617999999999, 351.23425, 721.27925)
Aceto Corporation
4 Tri Harbor Court

LTTextBoxHorizontal
(246.8, 677.2618, 368.89075, 690.37215)
Port Washington, NY 11050

LTTextBoxHorizontal
(257.2, 651.8618, 358.4974, 677.67215)
Phone: (516) 627-6000
Fax: (516) 627-6093

LTTextBoxHorizontal
(252.6, 639.1618, 363.16625, 652.27215)
Website: www.aceto.com

LTTextBoxHorizontal
(209.05, 625.0776, 407.10169999999994, 636.9947)
"Sourcing & Supplying Quality Products Worldwide"

LTTextBoxHorizontal
(222.6, 578.824, 387.1855, 617.9973)
TECHNICAL DATA SHEET
RIBOFLAVIN (VITAMIN B-2)   

LTTextBoxHorizontal
(243.6, 563.3526, 365.9912500000001, 575.26505)
Aceto Product Code#:4612801

LTTextBoxHorizontal
(276.6, 552.1526, 333.3072, 564.0650499999999)
CAS#:83-88-5

LTTextBoxHorizontal
(113.05, 279.13280000000003, 242.69050000000007, 553.04345)
Tests
APPEARANCE
IDENTIFICATION
SPECIFIC ROTATION, °
LOSS ON DRYING, %
RESIDUE ON IGNITION, %
LUMIFLAVIN (440nm Absorbances)
ASSAY (on 