In [1]:
# needed packages
import fitz
import sys
from collections import Counter

In [2]:
def flags_decomposer(flags):
    """Render a span's font ``flags`` bit field as a comma-separated label.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" when set.
    Bit 2 selects "serifed" over "sans" and bit 3 selects "monospaced"
    over "proportional", so those two positions are always present.
    Any higher bits are ignored.

    Args:
        flags: integer bit field.

    Returns:
        str: the labels joined with ", ", in bit order.
    """
    # One entry per bit, lowest bit first; None marks "nothing to say".
    labels = (
        "superscript" if flags & 1 else None,
        "italic" if flags & 2 else None,
        "serifed" if flags & 4 else "sans",
        "monospaced" if flags & 8 else "proportional",
        "bold" if flags & 16 else None,
    )
    return ", ".join(label for label in labels if label is not None)

In [3]:
# Maximum distance (page units) between a span's bottom corners and a stroke's
# end points for the stroke to count as that span's underline.
_UNDERLINE_TOLERANCE = 4
# A rectangle at most this tall (and wider than tall) is treated as a drawn line.
_MAX_RULE_HEIGHT = 2
# A line is a heading candidate when it has at least one and fewer than this many words.
_HEADING_WORD_LIMIT = 7


def _horizontal_rules(page):
    """Collect the horizontal strokes drawn on ``page``.

    Real line segments with equal start/end ``y`` are kept as they are; thin,
    wide rectangles are represented by their top-left / top-right corners.

    Args:
        page: a PyMuPDF page.

    Returns:
        list of ``(start, end)`` point pairs.
    """
    rules = []
    for path in page.get_drawings():
        for item in path["items"]:
            kind = item[0]
            if kind == "l":  # an actual line
                start, end = item[1], item[2]
                if start.y == end.y:
                    rules.append((start, end))
            elif kind == "re":  # a rectangle: only a flat one can act as a line
                rect = item[1]
                if rect.width > rect.height and rect.height <= _MAX_RULE_HEIGHT:
                    rules.append((rect.tl, rect.tr))
    return rules


def _span_record(span, rules):
    """Summarise one text span as plain Python data.

    Args:
        span: a span dictionary from ``page.get_text("dict")``.
        rules: strokes of the span's own page, as built by ``_horizontal_rules``.

    Returns:
        dict with keys ``style`` (descriptive string used to bucket spans),
        ``size`` (font size rounded the same way the style string rounds it),
        ``color`` (``#rrggbb`` text) and ``text``.
    """
    size_label = "%g" % span["size"]
    color_label = "#%06x" % span["color"]
    style = "Font: '%s' (%s), size %s, color %s" % (
        span["font"],  # font name
        flags_decomposer(span["flags"]),  # readable font flags
        size_label,  # font size
        color_label,  # font color
    )

    box = fitz.Rect(span["bbox"])
    # Tag the style once, however many strokes happen to match; repeating the
    # tag would scatter identical-looking text over several style buckets.
    if any(
        abs(box.bl - start) <= _UNDERLINE_TOLERANCE
        and abs(box.br - end) <= _UNDERLINE_TOLERANCE
        for start, end in rules
    ):
        style = " ".join([style, "underlined"])

    return {
        "style": style,
        # Sizes are compared through their "%g" rendering so that every span of
        # a style bucket compares equal to that bucket's size, including
        # non-integral sizes such as 9.96.
        "size": float(size_label),
        "color": color_label,
        "text": span["text"],
    }


def _styled_lines(doc):
    """Flatten ``doc`` into a list of text lines, in the order PyMuPDF yields them.

    Each line is a list of span records (see ``_span_record``). Underlines are
    matched against the strokes of the page the span is on.
    """
    lines = []
    for page in doc:
        rules = _horizontal_rules(page)
        blocks = page.get_text("dict", flags=11)["blocks"]
        for block in blocks:  # iterate through the text blocks
            for line in block["lines"]:  # iterate through the text lines
                lines.append([_span_record(span, rules) for span in line["spans"]])
    return lines


def get_narrative(pdf):
    """Extract the body text of the "Opinion" section of a case PDF.

    Steps:
      1. Read every text line once and describe each span by font, flags,
         size, colour and whether it is underlined.
      2. Take the most frequent span style as the paragraph style.
      3. Locate the last line whose paragraph-sized-or-larger text is exactly
         ``Opinion``; the narrative starts there.
      4. From that line on, collect short lines (1-6 words) that are either
         all upper case or contain a span in a "heading" style, i.e. a style
         that is not the paragraph style, has the paragraph colour, is not
         smaller than paragraph text and is not bold+italic+underlined at
         paragraph size. A heading containing "conclusion" ends the narrative.
      5. Join the paragraph-sized text of the lines in between.

    Args:
        pdf: path (or anything ``fitz.open`` accepts) of the document.

    Returns:
        str: the narrative. Every line of the document contributes one
        leading space separator (lines outside the section contribute nothing
        else), so the result usually carries leading/trailing whitespace.

    Raises:
        IndexError: the document contains no text spans.
        KeyError: no line reading exactly ``Opinion`` was found.
    """
    doc = fitz.open(pdf)
    try:
        # Everything needed is copied into plain Python objects here, so the
        # document can be released before any analysis starts.
        lines = _styled_lines(doc)
    finally:
        doc.close()

    # --- paragraph style: the most frequent span style ----------------------
    style_counts = Counter()
    style_traits = {}  # style string -> (size, color)
    for spans in lines:
        for span in spans:
            style_counts[span["style"]] += 1
            style_traits.setdefault(span["style"], (span["size"], span["color"]))

    if not style_counts:
        raise IndexError("no text spans found in %r" % (pdf,))

    ranked_styles = [style for style, _ in style_counts.most_common()]
    p_font = ranked_styles[0]
    p_size, p_color = style_traits[p_font]

    # --- start of the opinion ----------------------------------------------
    opinion_loc = None
    for count, spans in enumerate(lines, start=1):
        heading = "".join(span["text"] for span in spans if span["size"] >= p_size)
        if heading == "Opinion":
            opinion_loc = count  # keep the last occurrence
    if opinion_loc is None:
        raise KeyError("Opinion")

    # --- styles that may mark a subheading ---------------------------------
    def is_excluded(style):
        size, color = style_traits[style]
        return (
            # paragraph text itself
            style == p_font
            # text in a different colour than paragraph text
            or color != p_color
            # characteristics of the vocabulary-word font
            or (
                "bold" in style
                and "underlined" in style
                and "italic" in style
                and size == p_size
            )
            # text smaller than paragraph text
            or size < p_size
        )

    heading_styles = {style for style in ranked_styles if not is_excluded(style)}

    # --- subheadings inside the opinion ------------------------------------
    opinion_subheaders = {}
    for count, spans in enumerate(lines, start=1):
        if count < opinion_loc:
            continue
        line_text = "".join(span["text"] for span in spans)
        if not 0 < len(line_text.split()) < _HEADING_WORD_LIMIT:
            continue
        if line_text.isupper() or any(span["style"] in heading_styles for span in spans):
            opinion_subheaders[line_text] = count

    # No fixed cap: without a conclusion heading the narrative runs to the end.
    conclusion_loc = float("inf")
    for header, location in opinion_subheaders.items():
        if "conclusion" in header.lower():
            conclusion_loc = location

    # --- assemble the narrative --------------------------------------------
    pieces = []
    for count, spans in enumerate(lines, start=1):
        if opinion_loc <= count < conclusion_loc:
            pieces.append(
                "".join(span["text"] for span in spans if span["size"] == p_size)
            )
        else:
            pieces.append("")

    # One separator per document line, matching the established output format.
    return "".join(" " + piece for piece in pieces)

In [4]:
import glob
import pandas as pd

In [5]:
# Empty results table: one row per case will hold the file name and its narrative.
df = pd.DataFrame({column: [] for column in ("CaseName", "Narrative")})

In [7]:
# Directory holding the case PDFs.
mypath = "cases"

# Glob once and sort, so the row order does not depend on how the file system
# happens to enumerate the directory.
pdf_files = sorted(glob.glob("%s/*.pdf" % mypath))

narratives = []

for file in pdf_files:

    narrative = get_narrative(file)
    narratives.append(narrative)

# Build the frame in one step from the collected results. Unlike appending to
# an existing frame, re-running this cell cannot duplicate rows.
df = pd.DataFrame({"CaseName": pdf_files, "Narrative": narratives})

In [8]:
# Bare last expression: the notebook renders the collected CaseName / Narrative table.
df

Unnamed: 0,CaseName,Narrative
0,"cases\A.D. v. Best Western Int'l, Inc., 2023 U...",OPINION AND ORDER This ...
1,"cases\A.D. v. Choice Hotels Int'l, Inc., 2023 ...",OPINION AND ORDER This ...
2,"cases\D.B. v. IE Hotel Grp., LLC, 2023 U.S. Di...",OPINION AND ORDER G...
3,"cases\D.H. v. Tucker Inn Inc., 2023 U.S. Dist....",ORDER This matter is be...
4,"cases\People v. Calhoun, 38 Cal. App. 5th 275.pdf",...
5,"cases\People v. Mahjoob, 2022 Cal. App. Unpub....",Page 2 of 17 ...
6,"cases\People v. Pitcher, 2017 Cal. App. Unpub....",Brannon Lawrence...
7,"cases\Samsung Fire & Marine Ins. Co., Ltd. v. ...",...
8,"cases\United States v. Adams, 578 F. Supp. 3d ...",[*692] MEMORANDUM Defen...
9,"cases\United States v. Bixler, 2022 U.S. App. ...",...
