In [1]:
# needed packages
import fitz
import sys
from collections import Counter

In [2]:
def flags_decomposer(flags):
    """Make font flags human readable."""
    l = []
    if flags & 2 ** 0:
        l.append("superscript")
    if flags & 2 ** 1:
        l.append("italic")
    if flags & 2 ** 2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2 ** 3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2 ** 4:
        l.append("bold")
    return ", ".join(l)

In [3]:
def get_styles(doc):
    style_counts = []

    for page in doc:
        #, flags=11

        paths = page.get_drawings()  # get drawings on the page

        drawn_lines = []
        for p in paths:
            # print(p)
            for item in p["items"]:
                # print(item[0])
                if item[0] == "l":  # an actual line
                    # print(item[1], item[2])
                    p1, p2 = item[1], item[2]
                    if p1.y == p2.y:
                        drawn_lines.append((p1, p2))
                elif item[0] == "re":  # a rectangle: check if height is small
                    # print(item[0])
                    # print(item[1])
                    r = item[1]
                    if r.width > r.height and r.height <= 2:
                        drawn_lines.append((r.tl, r.tr))  # take top left / right points
        
        blocks = page.get_text("dict", flags=11)["blocks"]

        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    
                    font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                        s["font"],  # font name
                        flags_decomposer(s["flags"]),  # readable font flags
                        s["size"],  # font size
                        s["color"],  # font color
                    )

                    r = fitz.Rect(s['bbox']) 
                    for p1, p2 in drawn_lines:  # check distances for start / end points
                        if abs(r.bl - p1) <= 4 and abs(r.br - p2) <= 4:
                            font_properties = " ".join([font_properties, 'underlined'])

                    style_counts.append(font_properties)
    styles = dict(Counter(style_counts))

    style_list = sorted(styles.items(), key=lambda x:x[1], reverse=True)
    
    return style_list

In [4]:
def get_opinion(doc, style_list):

    headers = {}
    count = 0
    p_size = int(style_list[0][0].split('size')[1].split()[0].strip(','))

    for page in doc:
        #, flags=11
        blocks = page.get_text("dict", flags=11)["blocks"]

        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                texts = ""
                count+=1
                for s in l['spans']:
                    if s['size'] >= p_size:
                        texts = "".join ([texts, s['text']])
                text_list = texts.split()
                if len(text_list) > 0 and len(text_list) < 7:
                    headers.update({texts:count})

    opinion_loc = headers['Opinion']
    return opinion_loc

In [5]:
def get_majors(doc, style_list, opinion_loc):
    
    count = 0
    p_size = int(style_list[0][0].split('size')[1].split()[0].strip(','))
    new_headers = {}
    header_properties = ""

    for page in doc:
        #, flags=11
        blocks = page.get_text("dict", flags=11)["blocks"]

        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                count+=1
                if count==opinion_loc:
                    for s in l['spans']:
                        header_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                            s["font"],  # font name
                            flags_decomposer(s["flags"]),  # readable font flags
                            s["size"],  # font size
                            s["color"],  # font color
                        )

    count = 0                
    for page in doc:
        #, flags=11
        blocks = page.get_text("dict", flags=11)["blocks"]

        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                count+=1
                for s in l['spans']:
                    font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                        s["font"],  # font name
                        flags_decomposer(s["flags"]),  # readable font flags
                        s["size"],  # font size
                        s["color"],  # font color
                    )
                    if font_properties==header_properties:
                        new_headers.update({s['text']:count})

    return new_headers

In [6]:
def get_master(style_list):

    p_size = int(style_list[0][0].split('size')[1].split()[0].strip(','))
    p_color = style_list[0][0].split('color')[1].split()[0].strip(',')
    p_font = style_list[0][0]

    bad_fonts = []

    for style in style_list:
        font_str = style[0]
        s_size = int(font_str.split('size')[1].split()[0].strip(','))
        s_color = font_str.split('color')[1].split()[0].strip(',')

        # if font matches paragraph font, it's a bad_font
        if font_str==p_font:
            bad_fonts+=[font_str]
        # if font doesn't match paragraph text color, it's a bad_font
        if s_color!=p_color:
            bad_fonts+=[font_str]
        # if font matches characteristics of vocab word font, it's a bad font
        if ('bold' in font_str and 'underlined' in font_str) and ('italic' in font_str and p_size==s_size):
            bad_fonts+=[font_str]
        # if font size is smaller than paragraph text size, it's a bad_font
        if s_size<p_size:
            bad_fonts+=[font_str]

    master = []
    for style in style_list:
        if style[0] not in bad_fonts:
            master += [style[0]]

    return master

In [7]:
def get_subheaders(doc, style_list, opinion_loc, master):

    for page in doc:

        paths = page.get_drawings()  # get drawings on the page

        drawn_lines = []
        for p in paths:
            # print(p)
            for item in p["items"]:
                # print(item[0])
                if item[0] == "l":  # an actual line
                    # print(item[1], item[2])
                    p1, p2 = item[1], item[2]
                    if p1.y == p2.y:
                        drawn_lines.append((p1, p2))
                elif item[0] == "re":  # a rectangle: check if height is small
                    # print(item[0])
                    # print(item[1])
                    r = item[1]
                    if r.width > r.height and r.height <= 2:
                        drawn_lines.append((r.tl, r.tr))  # take top left / right points

    count = 0
    opinion_subheaders = {}
    p_color = style_list[0][0].split('color')[1].split()[0].strip(',')

    for page in doc:
        #, flags=11
        blocks = page.get_text("dict", flags=11)["blocks"]

        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                texts = ""
                count+=1
                span_fonts = []
                if count>=opinion_loc:
                    for s in l['spans']:
                        font_properties = "Font: '%s' (%s), size %g, color #%06x" % (
                            s["font"],  # font name
                            flags_decomposer(s["flags"]),  # readable font flags
                            s["size"],  # font size
                            s["color"],  # font color
                        )

                        r = fitz.Rect(s['bbox']) 
                        for p1, p2 in drawn_lines:  # check distances for start / end points
                            if abs(r.bl - p1) <= 4 and abs(r.br - p2) <= 4:
                                font_properties = " ".join([font_properties, 'underlined'])
                    
                        span_fonts+=[font_properties]
                        texts = "".join ([texts, s['text']])
                
                text_list = texts.split()
                if len(text_list) > 0 and len(text_list) < 7:
                    if any(i in span_fonts for i in master):
                        opinion_subheaders.update({texts:count})
                    if texts.isupper()==True:
                        opinion_subheaders.update({texts:count})
                    
    return opinion_subheaders

In [9]:
doc = fitz.open('pplvpv.pdf')
style_list = get_styles(doc)
opinion_loc = get_opinion(doc, style_list)
master = get_master(style_list)
opinion_subheaders = get_subheaders(doc, style_list, opinion_loc, master)
print(opinion_subheaders)

{'Opinion': 26, ' [**498]  [*345] Toko Serita, J.': 27, ' [**499] Relevant Laws': 45, '(CPL 440.10 [6]).': 98, ' [**501]  [*350] 1. Findings of Fact': 141, ' [*351] 2. Conclusions of Law': 172, 'from a young [****5]  age.17': 202, ' [***20] ': 365}


In [20]:
    keys_as_list = list(opinion_subheaders)
    for header_index in range(len(keys_as_list)):
        header = keys_as_list[header_index]
        if 'conclusion' in header.lower():
            print(opinion_subheaders[header])

172


In [29]:
def get_narrative(doc, style_list, opinion_loc, opinion_subheaders):

    narrative = ""
    conclusion_loc = 100000
    count = 0
    p_size = int(style_list[0][0].split('size')[1].split()[0].strip(','))

    keys_as_list = list(opinion_subheaders)
    for header_index in range(len(keys_as_list)):
        header = keys_as_list[header_index]
        if 'conclusion' in header.lower():
            conclusion_loc = opinion_subheaders[header]

    for page in doc:
        #, flags=11
        blocks = page.get_text("dict", flags=11)["blocks"]

        for b in blocks:  # iterate through the text blocks
            for l in b["lines"]:  # iterate through the text lines
                texts = ""
                count+=1
                if count>=opinion_loc and count < conclusion_loc:
                    for s in l['spans']:
                        if s['size'] == p_size:
                            texts = "".join ([texts, s['text']])

                narrative = " ".join([narrative, texts])

                    
    return narrative

In [30]:
get_narrative(doc, style_list, opinion_loc, opinion_subheaders)

'                   DECISION AND ORDER Petitioner Anthony Thompson seeks relief pursuant to 28 U.S.C. § 2255. On March 8, 2017, the Court sentenced  Petitioner to 188 months\' imprisonment following his plea of guilty to one count of sex trafficking of a minor, in  violation of 18 U.S.C. § 1591(a)(1) and (b)(2). Petitioner\'s appeal is currently pending before the Second Circuit.  See Second Circuit Docket No. 17-822. "[T]here is no jurisdictional bar to a district court\'s adjudication of a § 2255 motion during the pendency of a direct  appeal." United States v. Outen, 286 F.3d 622, 632 (2d Cir. 2002) (emphasis omitted). However, "district courts in  this Circuit have generally denied without prejudice as premature those § 2255 motions that are filed during the  pendency of a direct appeal." Rivera v. United States, 16-CV-5238(KMW), 13-CR-424-1(KMW), 2016 U.S. Dist.  LEXIS 189643, 2016 WL 9022576, at *1 (S.D.N.Y. Aug. 24, 2016) (collecting cases). This practice arises out of  consider

In [21]:
doc = fitz.open('statevbraun.pdf')
style_list = get_styles(doc)
opinion_loc = get_opinion(doc, style_list)
master = get_master(style_list)
opinion_subheaders = get_subheaders(doc, style_list, opinion_loc, master)
get_narrative(doc, style_list, opinion_loc, opinion_subheaders)

"                                                  ¶1  [*759]  [**887] F, J. — [S]ex traffickers select victims who demonstrate  vulnerabilities including homelessness, substance  abuse, mental health issues, and histories of  physical, emotional or sexual abuse. A typical  trafficker recruits victims by telling them that he  loves them, promising them a better life, providing  them with shelter and drugs, and lying to them  about the nature of the job. … … . [T]raffickers control their victims through physical  violence, sexual violence, psychological violence  and grooming. Traffickers … groom victims with  promises and compliments, but escalate to physical  abuse, sexual assault and death threats. … They  also use psychological violence such as tearing a  victim [***2]  down, telling them they are worthless,  socially isolating them, and controlling them  financially and by taking advantage of a victim's  drug dependency. … … [V]ictims often stay with their traffickers—or  leave and

In [22]:
doc = fitz.open('statevward.pdf')
style_list = get_styles(doc)
opinion_loc = get_opinion(doc, style_list)
master = get_master(style_list)
opinion_subheaders = get_subheaders(doc, style_list, opinion_loc, master)
get_narrative(doc, style_list, opinion_loc, opinion_subheaders)

'                                                      A Madison County jury convicted the defendant, Randall  Ray Ward, of two counts of promoting prostitution and  one count of trafficking a person for a commercial sex  act. Following a sentencing hearing, the trial court  imposed an effective sentence of twenty years in  confinement. On appeal, the defendant challenges the  sufficiency of the evidence to support his convictions.  He also argues the trial court erred in failing to merge  the convictions regarding S.C. and in failing to [*2]   give the jury an instruction on accomplice testimony.  After reviewing the record and considering the  applicable law, we affirm the defendant\'s convictions but  remand for merger of Counts three and four. OPINION Facts and Procedural History A Madison County grand jury indicted the defendant,  Randall Ray Ward, for two counts of trafficking a  person for a commercial sex act (Counts one and three)  and two counts of promoting prostitution (Cou

In [31]:
doc = fitz.open('tompvus.pdf')
style_list = get_styles(doc)
opinion_loc = get_opinion(doc, style_list)
master = get_master(style_list)
opinion_subheaders = get_subheaders(doc, style_list, opinion_loc, master)
get_narrative(doc, style_list, opinion_loc, opinion_subheaders)

'                   DECISION AND ORDER Petitioner Anthony Thompson seeks relief pursuant to 28 U.S.C. § 2255. On March 8, 2017, the Court sentenced  Petitioner to 188 months\' imprisonment following his plea of guilty to one count of sex trafficking of a minor, in  violation of 18 U.S.C. § 1591(a)(1) and (b)(2). Petitioner\'s appeal is currently pending before the Second Circuit.  See Second Circuit Docket No. 17-822. "[T]here is no jurisdictional bar to a district court\'s adjudication of a § 2255 motion during the pendency of a direct  appeal." United States v. Outen, 286 F.3d 622, 632 (2d Cir. 2002) (emphasis omitted). However, "district courts in  this Circuit have generally denied without prejudice as premature those § 2255 motions that are filed during the  pendency of a direct appeal." Rivera v. United States, 16-CV-5238(KMW), 13-CR-424-1(KMW), 2016 U.S. Dist.  LEXIS 189643, 2016 WL 9022576, at *1 (S.D.N.Y. Aug. 24, 2016) (collecting cases). This practice arises out of  consider

In [32]:
doc = fitz.open('usvbell.pdf')
style_list = get_styles(doc)
opinion_loc = get_opinion(doc, style_list)
master = get_master(style_list)
opinion_subheaders = get_subheaders(doc, style_list, opinion_loc, master)
get_narrative(doc, style_list, opinion_loc, opinion_subheaders)

