A pipeline that reads legal briefs from PDF files, preprocesses the text, extracts the argument headings from each table of contents, and splits each brief into sections.

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the checkpoint
# CSVs read later in this notebook live under /content/drive/MyDrive/LLM.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pycryptodome pypdf2 fuzzywuzzy openai



In [None]:
import os
import PyPDF2
import pandas as pd
from Crypto.Cipher import AES
from PyPDF2.errors import PdfReadError

def extract_with_pypdf2(pdf_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Encrypted files are opened by attempting decryption with an empty
    password.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts (each non-empty page followed by a
        newline), or ``None`` when the file cannot be decrypted or PyPDF2
        fails to read it. Failures are reported with ``print``; an error
        message is never returned in place of the text, so callers can rely
        on ``text is not None`` meaning "extraction succeeded".
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)

            if pdf_reader.is_encrypted:
                # Attempt to decrypt it with an empty password
                try:
                    decrypt_result = pdf_reader.decrypt('')
                except Exception as e:
                    print(f"Failed to decrypt PDF {pdf_path}: {e}")
                    return None
                # decrypt() reports failure with a falsy result (0 /
                # PasswordType.NOT_DECRYPTED) rather than an exception.
                if not decrypt_result:
                    print(f"Failed to decrypt PDF {pdf_path}: empty password rejected")
                    return None

            # Collect the page texts and join once instead of repeatedly
            # concatenating onto a growing string.
            page_texts = []
            for page in pdf_reader.pages:
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text + "\n")

        return "".join(page_texts)
    except (PdfReadError, TypeError) as e:
        print(f"PDF read error in {pdf_path}: {e}")
        return None

In [None]:
'''
data = []  # List to store file path and text data
directory = "/content/drive/MyDrive/LLM/brief_pdfs"
files = os.listdir(directory)
for filename in files:
  if filename.endswith(".pdf"):
    file_path = os.path.join(directory, filename)
    text = extract_with_pypdf2(file_path)
    if text is not None:  # Check if text extraction was successful
      data.append({'filename': filename, 'text': text})
pdf_df = pd.DataFrame(data)
'''

'\ndata = []  # List to store file path and text data\ndirectory = "/content/drive/MyDrive/LLM/brief_pdfs"\nfiles = os.listdir(directory)\nfor filename in files:\n  if filename.endswith(".pdf"):\n    file_path = os.path.join(directory, filename)\n    text = extract_with_pypdf2(file_path)\n    if text is not None:  # Check if text extraction was successful\n      data.append({\'filename\': filename, \'text\': text})\npdf_df = pd.DataFrame(data)\n'

In [None]:
# pdf_df = pd.DataFrame(data)

In [None]:
# print(len(data))

In [None]:
# print(pdf_df.head)

In [None]:
# pdf_df.to_csv('/content/drive/MyDrive/LLM/extracted_briefs_ckpt.csv', index=False)

In [None]:
import pandas as pd
# Load the extracted-briefs checkpoint CSV from Drive into toc_df.
toc_df = pd.read_csv('/content/drive/MyDrive/LLM/extracted_briefs_ckpt.csv')

In [None]:
import pandas as pd
import re

def extract_toc_and_rest(content):
    """Split a brief into its table of contents and everything after it.

    The table of contents runs from the first "TABLE OF CONTENTS" to the last
    "TABLE OF AUTHORITIES" heading that follows it. When that slice is missing
    or no longer than a bare heading, the last "CONCLUSION"/"Conclusion" is
    used as the end marker instead.

    Args:
        content: Full text of the brief.

    Returns:
        ``(toc, rest_of_content)``; both are ``None`` when no usable pair of
        markers is found.
    """
    toc_heading = r"TABLE OF CONTENTS"
    # The lookahead rejects a heading that is followed by two or more
    # spaces/dots (presumably the dotted entry inside the TOC itself).
    toa_heading = r"TABLE OF AUTHORITIES(?![ .]{2,})"
    conclusion_heading = r"(CONCLUSION|Conclusion)"

    def slice_between(start_pattern, end_pattern, content):
        """Return (slice, end offset) from the first start to the last end match."""
        first_start = re.search(start_pattern, content)
        end_positions = [found.start() for found in re.finditer(end_pattern, content)]

        # Both markers must occur at least once.
        if first_start is None or not end_positions:
            return None, None

        begin = first_start.start()

        # Walk the end markers from last to first and stop at the first one
        # located after the start marker.
        for candidate in reversed(end_positions):
            if candidate > begin:
                return content[begin:candidate], candidate

        # Every end marker precedes the start: keep everything to the end.
        return content[begin:], len(content)

    toc, toc_end_index = slice_between(toc_heading, toa_heading, content)

    too_short = toc is None or len(toc.strip()) <= len('TABLE OF CONTENTS PagePage')
    if too_short:
        toc, toc_end_index = slice_between(toc_heading, conclusion_heading, content)

    if toc_end_index is None:
        return toc, None
    return toc, content[toc_end_index:]

In [None]:
# Drop briefs whose text is null (PDF reading issues) or shorter than 15000
# characters, usually because they are not actually briefs or were read improperly.
old_len = len(toc_df)
toc_df = (
    toc_df.loc[toc_df['text'].notnull() & toc_df['text'].str.len().ge(15000)]
    .reset_index(drop=True)
)

print(f"Dropped {old_len - len(toc_df)} rows of empty or very short text")

Dropped 169 rows of empty or very short text


In [None]:
import re

def split_text(text):
    """Split a brief's text into a leading TOC part and the remaining content.

    The split point is the first match, tried in this order
    (case-insensitive):
      1. "CONCLUSION" / "CONCLUSIONS" anywhere in the text,
      2. a line starting with "Table of Authorities",
      3. any line containing the word "Authorities".
    The matched text itself is dropped from both halves.

    Returns:
        ``(toc_text, content_text)``, or ``(None, None)`` when nothing matches.
    """
    split_markers = (
        r'CONCLUSIONS?\b',
        r'^Table\s+of\s+Authorities\b',
        r'^.*?\bAuthorities\b.*$',
    )
    for marker in split_markers:
        hit = re.search(marker, text, flags=re.MULTILINE | re.IGNORECASE)
        if hit:
            return text[:hit.start()], text[hit.end():]
    return None, None

In [None]:
# Apply split_text to the 'text' field and store the two returned pieces in the new 'toc' and 'content' columns
toc_df[['toc', 'content']] = toc_df.apply(lambda row: pd.Series(split_text(row['text'])), axis=1)

# Now, toc_df contains all the original fields, plus the 'toc' and 'content' columns with the extracted data
print("toc_df updated with 'toc' and 'content' columns.")

toc_df updated with 'toc' and 'content' columns.


In [None]:
def extract_docket_number(filename):
    """Return the docket number embedded in a brief's filename.

    Looks for ``Docket<digits>-<digits>_`` and returns the ``<digits>-<digits>``
    part, or ``None`` when the filename does not contain it.
    """
    found = re.search(r'Docket(\d+-\d+)_', filename)
    return found.group(1) if found else None

# Derive the 'docket_num' column from each row's filename
toc_df[['docket_num']] = toc_df.apply(lambda row: pd.Series(extract_docket_number(row['filename'])), axis=1)

# Now, toc_df contains all the previous fields, plus the 'docket_num' column
print("toc_df updated with docket number column.")

toc_df updated with docket number column.


In [None]:
# Tag every row with the constant court label 'SCOTUS'
toc_df['court'] = 'SCOTUS'

In [None]:
# Remove briefs where toc or content is null,
# usually because of an issue with text parsing.
# Both columns must be present for a row to be kept; the previous `|` mask
# only dropped rows where *both* were null.
old_len = len(toc_df)
toc_df = toc_df[toc_df['toc'].notnull() & toc_df['content'].notnull()]
toc_df = toc_df.reset_index(drop=True)

print(f"Dropped {old_len - len(toc_df)} rows of empty toc/content")

Dropped 11 rows of empty toc/content


In [None]:
# Drop briefs whose content is null or shorter than 5000 characters,
# usually because of an issue with text parsing.
# Based on manual testing below for content, 4000 is a tight bound and 10000 is a loose one for finding short content.
old_len = len(toc_df)
toc_df = toc_df.loc[toc_df['content'].str.len().ge(5000)].reset_index(drop=True)

print(f"Dropped {old_len - len(toc_df)} rows of empty or very short content")

Dropped 213 rows of empty or very short content


In [None]:
# Call head(): printing the bound method `toc_df.head` dumps the method repr
# (and with it the whole frame) instead of the first rows.
toc_df.head()

<bound method NDFrame.head of                         filename  \
0     Docket20-5279_Brief007.pdf   
1     Docket20-5279_Brief008.pdf   
2     Docket20-5279_Brief009.pdf   
3     Docket20-5279_Brief010.pdf   
4      Docket20-828_Brief001.pdf   
...                          ...   
3974  Docket16-1027_Brief009.pdf   
3975  Docket16-1027_Brief010.pdf   
3976   Docket17-387_Brief001.pdf   
3977   Docket17-387_Brief002.pdf   
3978   Docket17-387_Brief003.pdf   

                                                   text  \
0     No. 20-5279  \n \nIN THE \nSupreme Court of th...   
1     No. 20-5279 \nIN THE \nSupreme Court of the Un...   
2      \n No. 20-5279  \nIn the Supreme Court of the...   
3      \n \n \n \n \n \nNo. 20-5279 \n \n In the Sup...   
...                                                 ...   
3974  No. 16-1027\nIn the Supreme Court of the Unite...   
3976   \n \nNo. 17 -387 \n \n \nIN THE \nSUPREME COU...   
3977   \n No. 17-387 \nIn the Supreme Court of the U...   
3978  

In [None]:
# toc_df.to_csv('/content/drive/MyDrive/LLM/extracted_briefs_ckpt_2.csv', index=False)

In [None]:
# Reload toc_df from the second checkpoint CSV on Drive.
toc_df = pd.read_csv('/content/drive/MyDrive/LLM/extracted_briefs_ckpt_2.csv')

In [None]:
# Count the distinct docket numbers, i.e. the number of unique cases
docket_values = toc_df['docket_num'].unique()
unique_ids = list(docket_values)
print(f"Number of cases: {len(unique_ids)}")

Number of cases: 360


In [None]:
# Count whitespace-separated tokens per brief; null text counts as 0 tokens
def _whitespace_token_count(entry):
    if pd.notnull(entry):
        return len(entry.split())
    return 0

toc_df['token_count'] = toc_df['text'].apply(_whitespace_token_count)

# Mean token count over all briefs
average_tokens = toc_df['token_count'].mean()

print("Average number of tokens per entry:", average_tokens)

Average number of tokens per entry: 8956.003769791405


Preprocess the toc to extract only the argument headers

In [None]:
import re
def clean_table_of_contents(toc_text):
    """Extract the argument headings from a brief's table of contents.

    Steps:
      1. Drop lines that contain only a page number.
      2. Skip everything up to and including the first line that mentions
         "Argument(s)" or "Reason(s) for" (case-insensitive).
      3. Cut everything from the first "CONCLUSION(S)" onwards.
      4. Split the remainder on dot leaders ("...") and tidy each entry:
         remove lines holding only a lowercase roman numeral, collapse line
         breaks, and trim anything in front of the first section marker
         ("I.", "2.", "A.", ...).

    Args:
        toc_text: Table-of-contents text. Non-string values (``None``, or NaN
            for a null cell) are treated as "no table of contents".

    Returns:
        A list of cleaned heading strings, or ``None`` when the input is not
        a string or no argument section is found.
    """
    # Null cells reach this function as None/NaN; the regex calls below would
    # raise TypeError on them.
    if not isinstance(toc_text, str):
        return None

    # Stage 1: Find the arguments
    # Remove standalone page numbers without a period. Must do this before removing periods below.
    toc_text = re.sub(r'^\s*\d+\s*$', '', toc_text, flags=re.MULTILINE)
    # Attempt to find the start of the arguments to extract. This will likely fail on some number of cases
    pattern = r'(Arguments?|Reasons?\s+for).*\n?'
    matches = re.search(pattern, toc_text, re.MULTILINE | re.IGNORECASE)
    if not matches:
        return None
    toc_text = toc_text[matches.end():]

    # Stage 2: drop "CONCLUSION(S)" (case-insensitive) and everything after it
    conclusion_match = re.search(r'CONCLUSIONS?\b', toc_text, flags=re.MULTILINE | re.IGNORECASE)
    if conclusion_match:
        toc_text = toc_text[:conclusion_match.start()]

    # Three periods (optionally separated by whitespace) and the rest of that line
    dot_leader = r'\.\s*\.\s*\.\s*.*$'
    # Section markers that may start a heading
    section_marker = r'(I\.|II\.|III\.|IV\.|V\.|VI\.|VII\.|VIII\.|IX\.|X\.|1\.|2\.|3\.|4\.|5\.|6\.|7\.|8\.|9\.|10\.|A\.|B\.|C\.|D\.|E\.|F\.|G\.|H\.|I\.|J\.)'

    # Now split on the dot leaders: each piece is one candidate heading
    entries = re.split(dot_leader, toc_text, flags=re.MULTILINE)

    # Finally, clean each candidate heading
    processed_text = []
    for index, text in enumerate(entries):
        # Defensive: strip any dot leader still present in the piece
        text = re.sub(dot_leader, '', text, flags=re.MULTILINE)

        # Remove lines that consist only of a lowercase Roman numeral (page markers)
        text = re.sub(r'^\s*(i{1,3}|iv|vi{0,3}|ix|xi{0,3}|xii{0,3}|xiii|xiv|xv)\s*[\.\s]*$', '', text, flags=re.MULTILINE)
        # Remove spaces before newlines, then fold the heading onto one line
        text = re.sub(r'[ \t]+$', '', text, flags=re.MULTILINE)
        text = re.sub(r'\n+', ' ', text)
        text = text.strip()

        # Special condition for the first item in the list
        if index == 0:
            # Remove "Argument" followed by any punctuation or space at the start of the line
            text = re.sub(r'^Argument[\s.,;:!?-]*', '', text, flags=re.IGNORECASE)

        # Drop any extra tokens in front of the first section marker
        match = re.search(section_marker, text)
        if match:
            text = text[match.start():]

        if text:  # Keep only non-empty headings
            processed_text.append(text)

    return processed_text

In [None]:
import re

def remove_conclusion(content_text):
    """Cut the text at the last "CONCLUSION(S)" (case-insensitive).

    Returns everything before the last occurrence; the text is returned
    unchanged when the word does not occur.
    """
    final_hit = None
    # Exhaust the iterator so final_hit ends up holding the last match.
    for final_hit in re.finditer(r'CONCLUSIONS?\b', content_text, flags=re.MULTILINE | re.IGNORECASE):
        pass

    if final_hit is None:
        return content_text
    return content_text[:final_hit.start()]

In [None]:
# Call head(): printing the bound method `toc_df.head` dumps the method repr
# (and with it the whole frame) instead of the first rows.
toc_df.head()

<bound method NDFrame.head of                         filename  \
0     Docket20-5279_Brief007.pdf   
1     Docket20-5279_Brief008.pdf   
2     Docket20-5279_Brief009.pdf   
3     Docket20-5279_Brief010.pdf   
4      Docket20-828_Brief001.pdf   
...                          ...   
3974  Docket16-1027_Brief009.pdf   
3975  Docket16-1027_Brief010.pdf   
3976   Docket17-387_Brief001.pdf   
3977   Docket17-387_Brief002.pdf   
3978   Docket17-387_Brief003.pdf   

                                                   text  \
0     No. 20-5279  \n \nIN THE \nSupreme Court of th...   
1     No. 20-5279 \nIN THE \nSupreme Court of the Un...   
2      \n No. 20-5279  \nIn the Supreme Court of the...   
3      \n \n \n \n \n \nNo. 20-5279 \n \n In the Sup...   
...                                                 ...   
3974  No. 16-1027\nIn the Supreme Court of the Unite...   
3976   \n \nNo. 17 -387 \n \n \nIN THE \nSUPREME COU...   
3977   \n No. 17-387 \nIn the Supreme Court of the U...   
3978  

In [None]:
# Parse each table of contents into its argument headings and store them in the 'arguments' column
toc_df['arguments'] = toc_df['toc'].apply(clean_table_of_contents)

In [None]:
# Strip the last "CONCLUSION" and everything after it from each non-null content.
# NOTE: this overwrites 'content' in place, so re-running the cell cuts the text again.
toc_df['content'] = toc_df['content'].apply(lambda x: remove_conclusion(x) if pd.notnull(x) else x)

In [None]:
def clean_content(content):
    """Replace single line breaks inside sentences with spaces.

    A newline is replaced by a space unless it directly follows ".", "?" or
    "!" or is directly followed by another newline.
    """
    soft_line_break = re.compile(r'(?<![\.\?!])\n(?!\n)')
    return soft_line_break.sub(' ', content)

In [None]:
# Join soft-wrapped lines in each non-null content (null cells are left as they are)
toc_df['content'] = toc_df['content'].apply(lambda x: clean_content(x) if pd.notnull(x) else x)

In [None]:
# Call head(): printing the bound method `toc_df.head` dumps the method repr
# (and with it the whole frame) instead of the first rows.
toc_df.head()

<bound method NDFrame.head of                         filename  \
0     Docket20-5279_Brief007.pdf   
1     Docket20-5279_Brief008.pdf   
2     Docket20-5279_Brief009.pdf   
3     Docket20-5279_Brief010.pdf   
4      Docket20-828_Brief001.pdf   
...                          ...   
3974  Docket16-1027_Brief009.pdf   
3975  Docket16-1027_Brief010.pdf   
3976   Docket17-387_Brief001.pdf   
3977   Docket17-387_Brief002.pdf   
3978   Docket17-387_Brief003.pdf   

                                                   text  \
0     No. 20-5279  \n \nIN THE \nSupreme Court of th...   
1     No. 20-5279 \nIN THE \nSupreme Court of the Un...   
2      \n No. 20-5279  \nIn the Supreme Court of the...   
3      \n \n \n \n \n \nNo. 20-5279 \n \n In the Sup...   
...                                                 ...   
3974  No. 16-1027\nIn the Supreme Court of the Unite...   
3976   \n \nNo. 17 -387 \n \n \nIN THE \nSUPREME COU...   
3977   \n No. 17-387 \nIn the Supreme Court of the U...   
3978  

In [None]:
import re
from fuzzywuzzy import process, fuzz

# match headers to sections and delete the headers within sections to avoid data leaks
def new_match_headers_to_sections(row, threshold=70):
    """Split a brief's content into sections keyed by its argument headers.

    Each header is fuzzy-matched (``fuzz.token_sort_ratio``) against the lines
    of the content; the scan for a header stops at the first line scoring
    above ``threshold``. A section runs from a matched line up to the line
    matched by the next matched header (or to the end of the content). The
    header text is then sliced off the start of its section.

    Args:
        row: Mapping/Series with an 'arguments' entry (list of header strings)
            and a 'content' entry (string).
        threshold: Minimum score (exclusive) for a line to count as a match.

    Returns:
        A list of single-entry dicts ``{header: section_text_or_None}``, one
        per *matched* header in header order, or ``None`` when 'arguments' is
        not a list or 'content' is not a string (e.g. None or NaN).
    """
    headers = row['arguments']
    content = row['content']

    # Null cells arrive as None or NaN; neither can be split into lines.
    if not isinstance(headers, list) or not isinstance(content, str):
        return None

    lines = content.split('\n')

    # (header, index of its matching line) for every header that matched.
    # Keeping the header next to its index ensures a section is labelled with
    # the header that actually produced it, even when earlier headers found
    # no match.
    matched = []
    for header in headers:
        high_score = 0
        best_index = None
        for line_index, line in enumerate(lines):
            score = fuzz.token_sort_ratio(header, line)
            if score > high_score:
                high_score = score
                best_index = line_index
            if score > threshold:
                break  # First line above the threshold wins
        if best_index is not None and high_score > threshold:
            matched.append((header, best_index))

    # Create sections from matches
    sections = []
    for position, (header, start_idx) in enumerate(matched):
        if position + 1 < len(matched):
            end_idx = matched[position + 1][1]
        else:
            end_idx = len(lines)
        section_content = "\n".join(lines[start_idx:end_idx])

        # Find the last four words of the header inside the section and drop
        # everything up to the end of that line, so the header text does not
        # leak into the section body.
        last_four_words = ' '.join(header.split()[-4:])
        regex_pattern = re.escape(last_four_words) + r'.*?(?=\n|$)'
        match = re.search(regex_pattern, section_content, re.DOTALL)
        if match:
            section_content = section_content[match.end():].strip()

        # Replace empty or whitespace-only strings with None
        if not section_content.strip():
            section_content = None
        sections.append({header: section_content})

    return sections



In [None]:
# Match headers to content row by row with a stricter threshold of 80.
# NOTE: every header is scored against the content lines, so this cell is slow
# (the recorded run below ended in a KeyboardInterrupt).
toc_df['sections'] = toc_df.apply(new_match_headers_to_sections, axis=1, threshold=80)

KeyboardInterrupt: 

Confirm that extraction worked properly

In [None]:
# Flag rows whose 'sections' value is None or an empty list
def _has_no_sections(value):
    return value == [] or value is None

mask = toc_df['sections'].apply(_has_no_sections)

# Rows without sections, re-indexed from 0
empty_sec_df = toc_df[mask].reset_index(drop=True)
# Rows with at least one section (original index kept)
df_complete = toc_df[~mask]

print(f"There are {len(empty_sec_df)} rows with empty sections")

In [None]:
# Call head() so only the first rows are shown rather than the bound method's repr.
empty_sec_df.head()

In [None]:
# Call head() so only the first rows are shown rather than the bound method's repr.
df_complete.head()

In [None]:
# Call head() so only the first rows are shown rather than the bound method's repr.
toc_df['sections'].head()

In [None]:
print(toc_df['sections'].iloc[0])

In [None]:
regex_toc_df = toc_df['toc'].apply(clean_table_of_contents) # Apply regex to the whole column, save as new df to test

In [None]:
idx = 3000
print(toc_df.iloc[idx]['toc'])

In [None]:
for thing in regex_toc_df.iloc[idx]:
  print(thing)

In [None]:
idx = 3
print("TOC:")
print(toc_df.iloc[idx]["toc"])
print("CONTENTS:")
print(toc_df.iloc[idx]["content"])

In [None]:
print(toc_df.iloc[idx]["text"])

In [None]:
idx = 3812
print("TOC:")
print(toc_df.iloc[idx]["toc"])
print("CONTENTS:")
print(toc_df.iloc[idx]["content"])

In [None]:
print(toc_df.iloc[idx]["text"])

In [None]:
# Unclear if this is necessary to clean up null rows
# True marks rows whose toc is null or shorter than 100 characters
toc_lengths = toc_df['toc'].str.len()
mask = toc_df['toc'].isnull() | (toc_lengths < 100)

# Rows that pass the check (original index kept)
df_cleaned = toc_df[~mask]

# Rows that fail the check, re-indexed from 0
empty_toc_df = toc_df[mask].reset_index(drop=True)

print(f"There are {len(empty_toc_df)} rows with empty toc")

In [None]:
# Call head() so only the first rows are shown rather than the bound method's repr.
empty_toc_df.head()

In [None]:
# True marks rows whose text is shorter than 15000 characters
mask = toc_df['text'].str.len().lt(15000)

# Rows at or above the length bound (original index kept)
df_cleaned = toc_df.loc[~mask]

# Rows below the length bound, re-indexed from 0
empty_text_df = toc_df.loc[mask].reset_index(drop=True)

print(f"There are {len(empty_text_df)} rows with empty text")

In [None]:
# Call head() so only the first rows are shown rather than the bound method's repr.
# NOTE(review): this cell follows the short-text check, so `empty_text_df` may
# have been intended here instead of `empty_toc_df` - confirm.
empty_toc_df.head()

In [None]:
# print(empty_toc_df.iloc[3]['text'])

In [None]:
# True marks rows whose content is shorter than 10000 characters
mask = toc_df['content'].str.len().lt(10000)

# Rows at or above the length bound (original index kept)
df_cleaned = toc_df.loc[~mask]

# Rows below the length bound, re-indexed from 0
short_content_df = toc_df.loc[mask].reset_index(drop=True)

print(f"There are {len(short_content_df)} rows with short content")

In [None]:
# Call head() so only the first rows are shown rather than the bound method's repr.
short_content_df.head()

In [None]:
idx = 0
print(short_content_df.iloc[idx]['toc'])

In [None]:
print(short_content_df.iloc[idx]['content'])

In [None]:
text = """
There isn’t one thing that stands out that I can say:
That’s it.  That’s what I miss the most.”  Id. at 48-17.
Sean’s stepfather, Joseph Rogers, said that Sean
“was a cop at an early age.”  Id. at 48-20.  During col-
lege, Sean “was a volunteer for the Somerville Auxil-iary Police” and “was the youngest sergeant they had
ever had.”  Id. at 48-22.  Once Sean graduated from
college, the Somerville Police Department “sponsored him to the MBTA Transit Police Academy,” and “[i]n
2010, he graduated from the MBTA Police Academy
with “the highest grade point average of anybody who had ever graduated.”  Id. at 48-22 to -23.  The day he

surviving spectators’ testimony had relevance to the jury’s
weighing of aggravating factors other than victim impact.”  Pet. App. 98a.
17
graduated from the Academy was “[p]robably the hap-
piest day of his life.”  Id. at 48-27 to -28.
Mr. Rogers recounted how he learned that Sean
had been murdered:  “they took us to see Sean. * * *
He had a hole in the middle of his head and he was
shot to pieces.  And he’s la ying there.  They don’t re-
ally clean you up much; they just wipe off the blood.
And my wife is touching him and his blood is coming
up in her hands.”  Id. at 48-29.  Since Sean’s death,
his mother has “been diagnosed with having post-
traumatic stress disorder.  She keeps remembering
that night and being told, wh at he looked like, and it
runs over in her mind.”  Id. at 48-29 to -30.  Each of
Sean’s six siblings was severely impacted:  one sibling
“moved to Texas and that way it’s easier for her not to talk about it.”  Id. at 48-32.  Another sibling “has had
to deal with a lot of the press, the unending press that
we get, and that’s been ve ry difficult on her and her
marriage.”  Id.
The jury also heard vict im-impact testimony from
the family members of the three people murdered by
the bombing, plus testimony from many injured sur-
vivors.  The survivors testified about how the shrapnel
bombs that respondent detonated mutilated their bod-ies, and how the bombings unleashed a flood of psy-
chological and emotional torm ent.  One survivor testi-
fied that he was “in a very dark place” and “not want-ing to live” anymore.  Pet. App. 100a.  Another was
unable to testify because he checked himself in to a
mental-health facility as a result of the bombings.  Id.
Because the court of appeals vacated respondent’s
death sentence, the distri ct court must “empanel a
new jury, and preside over a new trial strictly limited
to what penalty [respondent] should get on the death-
eligible counts.”  Pet. App. 3a (citation omitted).  That
"""

# Count characters
num_characters = len(text)

print("Number of characters in the text:", num_characters)

Try using OpenAI

In [None]:
import os
from openai import OpenAI

# Full text of the first brief, sent to the model as-is
text = toc_df.iloc[0]["text"]

# SECURITY: never hardcode API keys in a notebook. Read the key from the
# environment instead. The key that was previously embedded in this cell
# must be treated as leaked and revoked.
key = os.environ.get("OPENAI_API_KEY")
if not key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

client = OpenAI(api_key=key)
# Ask the chat model to copy the argument headings out of the table of contents
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "system", "content": "You are a helpful legal assistant who specializes in reading legal documents and extracting exact text."},
    {"role": "user", "content": f"Read this text and extract the extract the arguments in the form of section headings that appear in the table of contents. Extract the arguments exactly as they appear in the table, preserving indentation, linebreak, and things like roman numerals and lettering of sections. {text}"},
  ]
)

In [None]:
print(response.choices[0].message.content)

In [None]:
print(text)