In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline, AutoModel
import json
import textwrap
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.memory import ConversationBufferMemory
import pandas as pd
import time
import gc
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
torch.set_default_device('cuda')

# Setting up LegalBert Large for extractive summarization

In [2]:
model_directory = "../legal-bert-large"
tokenizer = AutoTokenizer.from_pretrained(model_directory)
model = AutoModel.from_pretrained(model_directory)

In [3]:
device = 'cuda'
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(32000, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [4]:
# Getting the majority opinion from the CaseLaw json file
def load_and_extract_data(file_path):
    
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for o in data["casebody"]["data"]["opinions"]:
        if o["type"] == "majority":
            return o["text"]
        else:
            return None

# Grab chunks of text to summarize
# It has an overlap to make sure each chunk has context of the previous chuck
def chunk_text_with_overlap(text, chunk_word_count, overlap_word_count):
    words = text.split()
    chunks = []
    index = 0

    while index < len(words):
        current_chunk_end = index + chunk_word_count
        current_chunk_end = min(current_chunk_end, len(words))
        chunk = " ".join(words[index:current_chunk_end])
        chunks.append(chunk)

        index += chunk_word_count - overlap_word_count

        # force it to advance to avoid an infinite loop
        if index >= current_chunk_end:
            index = current_chunk_end

    return chunks

# Sort of overkill on saving a summary to a text file
# Some naming logic and error checking added in
def save_summary_to_text(summary, output_folder, file_path, condensed=False):
    
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    if condensed:
        summary_file_name = f"{base_name}_condensed_summary.txt"
    else:
        summary_file_name = f"{base_name}_summary.txt"
    
    summary_file_path = os.path.join(output_folder, summary_file_name)

    try:
        with open(summary_file_path, 'w', encoding='utf-8') as file:
            file.write(summary)
        print(f"Summary successfully written to {summary_file_name}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [5]:
# Calculates cosine similarity for sentence embeddings
# Pair sentences with scores, then sorts in descending order
# Pick top 'num_sentence' number of sentences
# START: REFACTOR FROM <https://towardsdatascience.com/extractive-summarization-using-bert-966e912f4142 and https://www.analyticsvidhya.com/blog/2023/03/exploring-the-extractive-method-of-text-summarization/>
# Also used GPT-4 in debugging
def extractive_summarization(text, num_sentences):

    # Use NLTK's sentence tokenizer to split the text into individual sentences
    sentences = sent_tokenize(text)
    
    tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
    
    max_len = 0
    for i in tokenized_sentences:
        if len(i) > max_len:
            max_len = len(i)
    
    padded_sentences = []
    for i in tokenized_sentences:
        while len(i) < max_len:
            i.append(0)
        padded_sentences.append(i)
        
    input_ids = torch.tensor(padded_sentences)
    
    attention_mask = [[float(i != 0.0) for i in seq] for seq in padded_sentences]
    attention_mask = torch.tensor(attention_mask)
    attention_mask = attention_mask.to(device)
    
    input_ids = input_ids.to(device)
    
    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)[0]
    
    sentence_embeddings = []
    for i in range(len(sentences)):
        sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).cpu().numpy())
        
    similarity_matrix = cosine_similarity(sentence_embeddings)
    
    sentence_scores = [sum(similarity_matrix[i]) for i in range(len(sentences))]
    
    sentence_score_pairs = list(enumerate(sentence_scores))
    
    sorted_sentences = sorted(sentence_score_pairs, key=lambda x: x[1], reverse=True)
    
    summary_sentences = [sentences[index] for index, _ in sorted_sentences[:num_sentences]]
    
    summary = ' '.join(summary_sentences)
    
    return summary
# END: REFACTOR FROM <https://towardsdatascience.com/extractive-summarization-

In [6]:
# Summarizing a case document and using a csv file to keep track of
# This code has been refactored seeveral times so the name and batch_size are kind of outdated
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv):
    start = time.time()
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name", "time_elapsed"])
    processed_count = 0
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename.endswith(".json"):
            if filename in processed_files_df['file_name'].values:
                    print(f"File {filename} has already been processed. Skipping.")
                    continue
            else:
                try:
                    summarize_a_case_document(filename)
                except:
                    print(f"Error processing {filename}")
                processed_count += 1
                end = time.time()
                new_row = {"file_name": filename,
                           "time_elapsed": end - start
                          }
                processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
                processed_files_df.to_csv(processed_files_csv, index=False)

# This code summarizes a case when given a json file
# It gets the major opinion, chucks it, summarizes it, then save it as an individual txt file
def summarize_a_case_document(filename):
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    opinion = str(opinion)
    
    chunk_word_count = 1000
    overlap_word_count = 200
    
    chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    chunk_summaries = summarize_chunks(chunks, chunk_word_count)
    final_summary = ' '.join(chunk_summaries)
    
    save_summary_to_text(final_summary, output_folder, file_path, condensed=False)

# It's here that we decided the number of sentences to use in extractive summarization
# we picked between 2 and 7 sentences
# given that chunk in roughly 1000 words, we would add a sentence every 100 words
def summarize_chunks(chunks, chunk_word_count):
    min_sentences = 2
    standard_summary_length = 10
    max_sentences = 7

    summaries = []

    for chunk in chunks:
        chunk_length = len(chunk.split())
        proportional_sentences_number = int((chunk_length / chunk_word_count) * standard_summary_length)
        
        sentences_to_summarize = max(proportional_sentences_number, min_sentences)

        sentences_to_summarize = min(max_sentences, proportional_sentences_number)
        
        # Perform extractive summarization on the chunk using 'sentences_to_summarize' as the number of sentences to include in the summary
        summary = extractive_summarization(chunk, sentences_to_summarize)
        summaries.append(summary)
        
    return summaries

In [7]:
# demo_folder = 'Local-LLM-Code'
input_folder = 'input_files_from_CaseLaw'
output_folder = 'output_summaries_from_LegalBert_large'
# output_folder = os.path.join(demo_folder, output_folder_name)
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None
processed_files_csv = 'processed_files_for_demo.csv'

In [8]:
for i in range(len(os.listdir(input_folder))):
    summarize_a_batch_of_case_documents(1, processed_files_csv)
    torch.cuda.empty_cache()
    gc.collect()

In [9]:
list_of_bert_summaries = []
for filename in os.listdir(output_folder):
    file_path = os.path.join(output_folder, filename)
    file = open(file_path, "r")
    bert_summary = file.read()
    list_of_bert_summaries.append(bert_summary)
    file.close()

In [10]:
print(list_of_bert_summaries[0])

Lastly, the joinder of William as a respondent will also serve the child’s interest in having her paternity decided swiftly and finally, for a decision rendered in a proceeding in which he is not a party cannot bind him, and leaves open the possibility of a later order declaring him to be the father (see, Matter of Tyrone G. v Fifi N., supra, at 14; cf., Matter of Cathleen P. v Gary P., 63 NY2d 805, 808, supra). [Sandra C.] v Thomas J.S., 100 AD2d 119, 122-123; 1 Schatkin, Disputed Paternity Proceedings § 8.08 [4th rev ed]), has made it more realistic to view a paternity proceeding as a means of actually and conclusively determining the identity of a child’s biological father (see, Matter of Commissioner of Social Servs. Moreover, with the joinder of William as an "alleged father”, the court can order him to submit to a blood test (see, Family Ct Act § 532 [a]; CPLR 3121 [a]), the results of which, if they exclude him as the child’s father, will provide the clear and convincing evidenc

In [11]:
print(list_of_bert_summaries[1])

French Lilly further testified that some time thereafter the policy written by Mrs. Foster was mailed to him, inasmuch as plaintiff also resided in Wyoming County, and that he thereupon mailed the policy to the plaintiff, not at that time being aware of the fact that the policy was written for the same man who had been refused insurance previously by the witness. As a result of the telephone conversation, the plaintiff called upon Mrs. Foster, who issued the insurance policy in question. Later, however, the home office learned of the false answer contained in the application; and on March 8, 1957, wrote a letter to the plaintiff detailing the facts in relation to the false answer contained in the application, and notifying the plaintiff that his insurance policy was rescinded. At the time the letter of December 20 was written, the home office did not know of the accident involving the cow, or of the occurrence which resulted in the destruction of the automobile, and did not know of the

# Full Sentence Holding Generation

In [12]:
llama_model_directory = "../Llama-2-7b-chat-hf"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_model_directory)

llama_model = AutoModelForCausalLM.from_pretrained(llama_model_directory,
                                                     device_map='auto',
                                                     torch_dtype=torch.float16,
                                                     load_in_4bit=True
                                                     )

In [13]:
llama_pipe = pipeline("text-generation",
                model=llama_model,
                tokenizer=llama_tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 1024,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

In [14]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = "You are a legal expert who specializes in extracting accurate and concise holdings from case documents."

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [15]:
instruction = "Use the case document to extract the concise holding. {text}"
system_prompt = "You are a legal expert who specializes in extracting accurate and concise holdings from case documents."

llama_llm = HuggingFacePipeline(pipeline = llama_pipe, model_kwargs = {'temperature':0})

llama_template = get_prompt(instruction, system_prompt)

llama_prompt = PromptTemplate(template=llama_template, input_variables=["text"])

llama_llm_chain = LLMChain(prompt=llama_prompt, llm=llama_llm)

In [16]:
list_of_holdings = []
summaries_folder = 'output_summaries_from_LegalBert_large'
for summary in list_of_bert_summaries:
    holding = llama_llm_chain.run(summary)
    list_of_holdings.append(holding)

In [17]:
print(list_of_holdings[0])

  The holding in this case is that the court may order the joinder of a man who is alleged to be the biological father of a child, even if he is not a party to the proceeding, in order to determine the child's paternity. The court may order the man to submit to a blood test to determine his paternity, and if he refuses, an adverse inference may be drawn against him. The court found that the results of a human leucocyte antigen test showing a 99.53% probability that the petitioner is the child's father were sufficient to overcome the presumption of legitimacy that arises when a child is born to a married woman.


In [18]:
print(list_of_holdings[1])

  The holding of the case is that the defendant company had the right to rescind the insurance policy issued to the plaintiff due to the plaintiff's fraudulent misrepresentations in the application. The court held that the defendant company had knowledge of the fraudulent answers and misrepresentations through its agent in Oceana, and therefore had an election to either treat the policy as void or rescind it as of a future date. Since the defendant company elected to rescind the policy, the loss occurred prior to the date of rescinding, and the defendant was entitled to deduct the value of the automobile from the amount of the plaintiff's recovery. The court also held that the plaintiff could not rely on an estoppel not set forth in his reply, and that the policy was thereby forfeited under the circumstances of the case.


# Parenthetical Generation

In [19]:
instruction = "Use the case document to extract the concise holding and phrase it as a parenthetical, which should look something like this: holding that the balance between costs and benefits comes out against applying the exclusionary rule in civil deportation hearings. {text}"
system_prompt = "You are a legal expert who specializes in extracting accurate and concise parenthetical holdings from case documents. Give only the holdings, no other breakdowns or extra text."

llama_llm = HuggingFacePipeline(pipeline = llama_pipe, model_kwargs = {'temperature':0})

llama_template = get_prompt(instruction, system_prompt)

llama_prompt = PromptTemplate(template=llama_template, input_variables=["text"])

llama_llm_chain = LLMChain(prompt=llama_prompt, llm=llama_llm)

In [20]:
list_of_parentheticals = []
summaries_folder = 'output_summaries_from_LegalBert_large'
for summary in list_of_bert_summaries:
    parenthetical = llama_llm_chain.run(summary)
    list_of_parentheticals.append(parenthetical)

In [21]:
print(list_of_parentheticals[0])

  Here are the concise holdings from the case document:

1. The balance between costs and benefits comes out against applying the exclusionary rule in civil deportation hearings.
2. Joinder of William as a respondent will serve the child’s interest in having her paternity decided swiftly and finally, and leaves open the possibility of a later order declaring him to be the father.
3. Where a mother’s husband has been a substantial presence in the child’s life and desires to continue to exercise parental rights, the need for joining him as a party whose interests might be inequitably affected by the resulting order of filiation is manifest, and may be ordered by the court on its own motion.
4. If William refuses to submit to a blood test, an adverse inference may be drawn against him.


In [22]:
print(list_of_parentheticals[1])

  Holding: The court held that the policy was forfeited due to the plaintiff's fraudulent misrepresentations on the application.
