In [1]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import json
import textwrap
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.memory import ConversationBufferMemory
import pandas as pd
import time
import gc

In [2]:
load_dotenv()
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_AUTH_TOKEN")
login(token=HUGGINGFACEHUB_API_TOKEN)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


# Skip the next two cells if you don't want to use Llama 2

In [3]:
model_directory = "./Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_directory)

model = AutoModelForCausalLM.from_pretrained(model_directory,
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             # load_in_8bit=True,
                                             load_in_4bit=True
                                             )

In [4]:
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 1024,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )

# The cell below has important functions
## It's best to run it, regardless of which task and model you're using

In [2]:
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

def cut_off_text(text, prompt):
    cutoff_phrase = prompt
    index = text.find(cutoff_phrase)
    if index != -1:
        return text[:index]
    else:
        return text

def remove_substring(string, substring):
    return string.replace(substring, "")



def generate(text):
    prompt = get_prompt(text)
    with torch.autocast('cuda', dtype=torch.bfloat16):
        inputs = tokenizer(prompt, return_tensors="pt").to('cuda')
        outputs = model.generate(**inputs,
                                 max_new_tokens=1024,
                                 eos_token_id=tokenizer.eos_token_id,
                                 pad_token_id=tokenizer.eos_token_id,
                                 )
        final_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        final_outputs = cut_off_text(final_outputs, '</s>')
        final_outputs = remove_substring(final_outputs, prompt)

    return final_outputs#, outputs

def parse_text(text):
        wrapped_text = textwrap.fill(text, width=100)
        print(wrapped_text +'\n\n')
        # return assistant_text

def count_words(input_string):
    words = input_string.split(" ")
    return len(words)

def summarize_chunks(chunks, model, tokenizer):
    summaries = []
    for chunk in chunks:
        output = llm_chain.run(chunk)
        # print(count_words(output))
        # parse_text(output)
        summaries.append(output)
    return summaries

def create_final_summary(summaries):
    # Option 1: Just join the summaries
    final_summary = ' '.join(summaries)

    # Option 2: Apply another round of summarization (can be useful for coherence)
    # final_summary = generate(final_summary)  # This is recursive and might degrade quality

    return final_summary

def chunk_text_with_overlap(text, chunk_word_count, overlap_word_count):
    words = text.split()
    chunks = []
    index = 0

    while index < len(words):
        # Calculate the end index for the current chunk
        current_chunk_end = index + chunk_word_count
        
        # We don't want to overshoot the list of words for the current chunk
        current_chunk_end = min(current_chunk_end, len(words))

        # Create the chunk
        chunk = " ".join(words[index:current_chunk_end])
        chunks.append(chunk)

        # Calculate the start index for the next chunk (considering overlap)
        index += chunk_word_count - overlap_word_count

        # If the calculated index doesn't advance (due to large overlap), we force it to advance to avoid an infinite loop
        if index >= current_chunk_end:
            index = current_chunk_end

    return chunks

# Function to load data from the JSON file and extract the desired information.
def load_and_extract_data(file_path):
    # Reading the file.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)  # Parsing the JSON data.

    for o in data["casebody"]["data"]["opinions"]:
        if o["type"] == "majority":
            return o["text"]
        else:
            return None

def save_summary_to_text(summary, output_folder, file_path, condensed=False):
    """
    Save the content of 'summary' to a text file derived from the name of the input file.
    """
    # Extract the base file name without extension
    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # Construct the new file name for the summary
    if condensed:
        summary_file_name = f"{base_name}_condensed_summary.txt"
    else:
        summary_file_name = f"{base_name}_summary.txt"
    
    # Construct the full path for the summary file
    summary_file_path = os.path.join(output_folder, summary_file_name)

    try:
        # Using 'with' for proper file closure
        with open(summary_file_path, 'w', encoding='utf-8') as file:
            file.write(summary)
        print(f"Summary successfully written to {summary_file_name}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def read_file(file_path):
    """
    Read the content of a text file.

    :param file_path: str, path to the file to read.
    :return: str, content of the file.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        return content
    except IOError as e:
        print(f"Error reading file {file_path}: {e}")
        return None

# Bulk Opinion Summarizing

In [5]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
instruction = "Summarize the following case document. Remember to include the relevant facts and rules of the case. {text}"
system_prompt = "You are a legal expert who specializes in summarizing case documents while including the legal facts."

template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [8]:
input_folder = 'ref_case_small'
output_folder = 'ref_case_summaries_small'
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None

llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
instruction = "Summarize the following case document. Remember to include the relevant facts and rules of the case. {text}"
system_prompt = "You are a legal expert who specializes in summarizing case documents while including the legal facts."

template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

def summarize_a_batch_of_case_documents(batch_size):
    # Define the filename for the processed files DataFrame
    processed_files_csv = 'processed_files_for_summarizing_small.csv'
    
    # Check if the CSV file exists and load it, otherwise create an empty DataFrame
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])
    
    # Counter for the number of processed files in the batch
    processed_count = 0

    # Retrieve all filenames, sorted to ensure consistency across runs
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename.endswith(".json"):
            if filename in processed_files_df['file_name'].values:
                    print(f"File {filename} has already been processed. Skipping.")
                    continue
            else:
                summarize_a_case_document(filename)
                processed_count += 1  # Increment the processed files counter

                # Add the file to the processed DataFrame
                new_row = {"file_name": filename}
                processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
                processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename):
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    opinion = str(opinion)
    
    chunk_word_count = 1000  # for instance, around 1000 words per chunk
    overlap_word_count = 200  # for instance, overlap of 200 words
    
    chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    chunk_summaries = summarize_chunks(chunks, model, tokenizer)
    final_summary = create_final_summary(chunk_summaries)
    
    # Save the summary to a text file
    save_summary_to_text(final_summary, output_folder, file_path, condensed=False)

In [9]:
for i in range(16):
    start = time.time()
    summarize_a_batch_of_case_documents(1)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()

Processing:  case_103.json
Summary successfully written to case_103_summary.txt
Time elapsed: 1371.5429515838623
Processing:  case_104.json
Summary successfully written to case_104_summary.txt
Time elapsed: 486.6129982471466
Processing:  case_116.json
Summary successfully written to case_116_summary.txt
Time elapsed: 621.269606590271
Processing:  case_135.json




Summary successfully written to case_135_summary.txt
Time elapsed: 393.5242073535919
Processing:  case_146.json




Summary successfully written to case_146_summary.txt
Time elapsed: 743.0797629356384
Processing:  case_175.json




Summary successfully written to case_175_summary.txt
Time elapsed: 914.5540976524353
Processing:  case_225.json




Summary successfully written to case_225_summary.txt
Time elapsed: 552.0695128440857
Processing:  case_231.json




Summary successfully written to case_231_summary.txt
Time elapsed: 751.2099440097809
Processing:  case_235.json




Summary successfully written to case_235_summary.txt
Time elapsed: 732.7441337108612
Processing:  case_242.json




Summary successfully written to case_242_summary.txt
Time elapsed: 1061.977353811264
Processing:  case_249.json




Summary successfully written to case_249_summary.txt
Time elapsed: 1015.3821969032288
Processing:  case_278.json




Summary successfully written to case_278_summary.txt
Time elapsed: 754.6447155475616
Processing:  case_284.json




Summary successfully written to case_284_summary.txt
Time elapsed: 518.210547208786
Processing:  case_289.json




Summary successfully written to case_289_summary.txt
Time elapsed: 867.6442766189575
Processing:  case_316.json




Summary successfully written to case_316_summary.txt
Time elapsed: 812.6879885196686
Processing:  case_79.json




Summary successfully written to case_79_summary.txt
Time elapsed: 1034.909318447113


In [10]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Nov  1 21:52:09 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.07             Driver Version: 537.34       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090 Ti     On  | 00000000:06:00.0  On |                  Off |
|  0%   51C    P5              46W / 450W |  10346MiB /

In [11]:
torch.cuda.empty_cache()

In [12]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Wed Nov  1 21:52:09 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.07             Driver Version: 537.34       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090 Ti     On  | 00000000:06:00.0  On |                  Off |
|  0%   51C    P5              45W / 450W |  10346MiB /

## Bulk Summarization without special prompting

In [14]:
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv):
    # Check if the CSV file exists and load it, otherwise create an empty DataFrame
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])
    
    # Counter for the number of processed files in the batch
    processed_count = 0

    # Retrieve all filenames, sorted to ensure consistency across runs
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename.endswith(".json"):
            if filename in processed_files_df['file_name'].values:
                    print(f"File {filename} has already been processed. Skipping.")
                    continue
            else:
                summarize_a_case_document(filename)
                processed_count += 1  # Increment the processed files counter

                # Add the file to the processed DataFrame
                new_row = {"file_name": filename}
                processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
                processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename):
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    opinion = str(opinion)
    
    chunk_word_count = 1000  # for instance, around 1000 words per chunk
    overlap_word_count = 200  # for instance, overlap of 200 words
    
    chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    chunk_summaries = summarize_chunks(chunks, model, tokenizer)
    final_summary = create_final_summary(chunk_summaries)
    
    # Save the summary to a text file
    save_summary_to_text(final_summary, output_folder, file_path, condensed=False)

In [15]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
instruction = "Summarize the following: {text}"
system_prompt = "You are an expert summarizier."

template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

In [16]:
input_folder = 'ref_case_small'
output_folder = 'ref_case_llama_regular_summaries_small'
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None
# Define the filename for the processed files DataFrame
processed_files_csv = 'processed_files_for_llama_regular_summarizing_small.csv'

In [17]:
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    summarize_a_batch_of_case_documents(1, processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()

Processing:  case_103.json




Summary successfully written to case_103_summary.txt
Time elapsed: 936.8628714084625
Processing:  case_104.json




Summary successfully written to case_104_summary.txt
Time elapsed: 276.73224091529846
Processing:  case_116.json




Summary successfully written to case_116_summary.txt
Time elapsed: 399.6183753013611
Processing:  case_135.json




Summary successfully written to case_135_summary.txt
Time elapsed: 266.5895538330078
Processing:  case_146.json




Summary successfully written to case_146_summary.txt
Time elapsed: 208.6289381980896
Processing:  case_175.json




Summary successfully written to case_175_summary.txt
Time elapsed: 723.4104678630829
Processing:  case_225.json




Summary successfully written to case_225_summary.txt
Time elapsed: 446.85349225997925
Processing:  case_231.json




Summary successfully written to case_231_summary.txt
Time elapsed: 496.8048174381256
Processing:  case_235.json




Summary successfully written to case_235_summary.txt
Time elapsed: 495.87283086776733
Processing:  case_242.json




Summary successfully written to case_242_summary.txt
Time elapsed: 909.6987643241882
Processing:  case_249.json




Summary successfully written to case_249_summary.txt
Time elapsed: 664.7188773155212
Processing:  case_278.json




Summary successfully written to case_278_summary.txt
Time elapsed: 508.27871775627136
Processing:  case_284.json




Summary successfully written to case_284_summary.txt
Time elapsed: 212.46659922599792
Processing:  case_289.json




Summary successfully written to case_289_summary.txt
Time elapsed: 584.2650899887085
Processing:  case_316.json




Summary successfully written to case_316_summary.txt
Time elapsed: 525.4488999843597
Processing:  case_335.json




Summary successfully written to case_335_summary.txt
Time elapsed: 220.56410121917725
Processing:  case_352.json




Summary successfully written to case_352_summary.txt
Time elapsed: 637.4101104736328
Processing:  case_79.json




Summary successfully written to case_79_summary.txt
Time elapsed: 551.267014503479


# Bulk Holding Generation with Llama 2

In [6]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
instruction = "Use the case document to extract the concise holding. {text} Here is a hint on the holding: {hint}"
system_prompt = "You are a legal expert who specializes in extracting accurate and concise holdings from case documents."

template = get_prompt(instruction, system_prompt)
print(template)

prompt = PromptTemplate(template=template, input_variables=["text", "hint"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

[INST]<<SYS>>
You are a legal expert who specializes in extracting accurate and concise holdings from case documents.
<</SYS>>

Use the case document to extract the concise holding. {text} Here is a hint on the holding: {hint}[/INST]


In [7]:
reference_df = pd.read_csv('case_references.csv')

In [8]:
def generate_holding(filename):
    file_path = os.path.join(input_folder, filename)
    summary = read_file(file_path)
    current_case_summary = str(summary)

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    case_number = int(base_name.split("_")[1])

    holding_hint = reference_df.iloc[case_number - 1, 6]
    print(holding_hint)

    input_dict = {'text': current_case_summary,
                  'hint': holding_hint
                 }
    
    output = llm_chain.run(input_dict)

    holding_file_name = f"{base_name}_holding.txt"
    
    # Construct the full path for the summary file
    holding_file_path = os.path.join(output_folder, holding_file_name)

    try:
        # Using 'with' for proper file closure
        with open(holding_file_path, 'w', encoding='utf-8') as file:
            file.write(output)
        print(f"Summary successfully written to {holding_file_path}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def batch_process_holdings(batch_size, processed_files_csv):
    
    # Check if the CSV file exists and load it, otherwise create an empty DataFrame
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])
    
    # Counter for the number of processed files in the batch
    processed_count = 0

    # Retrieve all filenames, sorted to ensure consistency across runs
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename in processed_files_df['file_name'].values:
                print(f"File {filename} has already been processed. Skipping.")
                continue
        else:
            generate_holding(filename)
            processed_count += 1  # Increment the processed files counter

            # Add the file to the processed DataFrame
            new_row = {"file_name": filename}
            processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
            processed_files_df.to_csv(processed_files_csv, index=False)

In [9]:
input_folder = 'ref_case_llama_regular_summaries_small'
output_folder = 'ref_case_llama_regular_sum_llama_holdings_small'
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None
# Define the filename for the processed files DataFrame
processed_files_csv = 'processed_files_llama_regular_sum_llama_for_holdings_small.csv'

for i in range(len(os.listdir(input_folder))):
    start = time.time()
    batch_process_holdings(1,processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

Processing:  case_103_summary.txt
holding state as parens patriae takes strong interest in care and treatment of children within its borders
Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_103_summary_holding.txt
Time elapsed: 76.03627490997314
Processing:  case_104_summary.txt
holding in addition that offense of obstructing a law enforcement officer did not meet therein element of burglary
Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_104_summary_holding.txt
Time elapsed: 69.09998416900635
Processing:  case_116_summary.txt
holding that proximate cause is legal concept with particular meaning in law and is not in category of words or phrases commonly known and understood by lay public thus it was error not to give instruction defining it
Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_116_summary_holding.txt
Time elapsed: 47.456737756729126
Processing:  case_135_summary.txt




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_249_summary_holding.txt
Time elapsed: 92.91756248474121
Processing:  case_278_summary.txt
holding that fixing tickets by passing them to the inactive files without requiring the defendants to appear in court and over the objections of the issuing officer constituted moral turpitude




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_278_summary_holding.txt
Time elapsed: 61.606207609176636
Processing:  case_284_summary.txt
holding similarly where appellant claimed that she was entitled to recover attorney fees under nrs 18010 even though nrcp 68f and nrs 171154 foreclosed such a recovery




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_284_summary_holding.txt
Time elapsed: 50.36093878746033
Processing:  case_289_summary.txt
holding that juveniles may waive constitutional rights




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_289_summary_holding.txt
Time elapsed: 129.9863474369049
Processing:  case_316_summary.txt
holding that article i section 19 of the missouri constitution requires that once a witness claims the privilege against selfinerimination afforded by that provision a rebuttable presumption arises that the witness answer might tend to incriminate him




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_316_summary_holding.txt
Time elapsed: 59.22045588493347
Processing:  case_335_summary.txt
recognizing if truth about paternity can be discovered and equity does not demand otherwise presumption of legitimacy should not be used to perpetuate a falsehood




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_335_summary_holding.txt
Time elapsed: 109.90361285209656
Processing:  case_352_summary.txt
holding fjraud on the part of the insured in the procurement of the policy  is sufficient to defeat a recovery in an action on such policy




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_352_summary_holding.txt
Time elapsed: 64.18657112121582
Processing:  case_79_summary.txt
holding that appeals are taken from judgments and not from opinions let alone dicta




Summary successfully written to ref_case_llama_regular_sum_llama_holdings_small/case_79_summary_holding.txt
Time elapsed: 54.9709587097168


# Bulk Holding Generation Without Hint 
## - Using Llama2

In [11]:
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
instruction = "Use the case document to extract the concise holding. {text}"
system_prompt = "You are a legal expert who specializes in extracting accurate and concise holdings from case documents."

template = get_prompt(instruction, system_prompt)
print(template)

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

[INST]<<SYS>>
You are a legal expert who specializes in extracting accurate and concise holdings from case documents.
<</SYS>>

Use the case document to extract the concise holding. {text}[/INST]


In [12]:
reference_df = pd.read_csv('case_references.csv')

In [13]:
def generate_holding(filename):
    file_path = os.path.join(input_folder, filename)
    summary = read_file(file_path)
    current_case_summary = str(summary)

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    case_number = int(base_name.split("_")[1])

    holding_hint = reference_df.iloc[case_number - 1, 6]
    print(holding_hint)

    input_dict = {'text': current_case_summary,
                  'hint': holding_hint
                 }
    
    output = llm_chain.run(input_dict)

    holding_file_name = f"{base_name}_holding.txt"
    
    # Construct the full path for the summary file
    holding_file_path = os.path.join(output_folder, holding_file_name)

    try:
        # Using 'with' for proper file closure
        with open(holding_file_path, 'w', encoding='utf-8') as file:
            file.write(output)
        print(f"Summary successfully written to {holding_file_path}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def batch_process_holdings(batch_size, processed_files_csv):
    
    # Check if the CSV file exists and load it, otherwise create an empty DataFrame
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])
    
    # Counter for the number of processed files in the batch
    processed_count = 0

    # Retrieve all filenames, sorted to ensure consistency across runs
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename in processed_files_df['file_name'].values:
                print(f"File {filename} has already been processed. Skipping.")
                continue
        else:
            generate_holding(filename)
            processed_count += 1  # Increment the processed files counter

            # Add the file to the processed DataFrame
            new_row = {"file_name": filename}
            processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
            processed_files_df.to_csv(processed_files_csv, index=False)

In [16]:
input_folder = 'ref_case_legalBertLarge_summaries_small'
output_folder = 'ref_case_legalBertLarge_sum_llama_holdings_without_hint_small'
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None
# Define the filename for the processed files DataFrame
processed_files_csv = 'processed_files_legalBertLarge_sum_llama_for_holdings_without_hint_small.csv'

for i in range(len(os.listdir(input_folder))):
    start = time.time()
    batch_process_holdings(1,processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

Processing:  case_103_summary.txt
holding state as parens patriae takes strong interest in care and treatment of children within its borders




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_103_summary_holding.txt
Time elapsed: 61.34729218482971
Processing:  case_104_summary.txt
holding in addition that offense of obstructing a law enforcement officer did not meet therein element of burglary




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_104_summary_holding.txt
Time elapsed: 55.870909214019775
Processing:  case_116_summary.txt
holding that proximate cause is legal concept with particular meaning in law and is not in category of words or phrases commonly known and understood by lay public thus it was error not to give instruction defining it




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_116_summary_holding.txt
Time elapsed: 63.08406114578247
Processing:  case_135_summary.txt
recognizing that an owner of property could be personally liable to one who voluntarily pays ad valorem taxes if the parties executed a valid contract or agreement




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_135_summary_holding.txt
Time elapsed: 55.00631761550903
Processing:  case_146_summary.txt
holding student loan proceeds did not lose exempt status after excess of proceeds after payment of tuition and fees was deposited in students bank account




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_146_summary_holding.txt
Time elapsed: 51.81976509094238
Processing:  case_175_summary.txt
holding that appellants testimony negated the intoxication defense by demonstrating he was in control of his mental faculties




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_175_summary_holding.txt
Time elapsed: 89.6313304901123
Processing:  case_225_summary.txt
holding the commission cannot rely on a va rating to find a claimant was totally disabled




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_225_summary_holding.txt
Time elapsed: 89.94023299217224
Processing:  case_231_summary.txt
recognizing that a dismissal without prejudice along with a limited opportunity to amend the complaint within twenty days only became final and binding when no amended pleadings were filed within the time period allowed




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_231_summary_holding.txt
Time elapsed: 66.58955407142639
Processing:  case_235_summary.txt
holding that a claim for interest on a money judgment did not accrue until the judgment was rendered which was the date on which the trial court issued its order




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_235_summary_holding.txt
Time elapsed: 103.6399302482605
Processing:  case_242_summary.txt
holding that a court may not rewrite the insurance contract under the guise of judicial interpretation




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_242_summary_holding.txt
Time elapsed: 58.29112195968628
Processing:  case_249_summary.txt
recognizing that intentional furnishing of false information of a material nature is breach of cooperation clause and discussing misrepresentations provided during investigation and trial of claim




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_249_summary_holding.txt
Time elapsed: 39.222641468048096
Processing:  case_278_summary.txt
holding that fixing tickets by passing them to the inactive files without requiring the defendants to appear in court and over the objections of the issuing officer constituted moral turpitude




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_278_summary_holding.txt
Time elapsed: 80.85450530052185
Processing:  case_284_summary.txt
holding similarly where appellant claimed that she was entitled to recover attorney fees under nrs 18010 even though nrcp 68f and nrs 171154 foreclosed such a recovery




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_284_summary_holding.txt
Time elapsed: 169.42029929161072
Processing:  case_289_summary.txt
holding that juveniles may waive constitutional rights




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_289_summary_holding.txt
Time elapsed: 45.93712663650513
Processing:  case_316_summary.txt
holding that article i section 19 of the missouri constitution requires that once a witness claims the privilege against selfinerimination afforded by that provision a rebuttable presumption arises that the witness answer might tend to incriminate him




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_316_summary_holding.txt
Time elapsed: 60.90019989013672
Processing:  case_335_summary.txt
recognizing if truth about paternity can be discovered and equity does not demand otherwise presumption of legitimacy should not be used to perpetuate a falsehood




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_335_summary_holding.txt
Time elapsed: 68.29979920387268
Processing:  case_352_summary.txt
holding fjraud on the part of the insured in the procurement of the policy  is sufficient to defeat a recovery in an action on such policy




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_352_summary_holding.txt
Time elapsed: 54.50455331802368
Processing:  case_79_summary.txt
holding that appeals are taken from judgments and not from opinions let alone dicta




Summary successfully written to ref_case_legalBertLarge_sum_llama_holdings_without_hint_small/case_79_summary_holding.txt
Time elapsed: 54.875834226608276


# Bulk Holding Generation with Mistral

# Longformer Bulk Summarization
## currently does not chunk text larger than context window
## Will modify in the future if this proves to be a viable route

In [4]:
from transformers import LEDForConditionalGeneration, LEDTokenizer, pipeline, AutoModel

In [5]:
model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")

In [6]:
# LED_model_folder = "./LED"
# os.makedirs(LED_model_folder, exist_ok=True)
# model.save_pretrained(LED_model_folder)
# tokenizer.save_pretrained(LED_model_folder)

In [14]:
# LED_model_folder = "./LED"
# tokenizer = LEDForConditionalGeneration.from_pretrained(LED_model_folder)
# model = LEDTokenizer.from_pretrained(LED_model_folder)
LED_summarizer_pipeline = pipeline("summarization", model=model, tokenizer=tokenizer, device=0)  # 'device=0' to use the GPU

In [34]:
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv):
 
    # Check if the CSV file exists and load it, otherwise create an empty DataFrame
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])
    
    # Counter for the number of processed files in the batch
    processed_count = 0

    # Retrieve all filenames, sorted to ensure consistency across runs
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename.endswith(".json"):
            if filename in processed_files_df['file_name'].values:
                    print(f"File {filename} has already been processed. Skipping.")
                    continue
            else:
                summarize_a_case_document(filename)
                processed_count += 1  # Increment the processed files counter

                # Add the file to the processed DataFrame
                new_row = {"file_name": filename}
                processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
                processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename):
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    opinion = str(opinion)
    
    # chunk_word_count = 1000  # for instance, around 1000 words per chunk
    # overlap_word_count = 200  # for instance, overlap of 200 words
    
    # chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    summary = LED_summarizer_pipeline(opinion, min_length=30, max_length=1000)
    summary_text = summary[0]['summary_text']

    # Save the summary to a text file
    save_summary_to_text(summary_text, output_folder, file_path, condensed=False)

In [35]:
input_folder = 'ref_case_small'
output_folder = 'ref_case_LED_summaries_small'
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None

# Define the filename for the processed files DataFrame
processed_files_csv = 'processed_files_for_LED_summarizing_small.csv'

In [36]:
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    summarize_a_batch_of_case_documents(1, processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

Processing:  case_104.json




Summary successfully written to case_104_summary.txt
Time elapsed: 7.562493801116943
Processing:  case_116.json




Summary successfully written to case_116_summary.txt
Time elapsed: 7.652428388595581
Processing:  case_135.json




Summary successfully written to case_135_summary.txt
Time elapsed: 6.961967468261719
Processing:  case_146.json




Summary successfully written to case_146_summary.txt
Time elapsed: 6.355541944503784
Processing:  case_175.json




Summary successfully written to case_175_summary.txt
Time elapsed: 6.683484077453613
Processing:  case_225.json




Summary successfully written to case_225_summary.txt
Time elapsed: 7.277122259140015
Processing:  case_231.json




Summary successfully written to case_231_summary.txt
Time elapsed: 6.84537148475647
Processing:  case_235.json




Summary successfully written to case_235_summary.txt
Time elapsed: 7.338288307189941
Processing:  case_242.json




Summary successfully written to case_242_summary.txt
Time elapsed: 7.209223985671997
Processing:  case_249.json




Summary successfully written to case_249_summary.txt
Time elapsed: 7.26111626625061
Processing:  case_278.json




Summary successfully written to case_278_summary.txt
Time elapsed: 7.177913665771484
Processing:  case_284.json




Summary successfully written to case_284_summary.txt
Time elapsed: 2.1981449127197266
Processing:  case_289.json




Summary successfully written to case_289_summary.txt
Time elapsed: 6.974234580993652
Processing:  case_316.json




Summary successfully written to case_316_summary.txt
Time elapsed: 7.456243276596069
Processing:  case_335.json




Summary successfully written to case_335_summary.txt
Time elapsed: 6.600247383117676
Processing:  case_352.json




Summary successfully written to case_352_summary.txt
Time elapsed: 7.210155963897705
Processing:  case_79.json




Summary successfully written to case_79_summary.txt
Time elapsed: 7.394057512283325
Time elapsed: 0.004387378692626953


# Long T5 Bulk Summarization
## Currently does not chunk text larger than context window
## Will modify in the future if this proves to be a viable route

In [37]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration, pipeline
model = (
    LongT5ForConditionalGeneration.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
    .to("cuda")
    .half()
)
tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")

Downloading (…)lve/main/config.json:   0%|          | 0.00/853 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.34k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [38]:
Long_T5_summarizer_pipeline = pipeline("summarization", model=model, tokenizer=tokenizer, device=0)  # 'device=0' to use the GPU

In [43]:
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv, pipeline):
 
    # Check if the CSV file exists and load it, otherwise create an empty DataFrame
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])
    
    # Counter for the number of processed files in the batch
    processed_count = 0

    # Retrieve all filenames, sorted to ensure consistency across runs
    filenames = sorted(os.listdir(input_folder))
    
    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        last_processed_index = filenames.index(last_filename) + 1

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename.endswith(".json"):
            if filename in processed_files_df['file_name'].values:
                    print(f"File {filename} has already been processed. Skipping.")
                    continue
            else:
                summarize_a_case_document(filename, pipeline)
                processed_count += 1  # Increment the processed files counter

                # Add the file to the processed DataFrame
                new_row = {"file_name": filename}
                processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
                processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename, pipeline):
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    opinion = str(opinion)
    
    # chunk_word_count = 1000  # for instance, around 1000 words per chunk
    # overlap_word_count = 200  # for instance, overlap of 200 words
    
    # chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    summary = pipeline(opinion, min_length=30, max_length=1000)
    summary_text = summary[0]['summary_text']

    # Save the summary to a text file
    save_summary_to_text(summary_text, output_folder, file_path, condensed=False)

In [44]:
input_folder = 'ref_case_small'
output_folder = 'ref_case_Long_T5_summaries_small'
os.makedirs(output_folder, exist_ok=True)
processed_files_df = None

# Define the filename for the processed files DataFrame
processed_files_csv = 'processed_files_for_Long_T5_summarizing_small.csv'

In [45]:
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    summarize_a_batch_of_case_documents(1, processed_files_csv, Long_T5_summarizer_pipeline)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

Processing:  case_103.json




Summary successfully written to case_103_summary.txt
Time elapsed: 9.881898880004883
Processing:  case_104.json




Summary successfully written to case_104_summary.txt
Time elapsed: 7.390232563018799
Processing:  case_116.json




Summary successfully written to case_116_summary.txt
Time elapsed: 6.77020788192749
Processing:  case_135.json




Summary successfully written to case_135_summary.txt
Time elapsed: 3.730167865753174
Processing:  case_146.json




Summary successfully written to case_146_summary.txt
Time elapsed: 58.532248735427856
Processing:  case_175.json




Summary successfully written to case_175_summary.txt
Time elapsed: 59.602532625198364
Processing:  case_225.json




Summary successfully written to case_225_summary.txt
Time elapsed: 6.0869269371032715
Processing:  case_231.json




Summary successfully written to case_231_summary.txt
Time elapsed: 61.00665855407715
Processing:  case_235.json




Summary successfully written to case_235_summary.txt
Time elapsed: 5.824497938156128
Processing:  case_242.json




Summary successfully written to case_242_summary.txt
Time elapsed: 60.64083981513977
Processing:  case_249.json




Summary successfully written to case_249_summary.txt
Time elapsed: 7.613674163818359
Processing:  case_278.json




Summary successfully written to case_278_summary.txt
Time elapsed: 5.797230005264282
Processing:  case_284.json




Summary successfully written to case_284_summary.txt
Time elapsed: 4.169192552566528
Processing:  case_289.json




Summary successfully written to case_289_summary.txt
Time elapsed: 4.906538963317871
Processing:  case_316.json




Summary successfully written to case_316_summary.txt
Time elapsed: 7.803145170211792
Processing:  case_335.json




Summary successfully written to case_335_summary.txt
Time elapsed: 5.496894121170044
Processing:  case_352.json




Summary successfully written to case_352_summary.txt
Time elapsed: 7.082246780395508
Processing:  case_79.json




Summary successfully written to case_79_summary.txt
Time elapsed: 57.95717000961304
