In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import json
import textwrap
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.memory import ConversationBufferMemory
import pandas as pd
import time
import gc

In [None]:
# Load variables from a local .env file into the process environment, then
# authenticate with the Hugging Face Hub using the value stored under HF_AUTH_TOKEN.
load_dotenv()
# os.getenv returns None when HF_AUTH_TOKEN is not set.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HF_AUTH_TOKEN")
login(token=HUGGINGFACEHUB_API_TOKEN)

# This notebook is a combination of the various models used to generate holdings and some summaries
## - To use this notebook, scroll to the section you want to use and only run those cells

# Skip the next two cells if you don't want to use Llama 2

In [None]:
# Load the tokenizer and the model weights from a local directory.
model_directory = "./Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# device_map='auto' leaves weight placement to the loader; the 4-bit loading flag
# is enabled and the 8-bit alternative is left commented out.
# NOTE(review): presumably requires the bitsandbytes package -- confirm.
model = AutoModelForCausalLM.from_pretrained(model_directory,
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             # load_in_8bit=True,
                                             load_in_4bit=True
                                             )

In [None]:
# START: REFACTORED FROM <https://colab.research.google.com/drive/1Ssg-fffeJ0LG0m3DoTofeLPvOUQyG1h3?usp=sharing>
# Text-generation pipeline built around the `model` and `tokenizer` objects above.
# Sampling is enabled (do_sample=True, top_k=30), so repeated runs can differ.
# NOTE(review): `model` was loaded with torch.float16 while bfloat16 is requested
# here; because an already-instantiated model is passed in, confirm whether
# torch_dtype / device_map have any effect at this point.
pipe = pipeline("text-generation",
                model=model,
                tokenizer= tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens = 1024,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id
                )
# END: REFACTORED FROM <https://colab.research.google.com/drive/1Ssg-fffeJ0LG0m3DoTofeLPvOUQyG1h3?usp=sharing>

# The cell below has important functions
## It's best to run it, regardless of which task and model you're using

In [None]:
# START: COPIED FROM <https://colab.research.google.com/drive/1Ssg-fffeJ0LG0m3DoTofeLPvOUQyG1h3?usp=sharing>
# Delimiters that wrap the instruction and the system prompt.
B_INST = "[INST]"
E_INST = "[/INST]"
B_SYS = "<<SYS>>\n"
E_SYS = "\n<</SYS>>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT):
    """Assemble a full prompt string from an instruction and a system prompt.

    The result is laid out as
    ``[INST]<<SYS>>\\n{system prompt}\\n<</SYS>>\\n\\n{instruction}[/INST]``.

    Args:
        instruction: User instruction text; may contain template placeholders
            such as ``{text}``, which are passed through unchanged.
        new_system_prompt: System prompt text; defaults to DEFAULT_SYSTEM_PROMPT.

    Returns:
        The concatenated prompt string.
    """
    pieces = (B_INST, B_SYS, new_system_prompt, E_SYS, instruction, E_INST)
    return "".join(pieces)
# END: COPIED FROM <https://colab.research.google.com/drive/1Ssg-fffeJ0LG0m3DoTofeLPvOUQyG1h3?usp=sharing>

def count_words(input_string):
    """Return the number of whitespace-separated words in ``input_string``.

    Splitting on any run of whitespace (rather than on a single space) means
    an empty string counts as 0 words, and repeated spaces, tabs or newlines
    between words do not inflate the count.
    """
    return len(input_string.split())

def summarize_chunks(chunks, model, tokenizer):
    """Run every chunk through the notebook-level ``llm_chain``.

    Args:
        chunks: Iterable of text chunks to summarize.
        model: Accepted for signature compatibility; not used by this function.
        tokenizer: Accepted for signature compatibility; not used by this function.

    Returns:
        A list with one ``llm_chain.run(chunk)`` result per chunk, in order.
    """
    return [llm_chain.run(piece) for piece in chunks]

def create_final_summary(summaries):
    """Combine per-chunk summaries into one string separated by single spaces.

    A second summarization pass over the joined text was tried and dropped
    (it did not preserve the factuality of holdings), so the summaries are
    simply concatenated.
    """
    # final_summary = generate(final_summary)  # rejected second-pass option
    separator = ' '
    return separator.join(summaries)

# Grab chunks of text to summarize
# It has an overlap to make sure each chunk has context of the previous chunk
def chunk_text_with_overlap(text, chunk_word_count, overlap_word_count):
    """Split ``text`` into word chunks where consecutive chunks share some words.

    Args:
        text: Text to split; words are separated on any whitespace.
        chunk_word_count: Maximum number of words per chunk (must be > 0).
        overlap_word_count: Number of words each chunk repeats from the end of
            the previous one.

    Returns:
        List of chunk strings (words re-joined with single spaces). Empty for
        empty or whitespace-only text.

    Raises:
        ValueError: If ``chunk_word_count`` is not positive.
    """
    if chunk_word_count <= 0:
        raise ValueError("chunk_word_count must be a positive integer")

    words = text.split()

    step = chunk_word_count - overlap_word_count
    if step <= 0:
        # An overlap as large as the chunk would never advance (infinite loop);
        # fall back to non-overlapping chunks.
        step = chunk_word_count
    # A negative overlap must not skip words between chunks.
    step = min(step, chunk_word_count)

    chunks = []
    for start in range(0, len(words), step):
        end = min(start + chunk_word_count, len(words))
        chunks.append(" ".join(words[start:end]))
        if end == len(words):
            # The text is fully covered; another chunk would only repeat
            # words that are already part of this one.
            break

    return chunks

# Getting the majority opinion from the CaseLaw json file
def load_and_extract_data(file_path):
    """Return the text of the first opinion whose type is "majority".

    Args:
        file_path: Path to a JSON file with opinions stored under
            ``data["casebody"]["data"]["opinions"]``.

    Returns:
        The ``"text"`` of the first majority opinion, or None when the file
        contains no majority opinion.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Check every opinion; returning None inside the loop would give up
    # whenever the majority opinion is not listed first.
    for opinion in data["casebody"]["data"]["opinions"]:
        if opinion["type"] == "majority":
            return opinion["text"]
    return None

# Writes a summary to a text file named after the source document
def save_summary_to_text(summary, output_folder, file_path, condensed=False):
    """Write ``summary`` into ``output_folder`` under a name derived from ``file_path``.

    The output name is ``<stem>_summary.txt`` or, when ``condensed`` is true,
    ``<stem>_condensed_summary.txt``, where ``<stem>`` is the base name of
    ``file_path`` without its extension. Write failures are reported with
    ``print`` rather than raised.
    """
    stem, _extension = os.path.splitext(os.path.basename(file_path))
    summary_file_name = f"{stem}_condensed_summary.txt" if condensed else f"{stem}_summary.txt"
    destination = os.path.join(output_folder, summary_file_name)

    try:
        with open(destination, 'w', encoding='utf-8') as handle:
            handle.write(summary)
        print(f"Summary successfully written to {summary_file_name}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def read_file(file_path):
    """Return the UTF-8 text content of ``file_path``.

    An IOError is reported with ``print`` and results in None instead of an
    exception.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return handle.read()
    except IOError as e:
        print(f"Error reading file {file_path}: {e}")
    return None

# Bulk Opinion Summarizing

In [None]:
# Wrap the transformers pipeline for LangChain and build the summarization chain.
# NOTE(review): `pipe` is created with do_sample=True; confirm whether the
# temperature given in model_kwargs actually reaches generation.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
# {text} is the placeholder the chain fills with each chunk at run time.
instruction = "Summarize the following case document. Remember to include the relevant facts and rules of the case. {text}"
system_prompt = "You are a legal expert who specializes in summarizing case documents while including the legal facts."

# Wrap the instruction and system prompt in the [INST] / <<SYS>> delimiters.
template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])
# summarize_chunks() reads this module-level `llm_chain`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Input/output locations read as globals by the summarization functions below.
input_folder = 'ref_case_small'
output_folder = 'ref_case_summaries_small'
os.makedirs(output_folder, exist_ok=True)
# Module-level placeholder; the batch function works on its own local variable
# of the same name.
processed_files_df = None

# Same chain configuration as the preceding cell, repeated so that this cell can
# be run on its own.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
instruction = "Summarize the following case document. Remember to include the relevant facts and rules of the case. {text}"
system_prompt = "You are a legal expert who specializes in summarizing case documents while including the legal facts."

template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])

llm_chain = LLMChain(prompt=prompt, llm=llm)

# Summarizes case documents and uses a csv file to keep track of finished files
# This code has been refactored several times so the name and batch_size are kind of outdated
def summarize_a_batch_of_case_documents(batch_size):
    """Summarize up to ``batch_size`` not-yet-processed ``.json`` files.

    Files are taken from the global ``input_folder`` in sorted order. Progress
    is stored in 'processed_files_for_summarizing_small.csv' (one ``file_name``
    per row), which is rewritten after every processed file so that an
    interrupted run can resume.

    Args:
        batch_size: Maximum number of files to summarize in this call.
    """
    processed_files_csv = 'processed_files_for_summarizing_small.csv'

    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    processed_count = 0
    filenames = sorted(os.listdir(input_folder))

    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        try:
            last_processed_index = filenames.index(last_filename) + 1
        except ValueError:
            # The last recorded file is no longer in input_folder. Rescan from
            # the start; the membership check below still skips finished files.
            last_processed_index = 0

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if not filename.endswith(".json"):
            continue
        if filename in processed_files_df['file_name'].values:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        summarize_a_case_document(filename)
        processed_count += 1

        # Persist progress immediately so a crash loses at most one file.
        new_row = {"file_name": filename}
        processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename):
    """Summarize one case file from ``input_folder`` and save the result.

    The majority opinion is split into overlapping chunks of 1000 words
    (200-word overlap), each chunk is summarized, and the joined summaries are
    written to the global ``output_folder``. Files without a majority opinion
    are reported and skipped.

    Args:
        filename: Name of the JSON file inside ``input_folder``.
    """
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    if opinion is None:
        # str(None) would send the literal text "None" to the model and save
        # a meaningless summary.
        print(f"No majority opinion found in {filename}. Skipping.")
        return
    opinion = str(opinion)

    chunk_word_count = 1000
    overlap_word_count = 200

    chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    chunk_summaries = summarize_chunks(chunks, model, tokenizer)
    final_summary = create_final_summary(chunk_summaries)

    # Save the summary to a text file
    save_summary_to_text(final_summary, output_folder, file_path, condensed=False)

In [None]:
# Make 16 calls that each summarize at most one document, timing every call
# and releasing cached GPU memory in between.
for i in range(16):
    start = time.time()
    summarize_a_batch_of_case_documents(1)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()

In [None]:
# Show GPU status before clearing the CUDA cache.
!nvidia-smi

In [None]:
# Release cached GPU memory held by PyTorch.
torch.cuda.empty_cache()

In [None]:
# Show GPU status again after clearing the CUDA cache.
!nvidia-smi

## Bulk Summarization without special prompting

In [None]:
# Summarizes case documents and uses a csv file to keep track of finished files
# This code has been refactored several times so the name and batch_size are kind of outdated
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv):
    """Summarize up to ``batch_size`` not-yet-processed ``.json`` files.

    Files are taken from the global ``input_folder`` in sorted order.

    Args:
        batch_size: Maximum number of files to summarize in this call.
        processed_files_csv: Path of the CSV (column ``file_name``) that records
            finished files; it is rewritten after every processed file so that
            an interrupted run can resume.
    """
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    processed_count = 0
    filenames = sorted(os.listdir(input_folder))

    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        try:
            last_processed_index = filenames.index(last_filename) + 1
        except ValueError:
            # The last recorded file is no longer in input_folder. Rescan from
            # the start; the membership check below still skips finished files.
            last_processed_index = 0

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if not filename.endswith(".json"):
            continue
        if filename in processed_files_df['file_name'].values:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        summarize_a_case_document(filename)
        processed_count += 1

        # Persist progress immediately so a crash loses at most one file.
        new_row = {"file_name": filename}
        processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename):
    """Summarize one case file from ``input_folder`` and save the result.

    The majority opinion is split into overlapping chunks of 1000 words
    (200-word overlap), each chunk is summarized, and the joined summaries are
    written to the global ``output_folder``. Files without a majority opinion
    are reported and skipped.

    Args:
        filename: Name of the JSON file inside ``input_folder``.
    """
    file_path = os.path.join(input_folder, filename)
    opinion = load_and_extract_data(file_path)
    if opinion is None:
        # str(None) would send the literal text "None" to the model and save
        # a meaningless summary.
        print(f"No majority opinion found in {filename}. Skipping.")
        return
    opinion = str(opinion)

    chunk_word_count = 1000
    overlap_word_count = 200

    chunks = chunk_text_with_overlap(opinion, chunk_word_count, overlap_word_count)

    chunk_summaries = summarize_chunks(chunks, model, tokenizer)
    final_summary = create_final_summary(chunk_summaries)

    save_summary_to_text(final_summary, output_folder, file_path, condensed=False)

In [None]:
# Chain for generic summarization, without the legal-specific prompt wording.
# NOTE(review): `pipe` is created with do_sample=True; confirm whether the
# temperature given in model_kwargs actually reaches generation.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
# {text} is the placeholder the chain fills with each chunk at run time.
instruction = "Summarize the following: {text}"
system_prompt = "You are an expert summarizier."

template = get_prompt(instruction, system_prompt)

prompt = PromptTemplate(template=template, input_variables=["text"])

# summarize_chunks() reads this module-level `llm_chain`.
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Input/output locations and the progress CSV for the generic summarization run.
input_folder = 'ref_case_small'
output_folder = 'ref_case_llama_regular_summaries_small'
os.makedirs(output_folder, exist_ok=True)
# Module-level placeholder; the batch function works on its own local variable
# of the same name.
processed_files_df = None
processed_files_csv = 'processed_files_for_llama_regular_summarizing_small.csv'

In [None]:
# One call per directory entry in input_folder; each call summarizes at most one
# file, is timed, and is followed by a release of cached GPU memory.
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    summarize_a_batch_of_case_documents(1, processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()

# Bulk Holding Generation with Llama 2

In [None]:
# Chain for holding extraction; the prompt takes the case text and a hint.
# NOTE(review): `pipe` is created with do_sample=True; confirm whether the
# temperature given in model_kwargs actually reaches generation.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
# {text} and {hint} are filled in per document at run time.
instruction = "Use the case document to extract the concise holding. {text} Here is a hint on the holding: {hint}"
system_prompt = "You are a legal expert who specializes in extracting accurate and concise holdings from case documents."

template = get_prompt(instruction, system_prompt)
# Print the assembled template for inspection.
print(template)

prompt = PromptTemplate(template=template, input_variables=["text", "hint"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Load the case reference table from CSV.
reference_df = pd.read_csv('case_references.csv')

In [None]:
# Creates a holding based on a summary by running the langchain pipeline
def generate_holding(filename):
    """Generate a holding for one summary file and write it to ``output_folder``.

    The summary is read from the global ``input_folder``; the hint comes from
    the global ``reference_df`` and both are passed to the global
    ``llm_chain``. The result is stored as ``<base_name>_holding.txt``.

    Args:
        filename: Name of the summary file inside ``input_folder``.
    """
    file_path = os.path.join(input_folder, filename)
    summary = read_file(file_path)
    if summary is None:
        # read_file already reported the error; str(None) would otherwise send
        # the literal text "None" to the model.
        return
    current_case_summary = str(summary)

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    # Takes the second "_"-separated field of the file name as a 1-based case
    # number. NOTE(review): assumes names like <prefix>_<number>_... -- confirm.
    case_number = int(base_name.split("_")[1])

    # NOTE(review): column index 6 is presumably the holding hint column -- confirm
    # against case_references.csv.
    holding_hint = reference_df.iloc[case_number - 1, 6]
    print(holding_hint)

    input_dict = {'text': current_case_summary,
                  'hint': holding_hint
                 }

    output = llm_chain.run(input_dict)

    holding_file_name = f"{base_name}_holding.txt"
    holding_file_path = os.path.join(output_folder, holding_file_name)

    try:
        with open(holding_file_path, 'w', encoding='utf-8') as file:
            file.write(output)
        print(f"Summary successfully written to {holding_file_path}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def batch_process_holdings(batch_size, processed_files_csv):
    """Generate holdings for up to ``batch_size`` not-yet-processed files.

    Files are taken from the global ``input_folder`` in sorted order.

    Args:
        batch_size: Maximum number of files to process in this call.
        processed_files_csv: Path of the CSV (column ``file_name``) that records
            finished files; it is rewritten after every processed file so that
            an interrupted run can resume.
    """
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    processed_count = 0
    filenames = sorted(os.listdir(input_folder))

    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        try:
            last_processed_index = filenames.index(last_filename) + 1
        except ValueError:
            # The last recorded file is no longer in input_folder. Rescan from
            # the start; the membership check below still skips finished files.
            last_processed_index = 0

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename in processed_files_df['file_name'].values:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        generate_holding(filename)
        processed_count += 1

        # Persist progress immediately so a crash loses at most one file.
        new_row = {"file_name": filename}
        processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_files_df.to_csv(processed_files_csv, index=False)

In [None]:
# The summaries written by the generic summarization run are the input here.
input_folder = 'ref_case_llama_regular_summaries_small'
output_folder = 'ref_case_llama_regular_sum_llama_holdings_small'
os.makedirs(output_folder, exist_ok=True)
# Module-level placeholder; the batch function works on its own local variable
# of the same name.
processed_files_df = None
processed_files_csv = 'processed_files_llama_regular_sum_llama_for_holdings_small.csv'

# One call per directory entry; each call handles at most one file, is timed,
# and is followed by GPU cache release and garbage collection.
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    batch_process_holdings(1,processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

# Bulk Holding Generation Without Hint 
## - Using Llama2

In [None]:
# Chain for holding extraction from the case text alone (no hint placeholder).
# NOTE(review): `pipe` is created with do_sample=True; confirm whether the
# temperature given in model_kwargs actually reaches generation.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
# {text} is filled in per document at run time.
instruction = "Use the case document to extract the concise holding. {text}"
system_prompt = "You are a legal expert who specializes in extracting accurate and concise holdings from case documents."

template = get_prompt(instruction, system_prompt)
# Print the assembled template for inspection.
print(template)

prompt = PromptTemplate(template=template, input_variables=["text"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [None]:
# Load the case reference table from CSV.
reference_df = pd.read_csv('case_references.csv')

In [None]:
def generate_holding(filename):
    """Generate a holding for one summary file without using a hint.

    The summary is read from the global ``input_folder`` and passed as the only
    input (``text``) to the global ``llm_chain``, whose prompt in this section
    declares no ``hint`` variable. The result is stored in the global
    ``output_folder`` as ``<base_name>_holding.txt``.

    Args:
        filename: Name of the summary file inside ``input_folder``.
    """
    file_path = os.path.join(input_folder, filename)
    summary = read_file(file_path)
    if summary is None:
        # read_file already reported the error; str(None) would otherwise send
        # the literal text "None" to the model.
        return
    current_case_summary = str(summary)

    base_name = os.path.splitext(os.path.basename(file_path))[0]

    # No hint lookup here: the prompt has no {hint} placeholder, so fetching one
    # only added a way to fail on file names without a case number.
    output = llm_chain.run({'text': current_case_summary})

    holding_file_name = f"{base_name}_holding.txt"
    holding_file_path = os.path.join(output_folder, holding_file_name)

    try:
        with open(holding_file_path, 'w', encoding='utf-8') as file:
            file.write(output)
        print(f"Summary successfully written to {holding_file_path}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def batch_process_holdings(batch_size, processed_files_csv):
    """Generate holdings for up to ``batch_size`` not-yet-processed files.

    Files are taken from the global ``input_folder`` in sorted order.

    Args:
        batch_size: Maximum number of files to process in this call.
        processed_files_csv: Path of the CSV (column ``file_name``) that records
            finished files; it is rewritten after every processed file so that
            an interrupted run can resume.
    """
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    processed_count = 0
    filenames = sorted(os.listdir(input_folder))

    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        try:
            last_processed_index = filenames.index(last_filename) + 1
        except ValueError:
            # The last recorded file is no longer in input_folder. Rescan from
            # the start; the membership check below still skips finished files.
            last_processed_index = 0

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename in processed_files_df['file_name'].values:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        generate_holding(filename)
        processed_count += 1

        # Persist progress immediately so a crash loses at most one file.
        new_row = {"file_name": filename}
        processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_files_df.to_csv(processed_files_csv, index=False)

In [None]:
# Input summaries and output locations for the hint-free holding run.
input_folder = 'ref_case_legalBertLarge_summaries_small'
output_folder = 'ref_case_legalBertLarge_sum_llama_holdings_without_hint_small'
os.makedirs(output_folder, exist_ok=True)
# Module-level placeholder; the batch function works on its own local variable
# of the same name.
processed_files_df = None
processed_files_csv = 'processed_files_legalBertLarge_sum_llama_for_holdings_without_hint_small.csv'

# One call per directory entry; each call handles at most one file, is timed,
# and is followed by GPU cache release and garbage collection.
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    batch_process_holdings(1,processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

# Bulk Holding Generation with Mistral
## Will use between 16 and 21 Gb of GPU RAM

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
# Make CUDA the default device for tensors created from here on.
torch.set_default_device('cuda')

In [None]:
# Load the tokenizer and model weights from a local directory. This rebinds the
# notebook-level `tokenizer` and `model` names used by the earlier sections.
model_directory = "./Mistral-7B-OpenOrca"
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# torch_dtype="auto" and device_map='auto' leave dtype and placement to the loader.
model = AutoModelForCausalLM.from_pretrained(model_directory,
                                             torch_dtype="auto",
                                             device_map='auto',
                                             )

In [None]:
# Text-generation pipeline around the Mistral model and tokenizer loaded above.
# Sampling is enabled (do_sample=True, top_k=30), so repeated runs can differ.
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens=1024,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                # Pad with the end-of-sequence token instead of the hard-coded
                # id 3200, which is an ordinary vocabulary entry.
                pad_token_id=tokenizer.eos_token_id
                )

In [None]:
# START: COPIED FROM <https://colab.research.google.com/drive/1uTJvyjhH-mvi1AmuwAOL2384X8TugGf0?usp=sharing>
# Example of the <|im_start|> / <|im_end|> chat layout, kept for reference.
# NOTE(review): `text` does not appear to be used in the visible part of the notebook.
text = """<|im_start|>system\n
You are MistralOrca, a large language model trained by Alignment Lab AI. Write out your reasoning step-by-step to be sure you get the right answers!\n
<|im_end|>\n
<|im_start|>user\n
what is the meaning of life?\n
<|im_end|>"""
# END: COPIED FROM <https://colab.research.google.com/drive/1uTJvyjhH-mvi1AmuwAOL2384X8TugGf0?usp=sharing>

# NOTE(review): `device` does not appear to be used in the visible part of the notebook.
device = 'cuda'

def generate_holding(filename, reference_df, instruction, system_prompt, max_length, hint_bool):
    """Generate a holding for one summary file and write it to ``output_folder``.

    The summary is read from the global ``input_folder``, shortened (whole
    words dropped from the end) until it fits in ``max_length - 500`` tokens,
    and passed to the global ``llm_chain``.

    Args:
        filename: Name of the summary file inside ``input_folder``.
        reference_df: DataFrame from which the hint is read (row
            ``case_number - 1``, column index 6) when ``hint_bool`` is true.
        instruction: Not used here; the prompt comes from the global ``llm_chain``.
        system_prompt: Not used here; the prompt comes from the global ``llm_chain``.
        max_length: Token limit; 500 tokens of it are held back from the summary.
        hint_bool: When true, the hint is looked up and passed to the chain as
            ``hint``; otherwise only ``text`` is passed.
    """
    file_path = os.path.join(input_folder, filename)
    summary = read_file(file_path)
    if summary is None:
        # read_file already reported the error; str(None) would otherwise send
        # the literal text "None" to the model.
        return
    current_case_summary = str(summary)
    # print("\ncurrent_case_summary: ", current_case_summary)

    def _token_count(text):
        """Number of tokens `tokenizer` produces for `text` (no special tokens)."""
        return tokenizer(text, return_tensors="pt", add_special_tokens=False).input_ids.size(1)

    # Check and truncate current_case_summary if needed. The token count is
    # recomputed after every cut; without that the loop could never end.
    token_budget = max_length - 500
    token_count = _token_count(current_case_summary)
    words = current_case_summary.split(' ')
    while token_count > token_budget:
        print("\n\n\nTruncation")

        # Truncate the text from the back: estimate how many words fit and
        # always drop at least one so the loop makes progress.
        keep = int(len(words) * token_budget / token_count)
        keep = max(0, min(keep, len(words) - 1))
        words = words[:keep]
        current_case_summary = " ".join(words)
        if not words:
            break
        token_count = _token_count(current_case_summary)

    base_name = os.path.splitext(os.path.basename(file_path))[0]

    input_dict = {'text': current_case_summary}
    if hint_bool:
        # Takes the second "_"-separated field of the file name as a 1-based
        # case number. NOTE(review): assumes names like <prefix>_<number>_... -- confirm.
        case_number = int(base_name.split("_")[1])
        input_dict['hint'] = reference_df.iloc[case_number - 1, 6]

    output = llm_chain.run(input_dict)
    print("output: ", output)

    holding_file_name = f"{base_name}_holding.txt"
    holding_file_path = os.path.join(output_folder, holding_file_name)

    try:
        with open(holding_file_path, 'w', encoding='utf-8') as file:
            file.write(output)
        print(f"Summary successfully written to {holding_file_path}")
    except IOError as e:
        print(f"Unable to write to file: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def batch_process_holdings(batch_size, processed_files_csv, reference_df, instruction, system_prompt, max_length, hint_bool=False):
    """Generate holdings for up to ``batch_size`` not-yet-processed files.

    Files are taken from the global ``input_folder`` in sorted order.

    Args:
        batch_size: Maximum number of files to process in this call.
        processed_files_csv: Path of the CSV (column ``file_name``) that records
            finished files; it is rewritten after every processed file so that
            an interrupted run can resume.
        reference_df, instruction, system_prompt, max_length, hint_bool:
            Forwarded unchanged to ``generate_holding``.
    """
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    processed_count = 0
    filenames = sorted(os.listdir(input_folder))

    # Start from where the last entry left off
    last_processed_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        try:
            last_processed_index = filenames.index(last_filename) + 1
        except ValueError:
            # The last recorded file is no longer in input_folder. Rescan from
            # the start; the membership check below still skips finished files.
            last_processed_index = 0

    # Process files starting from the last processed one
    for filename in filenames[last_processed_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if filename in processed_files_df['file_name'].values:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        generate_holding(filename,
                         reference_df = reference_df,
                         instruction = instruction,
                         system_prompt = system_prompt,
                         max_length = max_length,
                         hint_bool = hint_bool
                        )
        processed_count += 1

        # Persist progress immediately so a crash loses at most one file.
        new_row = {"file_name": filename}
        processed_files_df = pd.concat([processed_files_df, pd.DataFrame([new_row])], ignore_index=True)
        processed_files_df.to_csv(processed_files_csv, index=False)

In [None]:
case_summary = "French Lilly further testified that some time thereafter the policy written by Mrs. Foster was mailed to him, inasmuch as plaintiff also resided in Wyoming County, and that he thereupon mailed the policy to the plaintiff, not at that time being aware of the fact that the policy was written for the same man who had been refused insurance previously by the witness. As a result of the telephone conversation, the plaintiff called upon Mrs. Foster, who issued the insurance policy in question. Later, however, the home office learned of the false answer contained in the application; and on March 8, 1957, wrote a letter to the plaintiff detailing the facts in relation to the false answer contained in the application, and notifying the plaintiff that his insurance policy was rescinded. At the time the letter of December 20 was written, the home office did not know of the accident involving the cow, or of the occurrence which resulted in the destruction of the automobile, and did not know of the fact that the application made by plaintiff contained a false statement of facts. French Lilly of Oceana, in Wyoming County, an agent for the defendant company, testified that “in the neighborhood of July in 1956”, the plaintiff showed to the witness a letter disclosing that the plaintiff’s automobile insurance with another company was being cancelled and that thereupon the witness refused to write a policy of insurance on behalf of the defendant company covering plaintiff’s automobile. As a consequence thereof, the home office wrote a letter to the plaintiff, dated December 20, 1956, three days after the automobile was demolished, notifying the plaintiff that his insurance policy would be cancelled as of January 2, 1957, and advising him to obtain proper insurance in the meantime with another company. 
The defendant offered the letter of March 8 for introduction in evidence, the plaintiff objected, and the court refused to permit the introduction of such letter as a part of the evidence to be considered by the jury. The defendant offered the letter of March 8 for introduction in evidence, the plaintiff objected, and the court refused to permit the introduction of such letter as a part of the evidence to be considered by the jury. Thereupon, the court made the following statement to the jury: “Ladies and gentlemen of the jury, there are two questions to be decided in this case. After the completion of the testimony at the trial before a jury, the defendant made a motion for a directed verdict in its favor, which motion was overruled. In his written opinion, which was made a part of the record, the eminent trial judge stated: “I am of the opinion that the knowledge of the agent of the company at Oceana is imputed to its principal, the defendant company, and the defendant company having had knowledge of a prior cancellation through its agent and in its principal offices, had an election to either treat the policy as void or rescind it as of a future date. The company elected to rescind and prior to the date of rescinding, the loss occurred.” Prior to the time of the trial, the defendant filed its specification of defense in accordance with the provisions of Code, 56-4-21, a portion of which was as follows: “That the plaintiff fraudulently procured the said policy of insurance by the wilful, intentional, making of false and fraudulent answers and misrepresentations upon his application for said policy of insurance, with full knowledge at the time of making said false statements and answers that the defendant would not have issued said policy of insurance had the defendant known that the statements and answers as given were false. 
The trial court deducted this sum from $2,800.00, the value placed on the automobile by the jury’s verdict, and entered judgment for the plaintiff for the balance, amounting to $271.45. The defendant, however, in order to save the point, took testimony in relation to the letter outside the presence and hearing of the jury, and the letter was made a part of the record. 2 syl., 140 S. E. 61; Kincaid v. Equitable Life Assur. 45 C.J.S., Insurance, Section 605, page 438. SchWarzbach v. Ohio Valley Protective Union, 25 W. Va. 622, syl. “The insurer is not precluded from setting up the falsity of answers in the application where it appears that insured knew at the time they were being written or before signing the application that they were written falsely in order to defraud the company.” 45 C.J.S., Insurance, Section 732, page 741. It has been held that, under such a state of facts, “the policy is thereby forfeited.” Saltesz v. The Sovereign Camp of the Woodmen of the World, 110 W. Va. 513, syl., 159 S. E. 513. “Plaintiff cannot, at the trial or in the appellate court, rely on an estoppel not set forth in his reply.” Capehart v. Mutual Benefit Health and Accident Ass’n., 111 W. Va. 317, syl., 161 S. E. 609. Health & Accident v. Ratcliffe, 163 Va. 325, 175 S. E. 870, 874. It has been held that, under such circumstances, “the policy will ordinarily be forfeited.” Faulkiner v. Equitable Life Insurance Co., 144, W. Va. 193, syl., 107 S. E. 2d 360. It has been held that, under such a state of facts, “the policy is thereby forfeited.” Saltesz v. The Sovereign Camp of the Woodmen of the World, 110 W. Va. 513, syl., 159 S. E. 513. Although in some jurisdictions it is held that a contract of insurance procured by fraud is void, as a general rule, such a contract is voidable at the option of insurer on discovery of the fraud.” 45 C.J.S., Insurance, Section 473(2), page 152."

In [None]:
# {text} stays a literal placeholder here so that PromptTemplate can fill it later.
instruction = "Use the case document to extract the concise holding. {text}"
# instruction = instruction.format(text=case_summary)
system_prompt="You are MistralOrca, a legal expert who specializes in extracting accurate and concise holdings from case documents. Write out your answer short and succinct!"

# NOTE(review): `pipe` is created with do_sample=True; confirm whether the
# temperature given in model_kwargs actually reaches generation.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})


# System and user turns, rendered into one prompt string by the tokenizer's chat template.
chat = [
  {"role": "system", "content": system_prompt},
  {"role": "user", "content": instruction}
]

# tokenize=False yields the formatted prompt text rather than token ids.
input_text = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

prompt = PromptTemplate(template=input_text, input_variables=["text"])
print("prompt: ", prompt)
# The Mistral generate_holding() reads this module-level `llm_chain`.
llm_chain = LLMChain(prompt=prompt, llm=llm)
# Trial run on the sample `case_summary` string defined in the previous cell.
output = llm_chain.run(case_summary)
print("output: ", output)

In [None]:
# Input summaries, output locations and settings for the hint-free Mistral run.
input_folder = 'ref_case_legalBertLarge_summaries_small'
output_folder = 'ref_case_legalBertLarge_sum_Mistral_holdings_without_hint_small'
os.makedirs(output_folder, exist_ok=True)
# Module-level placeholder; the batch function works on its own local variable
# of the same name.
processed_files_df = None
reference_df = pd.read_csv('case_references.csv')
processed_files_csv = 'processed_files_legalBertLarge_sum_for_Mistral_holdings_without_hint_small.csv'
max_length = 4096
instruction = "Use the case document to extract the concise holding. {text}"
# No trailing comma after the string: a trailing comma would turn system_prompt
# into a one-element tuple instead of a str.
system_prompt = "You are MistralOrca, a legal expert who specializes in extracting accurate and concise holdings from case documents. Write out your answer short and succinct!"

# One call per directory entry; each call handles at most one file, is timed,
# and is followed by GPU cache release and garbage collection.
for i in range(len(os.listdir(input_folder))):
# for i in range(1):
    start = time.time()
    batch_process_holdings(1,processed_files_csv, reference_df, instruction, system_prompt, max_length, hint_bool=False)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

# Bulk Holding Generation with Mistral Using Hints

In [None]:
# Input summaries, output locations and settings for the Mistral run with hints.
input_folder = 'ref_case_legalBertLarge_summaries_small'
output_folder = 'ref_case_legalBertLarge_sum_Mistral_holdings_with_hint_small'
os.makedirs(output_folder, exist_ok=True)
# Module-level placeholder; the batch function works on its own local variable
# of the same name.
processed_files_df = None
reference_df = pd.read_csv('case_references.csv')
processed_files_csv = 'processed_files_legalBertLarge_sum_for_Mistral_holdings_with_hint_small.csv'
max_length = 4096
instruction = "Use the case document to extract the concise holding. {text} Here is a hint on the holding: {hint}"
# No trailing comma after the string: a trailing comma would turn system_prompt
# into a one-element tuple instead of a str.
system_prompt = "You are MistralOrca, a legal expert who specializes in extracting accurate and concise holdings from case documents. Write out your answer short and succinct!"

# generate_holding() runs the module-level `llm_chain`, so rebuild it with the
# hint-aware instruction; otherwise the chain from the hint-free section is
# reused and the {hint} value never reaches the prompt.
llm = HuggingFacePipeline(pipeline = pipe, model_kwargs = {'temperature':0})
chat = [
  {"role": "system", "content": system_prompt},
  {"role": "user", "content": instruction}
]
input_text = tokenizer.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)
prompt = PromptTemplate(template=input_text, input_variables=["text", "hint"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

# One call per directory entry; each call handles at most one file, is timed,
# and is followed by GPU cache release and garbage collection.
for i in range(len(os.listdir(input_folder))):
# for i in range(1):
    start = time.time()
    batch_process_holdings(1,processed_files_csv, reference_df, instruction, system_prompt, max_length, hint_bool=True)
    end = time.time()
    print("Time elapsed:", end - start)
    torch.cuda.empty_cache()
    gc.collect()

# Longformer Bulk Summarization
## currently does not chunk text larger than context window
## Will modify in the future if this proves to be a viable route

In [None]:
from transformers import LEDForConditionalGeneration, LEDTokenizer, pipeline, AutoModel

In [None]:
# Load the LED tokenizer and model by checkpoint identifier. This rebinds the
# notebook-wide `tokenizer` and `model` names that other sections also assign.
tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

In [None]:
# LED_model_folder = "./LED"
# os.makedirs(LED_model_folder, exist_ok=True)
# model.save_pretrained(LED_model_folder)
# tokenizer.save_pretrained(LED_model_folder)

In [None]:
# To load the model and tokenizer from a local folder instead, uncomment:
# LED_model_folder = "./LED"
# model = LEDForConditionalGeneration.from_pretrained(LED_model_folder)
# tokenizer = LEDTokenizer.from_pretrained(LED_model_folder)

# Summarization pipeline built from the current `model` / `tokenizer` globals.
LED_summarizer_pipeline = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=0,  # 'device=0' to use the GPU
)

In [None]:
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv):
    """Summarize up to ``batch_size`` not-yet-recorded ``.json`` files.

    Files are taken, in sorted order, from the module-level ``input_folder``
    and each one is handed to ``summarize_a_case_document``. Progress is kept
    in ``processed_files_csv`` (one ``file_name`` column), which is rewritten
    after every summarized file so a later call can resume.

    Args:
        batch_size: Maximum number of files to summarize in this call.
        processed_files_csv: Path of the progress CSV; it is created on the
            first write if it does not exist yet.
    """
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    filenames = sorted(os.listdir(input_folder))

    # Resume right after the most recently recorded file. When that file is
    # not in the folder any more (renamed, removed, or the CSV was written for
    # another folder) list.index() would raise ValueError, so scan from the
    # start instead; the membership check below still skips recorded files.
    start_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        if last_filename in filenames:
            start_index = filenames.index(last_filename) + 1

    already_processed = set(processed_files_df['file_name'])
    processed_count = 0

    for filename in filenames[start_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if not filename.endswith(".json"):
            continue
        if filename in already_processed:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        summarize_a_case_document(filename)
        processed_count += 1

        # Record the file immediately so an interruption loses no finished work.
        already_processed.add(filename)
        processed_files_df = pd.concat(
            [processed_files_df, pd.DataFrame([{"file_name": filename}])],
            ignore_index=True,
        )
        processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename):
    """Summarize one file from ``input_folder`` with the LED pipeline and save it.

    The value returned by ``load_and_extract_data`` is coerced to ``str`` and
    passed to ``LED_summarizer_pipeline`` in a single call; the resulting
    summary text is written via ``save_summary_to_text`` into ``output_folder``.

    Args:
        filename: Name of the file inside the module-level ``input_folder``.
    """
    source_path = os.path.join(input_folder, filename)
    case_text = str(load_and_extract_data(source_path))

    # No chunking is applied here: the whole text goes to the pipeline at once.
    # (A chunk_text_with_overlap(text, 1000, 200) step was sketched but is not
    # active.)
    generated = LED_summarizer_pipeline(case_text, min_length=30, max_length=1000)

    # Save the summary to a text file
    save_summary_to_text(generated[0]['summary_text'], output_folder, source_path, condensed=False)

In [None]:
# Folders and progress file for the LED summarization run.
input_folder = 'ref_case_small'
output_folder = 'ref_case_LED_summaries_small'
processed_files_csv = 'processed_files_for_LED_summarizing_small.csv'
# Reset the notebook-level name; the batch function in this section loads its
# own progress frame from the CSV.
processed_files_df = None
os.makedirs(output_folder, exist_ok=True)

In [None]:
# One single-file batch per directory entry, so elapsed time is printed and
# memory is released after every document.
# NOTE: this call needs the two-argument summarize_a_batch_of_case_documents
# from this section; the Long T5 section redefines it with an extra argument.
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    summarize_a_batch_of_case_documents(1, processed_files_csv)
    end = time.time()
    print("Time elapsed:", end - start)
    # Collect unreachable Python objects first so that memory held by any
    # tensors among them is unoccupied by the time the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()

# Long T5 Bulk Summarization
## Currently does not chunk text larger than the context window
## Will modify in the future if this proves to be a viable route

In [None]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration, pipeline
# Load the LongT5 tokenizer and model by checkpoint identifier. This rebinds
# the notebook-wide `tokenizer` and `model` names that other sections assign.
tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
model = LongT5ForConditionalGeneration.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
# Move the model to the GPU, then convert it to half precision.
model = model.to("cuda").half()

In [None]:
# Summarization pipeline built from the current `model` / `tokenizer` globals.
Long_T5_summarizer_pipeline = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=0,  # 'device=0' to use the GPU
)

In [None]:
def summarize_a_batch_of_case_documents(batch_size, processed_files_csv, pipeline):
    """Summarize up to ``batch_size`` not-yet-recorded ``.json`` files.

    Files are taken, in sorted order, from the module-level ``input_folder``
    and each one is handed to ``summarize_a_case_document`` together with
    ``pipeline``. Progress is kept in ``processed_files_csv`` (one
    ``file_name`` column), which is rewritten after every summarized file so a
    later call can resume.

    NOTE: this definition replaces the earlier two-argument function of the
    same name in this notebook once the cell has been run.

    Args:
        batch_size: Maximum number of files to summarize in this call.
        processed_files_csv: Path of the progress CSV; it is created on the
            first write if it does not exist yet.
        pipeline: Callable forwarded unchanged to ``summarize_a_case_document``.
            Inside this function the name shadows the imported
            ``transformers.pipeline`` factory.
    """
    if os.path.exists(processed_files_csv):
        processed_files_df = pd.read_csv(processed_files_csv)
    else:
        processed_files_df = pd.DataFrame(columns=["file_name"])

    filenames = sorted(os.listdir(input_folder))

    # Resume right after the most recently recorded file. When that file is
    # not in the folder any more (renamed, removed, or the CSV was written for
    # another folder) list.index() would raise ValueError, so scan from the
    # start instead; the membership check below still skips recorded files.
    start_index = 0
    if not processed_files_df.empty:
        last_filename = processed_files_df['file_name'].iloc[-1]
        if last_filename in filenames:
            start_index = filenames.index(last_filename) + 1

    already_processed = set(processed_files_df['file_name'])
    processed_count = 0

    for filename in filenames[start_index:]:
        if processed_count >= batch_size:
            break
        print("Processing: ", filename)
        if not filename.endswith(".json"):
            continue
        if filename in already_processed:
            print(f"File {filename} has already been processed. Skipping.")
            continue

        summarize_a_case_document(filename, pipeline)
        processed_count += 1

        # Record the file immediately so an interruption loses no finished work.
        already_processed.add(filename)
        processed_files_df = pd.concat(
            [processed_files_df, pd.DataFrame([{"file_name": filename}])],
            ignore_index=True,
        )
        processed_files_df.to_csv(processed_files_csv, index=False)
                
def summarize_a_case_document(filename, pipeline):
    """Summarize one file from ``input_folder`` with ``pipeline`` and save it.

    The value returned by ``load_and_extract_data`` is coerced to ``str`` and
    passed to ``pipeline`` in a single call; the resulting summary text is
    written via ``save_summary_to_text`` into ``output_folder``.

    Args:
        filename: Name of the file inside the module-level ``input_folder``.
        pipeline: Callable invoked as ``pipeline(text, min_length=30,
            max_length=1000)``; element 0 of its result must have a
            ``'summary_text'`` entry.
    """
    source_path = os.path.join(input_folder, filename)
    case_text = str(load_and_extract_data(source_path))

    # No chunking is applied here: the whole text goes to the pipeline at once.
    # (A chunk_text_with_overlap(text, 1000, 200) step was sketched but is not
    # active.)
    generated = pipeline(case_text, min_length=30, max_length=1000)

    save_summary_to_text(generated[0]['summary_text'], output_folder, source_path, condensed=False)

In [None]:
# Folders and progress file for the Long T5 summarization run.
input_folder = 'ref_case_small'
output_folder = 'ref_case_Long_T5_summaries_small'
processed_files_csv = 'processed_files_for_Long_T5_summarizing_small.csv'
# Reset the notebook-level name; the batch function in this section loads its
# own progress frame from the CSV.
processed_files_df = None
os.makedirs(output_folder, exist_ok=True)

In [None]:
# One single-file batch per directory entry, so elapsed time is printed and
# memory is released after every document.
for i in range(len(os.listdir(input_folder))):
    start = time.time()
    summarize_a_batch_of_case_documents(1, processed_files_csv, Long_T5_summarizer_pipeline)
    end = time.time()
    print("Time elapsed:", end - start)
    # Collect unreachable Python objects first so that memory held by any
    # tensors among them is unoccupied by the time the CUDA cache is emptied.
    gc.collect()
    torch.cuda.empty_cache()