In [1]:
import datasets
from bs4 import BeautifulSoup
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
import re
from evaluation import evaluate

  from .autonotebook import tqdm as notebook_tqdm
2024-04-17 07:05:47.396366: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Location of the OWL ontology file that is opened and parsed further down.
PATH_TO_OWL = './LMSS.owl'
# Model directory handed to both from_pretrained calls (model and tokenizer).
LLM_PATH = '../Llama-2-7b-chat-hf'

In [3]:
# Load the causal-LM weights directly in half precision.
# NOTE(review): the pipeline below is given torch_dtype=torch.float16, but as far as
# I can tell that argument is not applied to a model object that has already been
# instantiated -- requesting fp16 here is what actually halves the weight footprint.
# Verify against the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(LLM_PATH, torch_dtype=torch.float16)
# Tokenizer that matches the checkpoint stored at LLM_PATH.
tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

Loading checkpoint shards: 100%|██████████| 6/6 [02:40<00:00, 26.80s/it]


In [5]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline on CUDA device 1.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError because
# GPU 1 was almost fully occupied by other processes -- confirm the device index and
# free memory before re-running.
pipe = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device=1,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 1 has a total capacity of 39.43 GiB of which 22.31 MiB is free. Process 502330 has 2.38 GiB memory in use. Process 673277 has 34.25 GiB memory in use. Including non-PyTorch memory, this process has 2.79 GiB memory in use. Of the allocated memory 1.83 GiB is allocated by PyTorch, and 1.97 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
def get_llama_response(prompt):
    """
    Generate a single sampled completion for ``prompt`` with the notebook-level ``pipe``.

    Args:
        prompt (str): Text handed to the text-generation pipeline.

    Returns:
        The ``'generated_text'`` entry of the first returned sequence.
        NOTE(review): presumably this text still contains the prompt itself, since
        no option to strip it is passed -- confirm before parsing the result.
    """
    # Sampling configuration: top-k sampling over 10 candidates, one sequence,
    # stopping on the tokenizer's end-of-sequence token.
    generation_options = {
        'do_sample': True,
        'top_k': 10,
        'num_return_sequences': 1,
        'eos_token_id': tokenizer.eos_token_id,
    }
    outputs = pipe(prompt, **generation_options)
    first_output = outputs[0]
    return first_output['generated_text']

Reading the OWL ontology

In [None]:
# Read the contents of the .owl file.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale. NOTE(review): assumes the ontology file is UTF-8 -- confirm
# against its XML declaration.
with open(PATH_TO_OWL, "r", encoding="utf-8") as owl_file:
    owl_data = owl_file.read()

# Parse the OWL data using BeautifulSoup's XML parser
soup = BeautifulSoup(owl_data, 'xml')

In [None]:
# Parallel lists: labels[i] and definitions[i] describe the same ontology class
labels = []
definitions = []

# Walk every <owl:Class> element and keep those carrying both a label and a definition
for class_node in soup.find_all('owl:Class'):
    label_node = class_node.find('rdfs:label')
    definition_node = class_node.find('skos:definition')

    # Skip classes that lack either piece of information
    if not (label_node and definition_node):
        continue

    # Store the whitespace-trimmed text of each element
    labels.append(label_node.text.strip())
    definitions.append(definition_node.text.strip())

# Two-column frame: one row per ontology class
data = {'Label': labels, 'Definition': definitions}
owl_df = pd.DataFrame(data)

owl_df

Function to look up ontology classes by label substring

In [None]:
def filter_label_by_substring(df, substring):
    """
    Filter DataFrame rows containing the specified substring in the 'Label' column
    and return a list of strings in the format "{Label} : {Definition}".

    The match is case-insensitive and literal: characters such as '.', '(' or '*'
    in ``substring`` are matched as themselves rather than interpreted as a
    regular expression. Rows whose label is missing never match.

    Args:
        df (pandas.DataFrame): Input DataFrame with 'Label' and 'Definition' columns.
        substring (str): Substring to search for.

    Returns:
        list: List of strings in the format "{Label} : {Definition}" for matching
        rows, in the DataFrame's row order (empty if nothing matches).
    """
    # regex=False keeps the promise of a plain substring search; na=False turns
    # missing labels into "no match" so the result is a clean boolean mask.
    mask = df['Label'].str.contains(substring, case=False, regex=False, na=False)
    matches = df.loc[mask]
    return [
        f"{label} : {definition}"
        for label, definition in zip(matches['Label'], matches['Definition'])
    ]


In [None]:
# Example usage: list every ontology class whose label mentions "hearsay"
search_substring = 'hearsay'
result = filter_label_by_substring(df=owl_df, substring=search_substring)
print(result)

In [None]:
# Load the "hearsay" configuration of the nguha/legalbench dataset
dataset_hearsay = datasets.load_dataset(path="nguha/legalbench", name="hearsay")

In [None]:
# Convert the 'test' split of the hearsay dataset to a pandas DataFrame and display it
test_df = dataset_hearsay['test'].to_pandas()
test_df

In [None]:
# Pull the "text" column of the test split out as a plain Python list and display it
prompts = test_df["text"].tolist()
prompts

In [None]:
def read_tsv(file_path):
    """
    Read (text, answer) example pairs from a tab-separated file.

    The first line is treated as a header and skipped. Every following line is
    expected to hold the columns ``index``, ``answer``, ``text`` in that order;
    any further columns are ignored. Blank lines are skipped.

    Args:
        file_path (str): Path to the TSV file.

    Returns:
        list: ``(text, answer)`` tuples in file order (empty for an empty or
        header-only file).

    Raises:
        ValueError: If a non-blank data line has fewer than three columns.
    """
    examples = []
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(file, None)
        for line_number, line in enumerate(file, start=2):
            # Only drop the line terminator: a full strip() would also eat trailing
            # tabs and so lose an empty last column.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) < 3:
                raise ValueError(
                    f"{file_path}, line {line_number}: expected at least 3 "
                    f"tab-separated columns, got {len(fields)}"
                )
            answer, text = fields[1], fields[2]
            examples.append((text, answer))
    return examples

# Read the few-shot examples from the hearsay train.tsv file
examples = read_tsv('tasks/hearsay/train.tsv')

print(examples)

In [None]:
def get_reason(question, answer):
    """
    Ask the LLM for a short justification of an example's hearsay label.

    Args:
        question (str): The example statement.
        answer (str): The example's label. Exactly 'Yes' selects the
            "why is this hearsay" prompt; any other value selects the
            "why is this not hearsay" prompt.

    Returns:
        Whatever ``get_llama_response`` returns for the assembled prompt.
    """
    if answer == 'Yes':
        # The key term was previously misspelled ("hearsat") in the text sent to the model.
        follow_up = " Why is this statement hearsay, Answer logically in short?"
    else:
        follow_up = " Why this statement is not hearsay, Answer logically in short"
    return get_llama_response(question + follow_up)

# Attach an LLM-generated justification to every (question, answer) example
examples_with_reason = [
    (example_text, gold_answer, get_reason(example_text, gold_answer))
    for example_text, gold_answer in examples
]

print(examples_with_reason)

In [None]:
def add_labels_and_definitions_to_prompt(prompt_text, filtered_labels, examples_with_reason):
    """
    Assemble the full hearsay-classification prompt for one statement.

    The result is, in order: the statement with a lead-in to the ontology
    classes, one line per entry of ``filtered_labels``, a block of worked
    examples, and the closing instructions ending in an "Answer:" cue.

    Args:
        prompt_text (str): The statement to classify.
        filtered_labels (list): Strings describing ontology classes, each placed
            on its own line.
        examples_with_reason (list): ``(example, answer, reason)`` tuples rendered
            as worked examples.

    Returns:
        str: The concatenated prompt text.
    """
    # Opening section: the statement plus the lead-in to the ontology classes
    header = f"""
    Statement : {prompt_text}
    Question: Consider utilizing the following legal ontology classes to frame your argument:
    """

    # Each ontology entry goes on its own line
    ontology_section = "".join(f"\n{entry}" for entry in filtered_labels)

    # Worked examples, each rendered as Example / Answer / Reason
    examples_section = "\n\nExamples to build a basic foundation of the task:" + "".join(
        f"\n\nExample: {sample}\nAnswer: {verdict}\nReason: {rationale}"
        for sample, verdict, rationale in examples_with_reason
    )

    # Closing instructions and the answer cue
    footer = """
    Use these ontology classes to structure your argument and analyze whether the information provided falls under the category of hearsay.

    Output Format: Classify it as hearsay or not hearsay

    Answer: 
    """

    return header + ontology_section + examples_section + footer
