In [None]:
import os
from dotenv import load_dotenv
from huggingface_hub import login
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
import json
import textwrap
from langchain import HuggingFacePipeline
from langchain import PromptTemplate,  LLMChain
from langchain.memory import ConversationBufferMemory
import pandas as pd
import time
import gc
# Create new tensors on the GPU by default when one is available. Hard-coding
# 'cuda' makes the first tensor allocation fail on a machine without CUDA, so
# fall back to the CPU (and say so, because inference there is far slower).
default_device = 'cuda' if torch.cuda.is_available() else 'cpu'
if default_device == 'cpu':
    print("CUDA is not available; using the CPU as the default torch device")
torch.set_default_device(default_device)

In [None]:
# Directory that the tokenizer and the model weights are both loaded from.
model_directory = "./Mistral-7B-OpenOrca"

tokenizer = AutoTokenizer.from_pretrained(model_directory)

# dtype selection and device placement are both left to the library ("auto").
model = AutoModelForCausalLM.from_pretrained(
    model_directory,
    device_map='auto',
    torch_dtype="auto",
)

In [None]:
# Text-generation pipeline around the already-loaded model and tokenizer.
#
# The pad token id is taken from the tokenizer (falling back to its EOS id when
# no pad token is defined) instead of a hard-coded integer, so padding can
# never be filled with an unrelated vocabulary token.
generation_pad_token_id = tokenizer.pad_token_id
if generation_pad_token_id is None:
    generation_pad_token_id = tokenizer.eos_token_id

# Sampling is enabled (do_sample=True, top_k=30), so predictions differ from
# run to run unless a seed is set beforehand.
# NOTE(review): torch_dtype/device_map are also passed when the model is loaded
# with from_pretrained; presumably they are redundant here -- confirm.
pipe = pipeline("text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch.bfloat16,
                device_map="auto",
                max_new_tokens=1024,
                do_sample=True,
                top_k=30,
                num_return_sequences=1,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=generation_pad_token_id,
                )

In [None]:
# Evaluation set in JSON Lines format (read with lines=True).
test_set_path = 'cleaned_test_qlora.jsonl'
test_df = pd.read_json(test_set_path, lines=True)

In [None]:
# User-turn template. "{text}" is the placeholder later filled with the case
# document; the sentence before it shows the expected parenthetical phrasing.
instruction = (
    "Use the case document to extract the concise holding and phrase it as a parenthetical, "
    "which should look something like this: holding that the balance between costs and benefits "
    "comes out against applying the exclusionary rule in civil deportation hearings. {text}"
)

# System-turn text: restricts the answer to the holding alone.
system_prompt = (
    "You are a legal expert who specializes in extracting accurate and concise parenthetical "
    "holdings from case documents. Give only the holdings, no other breakdowns or extra text."
)

In [None]:
# System + user turns that make up the prompt.
chat = [
    dict(role="system", content=system_prompt),
    dict(role="user", content=instruction),
]

# Render the conversation to a plain string (tokenize=False) that ends with the
# generation prompt. The "{text}" placeholder coming from `instruction` is the
# single variable the PromptTemplate below fills in.
input_text = tokenizer.apply_chat_template(
    chat,
    tokenize=False,
    add_generation_prompt=True,
)
prompt = PromptTemplate(input_variables=["text"], template=input_text)

# NOTE(review): temperature 0 is requested here while the pipeline was created
# with do_sample=True -- confirm which is intended and whether model_kwargs
# actually reaches generation.
llm = HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature': 0})

llm_chain = LLMChain(llm=llm, prompt=prompt)

In [None]:
# Smoke test: run the chain on the first test example only.
first_case_text = test_df.iloc[0]["input"]
mistral_test_output = llm_chain.run(first_case_text)

In [None]:
# Show the prediction for the first test example as this cell's output.
mistral_test_output

In [None]:
# Run the chain over every test example and collect one
# (Input, Prediction, Reference) row per example.
#
# Rows are accumulated in a plain list and turned into a DataFrame once;
# growing a DataFrame with pd.concat on every iteration copies all previous
# rows each time (quadratic overall). The DataFrame is built in `finally` so
# the rows gathered so far are still available in `results_df_mistral` if the
# loop is interrupted or hits an unexpected error.
result_columns = ["Input", "Prediction", "Reference"]
prediction_rows = []
num_nulls = 0  # number of examples whose generation raised RuntimeError

try:
    for i in range(len(test_df)):
        print(f"Predicting on input number: {i}")
        example = test_df.iloc[i]
        input_txt = example["input"]

        try:
            output_txt = llm_chain.run(input_txt)
        except RuntimeError:
            # Best effort: keep going and record a sentinel so the row count
            # still matches the test set.
            print("Generation failed, inserting NULL value")
            output_txt = "NULL"
            num_nulls += 1

        reference_txt = example["output"]
        prediction_rows.append(
            {"Input": input_txt, "Prediction": output_txt, "Reference": reference_txt}
        )

        # Collect unreachable Python objects first so the CUDA blocks they were
        # holding can be handed back by empty_cache() in the same iteration.
        gc.collect()
        torch.cuda.empty_cache()
finally:
    results_df_mistral = pd.DataFrame(prediction_rows, columns=result_columns)
print("Inference has finished")

In [None]:
# Report how many predictions were replaced by the "NULL" sentinel.
null_report = f"The number of nulls inserted is {num_nulls}"
print(null_report)

In [None]:
# Persist the predictions without the DataFrame index column.
predictions_csv_path = 'mistral_parenthetical_predictions.csv'
results_df_mistral.to_csv(predictions_csv_path, index=False)