In [1]:
#  Import necessary libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM,BitsAndBytesConfig,pipeline
import os
import pandas as pd
import jsonlines
import itertools
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainingArguments,Trainer
from peft import LoraModel, LoraConfig
from pprint import pprint

import datasets
from datasets import load_dataset
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain_voyageai import VoyageAIEmbeddings

# from prompt_template import template

### Loading Documents

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import MathpixPDFLoader
from langchain_community.document_loaders import UnstructuredPDFLoader

In [5]:
# PDF to ingest; point this at another file (e.g. "sample_docs.pdf") to
# experiment with a different document.
pdf_path = "60.pdf"
loader = PyMuPDFLoader(pdf_path)
loader

<langchain_community.document_loaders.pdf.PyMuPDFLoader at 0x7fd3ddb85300>

In [6]:
# Read the PDF through the loader. Each returned item exposes `.page_content`
# (presumably one Document per PDF page -- confirm against the loader docs).
data = loader.load()

In [8]:
# Character count of every loaded document, followed by summary statistics.
# The frame is built in a single constructor call instead of mutating an empty
# DataFrame; `pd` comes from the notebook's import cell. The variable keeps its
# original name (`empty_df`) so that any cell referring to it keeps working.
empty_df = pd.DataFrame(
    {"content_length": [len(page.page_content) for page in data]}
)
empty_df.describe()

### Splitting

In [11]:
# Import the splitter class that is actually instantiated below, from the
# dedicated text-splitters package.
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter producing chunks of at most 250 characters (length is
# measured with `len`) with a 20-character overlap between neighbouring chunks.
# Separators are treated as plain strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=250,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)


In [12]:
# Break the loaded documents into chunks with the splitter configured above.
splits = text_splitter.split_documents(data)
# Display only a short preview: echoing the full list floods the cell output.
splits[:2]

[Document(page_content='ACT CIVIL & ADMINISTRATIVE TRIBUNAL \n \n \n \n \nLEE v GUO (Residential Tenancies) [2017] ACAT 60 \n \nRT 166/2017 \n \nCatchwords: \nRESIDENTIAL TENANCIES – compensation – whether there \nwas a breach of peace, comfort and privacy – whether the tenant', metadata={'source': '60.pdf', 'file_path': '60.pdf', 'page': 0, 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Microsoft® Word for Microsoft 365', 'producer': 'Microsoft® Word for Microsoft 365', 'creationDate': "D:20240329204618+05'30'", 'modDate': "D:20240329204618+05'30'", 'trapped': ''}),
 Document(page_content='needs to establish a ‘significant’ interference with the use of the \npremises – what ‘significant’ means in the context of section 71 \nof the RT Act – when considering whether there has been a', metadata={'source': '60.pdf', 'file_path': '60.pdf', 'page': 0, 'total_pages': 29, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': 

### Embedding and Vector Store

In [13]:
from langchain.vectorstores import Chroma

In [14]:
# Directory handed to Chroma as the on-disk location of the index.
persist_directory = "./chroma"

In [15]:
# Voyage AI embedding client used to vectorise the document chunks.
# The API key is read from the VOYAGE_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError
# here rather than failing later on the first embedding request.
embeddings = VoyageAIEmbeddings(
    voyage_api_key=os.environ["VOYAGE_API_KEY"],
    model="voyage-large-2",
)

batch size None


In [16]:
# Embed every chunk and index the vectors in a Chroma store located at
# `persist_directory`.
vectordb = Chroma.from_documents(
    persist_directory=persist_directory,
    embedding=embeddings,
    documents=splits,
)

In [20]:
# Explicitly write the vector store to disk.
# NOTE(review): newer Chroma releases reportedly persist automatically and
# deprecate this call -- confirm against the installed version.
vectordb.persist()

### Loading Model

In [77]:
from langchain_community.llms import HuggingFaceEndpoint

# Hosted model that answers the questions.
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

# The Hugging Face token is read from the environment so that no credential is
# stored in the notebook; a missing variable raises KeyError immediately.
API_KEY = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# Conservative sampling settings (top_k=1, low temperature) to keep the
# answers as repeatable as the endpoint allows.
llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    top_k=1,
    top_p=0.9,
    temperature=0.1,
    huggingfacehub_api_token=API_KEY,
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/user/.cache/huggingface/token
Login successful


#### Retrieval

In [79]:
# RetrievalQA is used by the chain-building cell further down.
# No chain is constructed here: a default-prompt chain built at this point
# would be overwritten by the prompt-aware `qa_chain` before any question is
# asked, so building it is wasted work and obscures which chain is queried.
from langchain.chains import RetrievalQA

In [80]:
from langchain.prompts import PromptTemplate

# Prompt used by the QA chain. `{context}` and `{question}` are the two
# placeholders of the template; the instruction text is wrapped in
# [INST] ... [/INST] tags (presumably the chat format expected by the
# Mistral-Instruct model configured for `llm` -- confirm).
# The string is sent to the model verbatim, so its whitespace is significant.
template = """
[INST] You are chatbot who is specialized in Legal field. 
Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. 
{context}
Question: {question}
Helpful Answer: [/INST] """
# Wrap the raw string in a PromptTemplate for use as the chain's prompt.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [81]:
# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [93]:
# The three questions put to the QA chain, bound in a single statement.
question1, question2, question3 = (
    "What is the main theme discussed in this text?",
    "What precedents are referred to in the text for further understanding of the theme?",
    " Highlight the statutory aspects laid down in the texts.",
)

In [98]:
# Ask the first question. `invoke` is the supported entry point; calling the
# chain object directly is deprecated in current LangChain releases.
result = qa_chain.invoke({"query": question1})
print(result["result"])

The main theme discussed in this text appears to be the interpretation and application of different provisions of the law, specifically sections 83(d) and 71, in relation to a landlord-tenant dispute. The text discusses the importance of considering the impact of a landlord's actions on a tenant's ability to use the premises as a home and lead a normal life. The landlord's actions, while perhaps not significant on their own, are described as having an "active adverse effect" on the tenant. The text also touches upon the difference in language used in the relevant sections of the law and the potential reasons for that difference.


In [95]:
# Ask the second question. `invoke` is the supported entry point; calling the
# chain object directly is deprecated in current LangChain releases.
result = qa_chain.invoke({"query": question2})
print(result["result"])

The text refers to the principles established in the cases of "importance"12 and "something more than ordinary"13, which suggest that a loss must have ordinary significance before it can be considered the probable result of a breach of contract. The tenant agrees that the individual incidents are of minor nature but submits that they should be considered collectively. The difference in language used in clause 52/section 83(d) and section 71 is not immediately evident without additional context.

As for precedents, the text mentions the cases of "importance"12 and "something more than ordinary"13, but it does not explicitly state which cases these are. To fully understand the theme, it would be helpful to know the specific precedents being referred to and how they have shaped the legal principles being discussed.


In [96]:
# Ask the third question. `invoke` is the supported entry point; calling the
# chain object directly is deprecated in current LangChain releases.
result = qa_chain.invoke({"query": question3})
print(result["result"])

1. Clause 52 and section 83(d) refer to a test for determining damages in contract law. The exact nature of the test is not clear without additional context.
2. Section 71 refers to a provision regarding damages in tort law. The test for determining damages in this section is also not explicitly stated but is described as being related to the "importance," "more than ordinary," or having an "active adverse effect on the ability of the claimant to lead the sort of life the claimant normally led."
3. The difference in language and tests between clause 52/section 83(d) and section 71 is not immediately clear and may be a quirk of the drafting.
4. The intentional difference between the tests, if any, is not explicitly stated in the provided context.
5. The determination of the tests' specifics and whether there is a difference requires a factual analysis.
