In [1]:
!pip install transformers einops accelerate langchain bitsandbytes sentence_transformers pypdf python-dotenv



In [2]:
!pip install typing-extensions==4.5.0 llama-index

Collecting typing-extensions==4.5.0
  Using cached typing_extensions-4.5.0-py3-none-any.whl (27 kB)
INFO: pip is looking at multiple versions of pydantic to determine which version is compatible with other requirements. This could take a while.
Collecting pydantic<3,>=1 (from langchain>=0.0.262->llama-index)
  Downloading pydantic-2.5.1-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.1/64.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting pydantic-core==2.14.3 (from pydantic<3,>=1->langchain>=0.0.262->llama-index)
  Downloading pydantic_core-2.14.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.5 kB)
Collecting pydantic<3,>=1 (from langchain>=0.0.262->llama-index)
  Downloading pydantic-2.5.0-py3-none-any.whl.metadata (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.6/174.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting pydantic

In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [1]:
import logging
import sys
import torch
from pprint import pprint
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms import HuggingFaceLLM
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# from llama_index import LangchainEmbedding
from llama_index.prompts.prompts import SimpleInputPrompt

In [3]:
# !git config --global credential.helper store
# !huggingface-cli login

In [2]:
from pathlib import Path
from llama_index import download_loader

PDFReader = download_loader("PDFReader")


In [3]:
#question creation prompts
q_system_prompt = "You are an expert user extracting questions from a text. You will be passed a piece of text, write a list of questions that can be answered based *solely* on the given text."
query_wrapper_prompt = "<|USER|>{query_str}<|ASSISTANT|>"

#answer prompt
a_system_prompt = "You are a data extractor. Extract the exact data from given document. If no information found please reply 'no information available'"

In [4]:
# hyperparameters
context_window = 4096 #@param
temperature = 0.0 #@param
model_name = 'meta-llama/Llama-2-7b-chat-hf' #@param
my_model_name = 'litelo/llama-2-case-whisper'

In [5]:
from transformers import LlamaTokenizer, LlamaForCausalLM
tokenizer = LlamaTokenizer.from_pretrained(model_name)
model_llm = LlamaForCausalLM.from_pretrained(
        my_model_name,
        device_map='auto'
    )

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/root/oneapi-devsummit-sea-2023/itex_xpu/lib/python3.9/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


2023-12-15 18:59:21.575731: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:

def getLLMforPrompts(system_prompt, query_wrapper_prompt) :
    return HuggingFaceLLM(
        context_window=context_window,
        max_new_tokens=256,
        generate_kwargs={"temperature":temperature,"top_p":0.5, "do_sample": False},
        system_prompt= system_prompt,
        query_wrapper_prompt = query_wrapper_prompt,
        tokenizer=tokenizer,
        model=model_llm,
        device_map='auto',
        model_kwargs={"temperature":0.1, "top_p":0.5,"use_auth_token": True}
    )

In [7]:
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


2023-12-15 18:59:43,886 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2023-12-15 18:59:44,690 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu


In [8]:

def getAllQuestions(response_text):
    j=0
    lines = response_text.split("\n")
    list_questions = []
    for i in range(0, len(lines)):
        if "?" in lines[i] and not lines[i][0].isalpha():
            for j in range(0, len(lines[i])):
                if lines[i][j].isalpha():
                    break
            list_questions.append(lines[i][j:].replace("  "," ").strip())
    if len(list_questions) == 0:
        for i in range(0, len(lines)):
            if "?" in lines[i]:
                list_questions.append(lines[i][j:].replace("  "," ").strip())
    list_questions = list(set(list_questions))
    return list_questions


In [9]:
def getQueryEngine(a_llm, docs):
    service_context = ServiceContext.from_defaults(
        chunk_size=1024,
        llm=a_llm,
        embed_model=embed_model
    )
    index = VectorStoreIndex.from_documents(docs, service_context=service_context)
    return index.as_query_engine()

In [10]:
def create_qa_set(text, docs):
    # print('performing qa for: '+text)
    # print('docs text: '+str(docs))
    qa_pairs = []
    q_llm = getLLMforPrompts(q_system_prompt, query_wrapper_prompt)
    result_llm = q_llm.complete(text).text
    # print(result_llm)
    list_questions = getAllQuestions(result_llm)
    print(" Questions generated for this part - "+str(len(list_questions)))
    # print("questions: "+str(list_questions))
    q_engine = getQueryEngine(getLLMforPrompts(a_system_prompt, query_wrapper_prompt), docs)
    j=1
    for ques in list_questions :
        ans = q_engine.query(ques)
        qa_pairs.append([ques, ans.response.strip()])
        print("  Done with question no. - "+str(j))
        j=j+1
    print(" Q and A pairs for this part - "+str(len(qa_pairs)))
    return qa_pairs

In [11]:
import modin.pandas as pd
#this method takes two pages at a time in a sliding window fashion and generates questions and answers for them
def create_qa_set_for_documents(documents_list):
    all_pairs = []
    for i in range(0, len(documents_list)-1):
        print("Document pair no. - "+str(i))
        text = documents_list[i].text
        if i < len(documents_list)-1 :
            text = text + documents_list[i+1].text
        documents_pair = [documents_list[i].copy()]
        documents_pair[0].text = text
        all_pairs.extend(create_qa_set(text, documents_pair))
        print("Total Pairs so far: "+str(len(all_pairs)))
    return all_pairs

In [12]:
import os
from PyPDF2 import PdfMerger
x = [a for a in os.listdir('../supreme-court-data/data/data/') if a.endswith(".pdf")]
x

['83561-1992___jonew__judis__19509.pdf',
 '8352-2016___jonew__judis__43552.pdf',
 '8352-2008___jonew__judis__44860.pdf',
 '8353-1998___jonew__judis__26113.pdf',
 '83593-1992___jonew__judis__11145.pdf',
 '83558-1992___jonew__judis__11341.pdf',
 '83570-1992___jonew__judis__11231.pdf',
 '8353-1997___jonew__judis__19100.pdf',
 '8354-2006___jonew__judis__31471.pdf',
 '9985-2001___jonew__judis__18016.pdf',
 '8358-1997___jonew__judis__20264.pdf',
 '9999-2017___supremecourt__2017__9999__9999_2017_31_1504_27750_Judgement_20-Apr-2021.pdf',
 '8356-2015___supremecourt__2015__8356__8356_2015_Judgement_12-Oct-2017.pdf',
 '8359-2004___jonew__judis__26717.pdf',
 '8352-2000___jonew__judis__33988.pdf',
 '83548-1992___jonew__judis__11391.pdf',
 '9923-2017___supremecourt__2017__9923__9923_2017_Judgement_23-Feb-2018.pdf',
 '83550-1992___jonew__judis__11378.pdf',
 '83552-1992___jonew__judis__11366.pdf',
 '9985-2001___jonew__judis__18029.pdf',
 '8355-2005___jonew__judis__34093.pdf']

In [13]:
# import numpy as np
# x = np.random.choice(x, size=50, replace=False)

In [None]:
loader = PDFReader()
data = []
for pdf in x:
    try:
        documents = loader.load_data(file=Path('../supreme-court-data/data/data/'+pdf))
    except:
        print("error")
    data.extend(create_qa_set_for_documents(documents))



Document pair no. - 0
 Questions generated for this part - 0


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 Q and A pairs for this part - 0
Total Pairs so far: 0
Document pair no. - 1
 Questions generated for this part - 3


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Done with question no. - 1


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Done with question no. - 2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Done with question no. - 3
 Q and A pairs for this part - 3
Total Pairs so far: 3
Document pair no. - 2


In [None]:
# print(data)
len(data)

In [None]:
df = pd.DataFrame(data, columns = ['Question', 'Answer'])

In [None]:
# df = pd.concat([df, df2], ignore_index=True)
df

In [None]:
df.to_csv('final_data.csv', index=False, encoding='utf-8')

In [None]:
# user_queries = ['Seller address in the document?' , 'Client address in the document?', 'seller Tax Id in the document?' ] #@param

# answer = dict()
# for i, user_query in enumerate(user_queries):
#     query_engine = index.as_query_engine()
#     response = query_engine.query(user_query)
#     print(response.response)