# Using FAISS and Lamini for RAG Pipeline

## Step 1: Import Libraries
Import the necessary libraries, including FAISS, Lamini, and jsonlines to handle embeddings and data processing.

In [1]:
import faiss
import lamini
import jsonlines

## Step 2: Set Parameters
Define parameters like the number of nearest neighbors to be returned.

In [2]:
# Number of nearest chunks to return
k = 2


## Step 3: Initialize Variables
Set up placeholders for the index and corresponding plain text splits. Instantiate Lamini's embedding client.

In [3]:
# Set up for the index, which holds the embeddings, and the splits, which holds the corresponding plain text
index = None
splits = []

# Instantiate Lamini's embedding client
embedding_client = lamini.Embedding()

### Step 4: Create Embeddings for Each Transcript
Read the document and create the embedding

In [4]:
import os
from pdf2image import convert_from_path
import pytesseract

# Function to extract text from a PDF
def read_file(pdf_path: str):
    """
    Reads a PDF file using pytesseract and returns the extracted text.
    Args:
        pdf_path (str): Path to the PDF file.
    """
    try:
        # Convert PDF to images
        pages = convert_from_path(pdf_path, 500)        
        # Extract text from each page
        all_text = ""
        for page_num, img_blob in enumerate(pages):
            text = pytesseract.image_to_string(img_blob, lang='eng')
            all_text += text + "\n"
    except Exception as e:
        print(f"Error reading file {pdf_path}: {e}")
    return all_text

# Loop through all PDF files in the /documents directory
documents_dir = "documents"
for filename in os.listdir(documents_dir):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(documents_dir, filename)
        raw_text = read_file(pdf_path)
        
        # Create embedding for the extracted text
        transcript_embedding = embedding_client.generate(raw_text)
        if not index:
            index = faiss.IndexFlatL2(transcript_embedding.size) # Set the size of the index based on model embedding size
        index.add(transcript_embedding)
        splits.append(raw_text)


## Step 5: Create Embedding for the Question
Create an embedding for the user's question to find the most relevant text from the index.

In [5]:
# Define the question and create its embedding
question = "Explain this case and what the documents mean. list the title, explain the title and the number of pages"
question_embedding = embedding_client.generate(question)

## Step 6: Find Nearest Neighbors
Use FAISS to find the top ( k ) nearest neighbors for the question embedding.

In [6]:
# Find the k nearest neighbors and retrieve relevant data
distances, indices = index.search(question_embedding, 10)
relevant_data = [splits[i] for i in indices[0] if i >= 0]
indices

array([[14, 10, 12, 11, 15,  2,  6,  8, 13,  1]])

### Step 7: Instantiate LLM Client
Initialize Lamini's LLM client for generating the final response based on the relevant data.

In [None]:
# Instantiate Lamini's LLM client
llm = lamini.Lamini(model_name="meta-llama/Meta-Llama-3.1-8B-Instruct")

## Step 8: Prepare the Prompt
Construct a prompt for Lamini's LLM using the retrieved relevant data and the question.

In [8]:
# Form the prompt using the retrieved data and the question
prompt = f"""
<|begin_of_text|><|start_header_id|>user<|end_header_id|>
Please answer the following question as if you are an experienced lawyer. Your answer should be accurate but also easily understood by the general public.
{relevant_data}
{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""
relevant_data

['sn onicie WA\n\nCLERK’S OFFICE :\nUNITED STATES DISTRICT AND BANKRUPTCY GQURESp\nFOR THE DISTRICT OF COLUMBIA STATE OF Gig QURT\n\n333 CONSTITUTION AVENUE, NW HOMA\n\nWASHINGTON, DC 20001 APR 6 2024\n; JOHN D\nApril 11, 2024 - HADD\nP CLERK EN\n\nThe Oklahoma Supreme Court\n\n2100 N. Lincoln Blvd., Suite 1 #4 ? ? 1 098\nOklahoma City, OK 73105\n\nIN RE: Case No.: 20-cv-02167-TJK\n\nDear Clerk,\n\nEnclosed are the following documents:\n1. Certified copy of Order Certifying Question of Law to the Supreme Court of Oklahoma.\n2. Copy of full docket sheet.\n\n3. Copy of abridged docket sheet, including only the pleadings designated by the Court.\n4. Copies of designated pleadings.\n\nPlease return a receipt copy of this letter for our records.\n\nThank you for your assistance.\n\nSincerely.\n\nANGELA D. CAESAR, CLERK\n\nOperations Supervisor\n(202) 354-3164\n\n',
 'wan N Un\n\nCase 1:20-cv-02167-TJK Document 195 Filed 04/08/24 Page 1 of 9\nCESS AL FILED\nORIGINAL = 4PREEBour\n\nUNITED STA

#### Step 9: Generate the Response
Use the constructed prompt to generate the answer to the user's question with Lamini's LLM.

In [9]:
# Generate the answer using Lamini's LLM
response = llm.generate(prompt)
print(response)

RequestTimeoutError: RequestTimeoutError