In [5]:
import fitz  # PyMuPDF
from qdrant_client import QdrantClient
from qdrant_client.http.models import VectorParams, CreateCollection, PointStruct
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Initialize Qdrant client
qdrant_client = QdrantClient(host="localhost", port=6333)

# Function to extract text from PDF and split into chunks
def extract_pdf_chunks(file_path, chunk_size=100):
    """Extract the text of a PDF and split it into fixed-size word chunks.

    Pages are processed one at a time, so a chunk never spans two pages and
    the last chunk of a page may contain fewer than ``chunk_size`` words.

    Args:
        file_path: Path of the PDF; passed straight to ``fitz.open``.
        chunk_size: Maximum number of whitespace-separated words per chunk.
            Must be positive.

    Returns:
        A list of strings, one per chunk, in page order. Pages without any
        words contribute no chunks.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A zero step makes range() fail with an unrelated message and a negative
    # step silently yields no chunks at all, so reject both up front.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    text_chunks = []
    doc = fitz.open(file_path)
    try:
        for page_num in range(doc.page_count):
            words = doc.load_page(page_num).get_text("text").split()
            text_chunks.extend(
                ' '.join(words[start:start + chunk_size])
                for start in range(0, len(words), chunk_size)
            )
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()
    return text_chunks

# Function to convert text to vectors using TF-IDF
def text_to_vectors(text_chunks):
    """Fit a TF-IDF model on the chunks and return their dense vectors.

    Args:
        text_chunks: The strings to vectorize; handed to
            ``TfidfVectorizer.fit_transform`` as-is.

    Returns:
        A ``(dense_matrix, vectorizer)`` tuple: the ``toarray()`` form of the
        fitted transform, and the fitted ``TfidfVectorizer`` itself so the
        caller can later transform query text with the same model.
    """
    tfidf = TfidfVectorizer()
    dense_matrix = tfidf.fit_transform(text_chunks).toarray()
    return dense_matrix, tfidf

# Process the PDF
pdf_path = "2305.14314v1.pdf"
text_chunks = extract_pdf_chunks(pdf_path)
if not text_chunks:
    # Fail with a clear message instead of an opaque vectorizer error.
    raise ValueError(f"No text could be extracted from {pdf_path!r}; nothing to index.")
vectors, vectorizer = text_to_vectors(text_chunks)  # Keep the fitted vectorizer for query time

# Define the Qdrant collection
collection_name = "pdf_chunks"
vector_size = vectors.shape[1]  # Width of the vectors produced by text_to_vectors
distance = "Cosine"

# Start from an empty collection with the correct vector size.
# `recreate_collection` is deprecated in qdrant-client; the replacement is an
# explicit existence check + delete + create.
if qdrant_client.collection_exists(collection_name):
    qdrant_client.delete_collection(collection_name)
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=vector_size, distance=distance),
)

# Add chunks to Qdrant in batches rather than one request per point.
# The point id is the chunk's position in `text_chunks`.
upsert_batch_size = 64
points = [
    PointStruct(id=idx, vector=vector.tolist(), payload={"text": chunk})
    for idx, (chunk, vector) in enumerate(zip(text_chunks, vectors))
]
for start in range(0, len(points), upsert_batch_size):
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points[start:start + upsert_batch_size],
    )

print("PDF text chunks added to Qdrant.")

  qdrant_client.recreate_collection(


PDF text chunks added to Qdrant.


# Simple Query

In [17]:
# Function to query Qdrant with a text query
def query_qdrant(query_text, vectorizer, top_k=5):
    """Search the Qdrant collection for the points closest to ``query_text``.

    Args:
        query_text: The text to search for.
        vectorizer: Object whose ``transform`` turns a list of strings into a
            matrix; it must be the one fitted on the indexed chunks so the
            query vector matches the collection's vector size.
        top_k: Maximum number of hits to request.

    Returns:
        Whatever ``qdrant_client.search`` returns for the module-level
        ``collection_name``.
    """
    # transform() takes a batch; we send a batch of one and take its only row.
    dense_rows = vectorizer.transform([query_text]).toarray()
    query_embedding = dense_rows[0].tolist()
    return qdrant_client.search(
        collection_name=collection_name,
        query_vector=query_embedding,
        limit=top_k,
    )

# Ask the question verbatim, embedded with the vectorizer fitted on the PDF chunks.
query_text = "What is this research paper Qlora about ?"
results = query_qdrant(query_text, vectorizer)

# One line per hit: point id, score, and the chunk text stored in the payload.
for result in results:
    chunk_text = result.payload['text']
    print(f"ID: {result.id}, Score: {result.score}, Text: {chunk_text}")

ID: 45, Score: 0.16390824, Text: full-finetuning in our experiments with 4-bit finetuning, this raises the question of where the performance-precision trade-off exactly lies for QLoRA tuning, which we leave to future work to explore. We proceed to investigate instruction tuning at scales that would be impossible to explore with full 16-bit finetuning on academic research hardware. 5 Pushing the Chatbot State-of-the-art with QLoRA Having established that 4-bit QLORA matches 16-bit performance across scales, tasks, and datasets we conduct an in-depth study of instruction finetuning up to the largest open-source language models available for research. To assess the performance of instruction finetuning these models, we evaluate
ID: 73, Score: 0.12678413, Text: generated with Nucleus Sampling [25] with p = 0.9. Of course, this is by no means comprehensive, since it is beyond the scope of this small qualitative study to control for all the variables involved, e.g., the full distribution of 

# Modified query using DSPy

In [8]:
import os

import dspy
from openaikey import key

# Export the API key BEFORE constructing the LM client, so the client does not
# depend on the environment being read lazily at request time.
# NOTE(review): `openaikey` is a local module; presumably `key()` returns the
# secret. Make sure that file stays out of version control.
os.environ["OPENAI_API_KEY"] = str(key())

# Set up the LM.
turbo = dspy.OpenAI(model='gpt-3.5-turbo-instruct', max_tokens=250)
dspy.settings.configure(lm=turbo)

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
class CoT(dspy.Module):
    """DSPy module that wraps a single ``dspy.ChainOfThought`` predictor.

    The predictor is built from the ``"question -> answer"`` signature string.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): the attribute name `prog` is presumably part of the
        # state restored by the later `cot.load(...)` call; confirm before
        # renaming it.
        signature = "question -> answer"
        self.prog = dspy.ChainOfThought(signature)

    def forward(self, question):
        """Run the predictor on ``question`` and return its result unchanged."""
        prediction = self.prog(question=question)
        return prediction

In [10]:
# Flag is True unless the DSP_CACHEBOOL environment variable equals "false"
# (compared case-insensitively); an unset variable is treated as 'True'.
dsp_cachebool = os.environ.get('DSP_CACHEBOOL', 'True')
cache_turn_on = dsp_cachebool.lower() != 'false'

In [11]:
# Build a fresh CoT module, then call load() on it with the JSON file below
# (presumably the saved output of an earlier optimization run; confirm).
optimized_program_path = "optimized_cot_bestprompter.json"
cot = CoT()
cot.load(optimized_program_path)

In [19]:
# Send the question through the loaded CoT module; keep the full result object.
result = cot(question=query_text)

# Only the `answer` field is carried forward; later cells use it as the query.
DSPYQuery = result.answer

# Show the answer text.
print(DSPYQuery)

I will read and analyze the research paper to determine its main topic, research question, and findings. Based on my analysis, I will provide a summary of the paper and its key points.


In [20]:
# Retrieve with the DSPy-generated text.
# `query_text` is deliberately NOT overwritten here: it holds the user's
# question that the CoT cell passes to the model, and clobbering it would make
# a re-run of that cell feed the model its own previous answer.
results = query_qdrant(DSPYQuery, vectorizer)  # Same vectorizer that indexed the chunks

# Display results
for result in results:
    print(f"ID: {result.id}, Score: {result.score}, Text: {result.payload['text']}")

ID: 32, Score: 0.13921885, Text: which use 16-bit BrainFloat. 4 QLoRA vs. Standard Finetuning We have discussed how QLoRA works and how it can significantly reduce the required memory for finetuning models. The main question now is whether QLoRA can perform as well as full-model finetuning. Furthermore, we want to analyze the components of QLoRA including the impact of NormalFloat4 over standard Float4. The following sections will discuss the experiments that aimed at answering these questions. 3https://docs.nvidia.com/cuda/cuda-c-programming-guide 5
ID: 45, Score: 0.13555625, Text: full-finetuning in our experiments with 4-bit finetuning, this raises the question of where the performance-precision trade-off exactly lies for QLoRA tuning, which we leave to future work to explore. We proceed to investigate instruction tuning at scales that would be impossible to explore with full 16-bit finetuning on academic research hardware. 5 Pushing the Chatbot State-of-the-art with QLoRA Having es

# RAG over Qdrant chunks using the DSPy query and ColBERTv2

In [30]:
# Retrieve with the DSPy-generated text. `query_text` is left untouched so it
# keeps holding the user's original question for the CoT cell.
results = query_qdrant(DSPYQuery, vectorizer)  # Same vectorizer that indexed the chunks

# Collect the text of every hit, in the order the search returned them.
chunks_list = [result.payload['text'] for result in results]

# Join the chunks into a single string, separated by a single space.
output_string = ' '.join(chunks_list)

# Print the result
print(output_string)


which use 16-bit BrainFloat. 4 QLoRA vs. Standard Finetuning We have discussed how QLoRA works and how it can significantly reduce the required memory for finetuning models. The main question now is whether QLoRA can perform as well as full-model finetuning. Furthermore, we want to analyze the components of QLoRA including the impact of NormalFloat4 over standard Float4. The following sections will discuss the experiments that aimed at answering these questions. 3https://docs.nvidia.com/cuda/cuda-c-programming-guide 5 full-finetuning in our experiments with 4-bit finetuning, this raises the question of where the performance-precision trade-off exactly lies for QLoRA tuning, which we leave to future work to explore. We proceed to investigate instruction tuning at scales that would be impossible to explore with full 16-bit finetuning on academic research hardware. 5 Pushing the Chatbot State-of-the-art with QLoRA Having established that 4-bit QLORA matches 16-bit performance across scale

In [40]:
import os
from llama_index.core import VectorStoreIndex
from llama_index.postprocessor.colbert_rerank import ColbertRerank
import logging
import sys

In [36]:
from uuid import uuid4

# `Document` is used below but was never imported anywhere in the notebook,
# so this cell raised NameError on a fresh kernel.
from llama_index.core import Document

# Wrap the concatenated retrieval output in a single Document with placeholder metadata.
docs = [
    Document(
        id_=str(uuid4()),  # Generates a unique ID for the document
        embedding=None,  # No precomputed embedding is supplied
        metadata={
            'total_pages': 1,  # Set to 1 since it's a single paragraph
            'file_path': 'sample_paragraph.txt',  # Dummy file path
            'source': '1'  # Dummy source number
        },
        excluded_embed_metadata_keys=[],
        excluded_llm_metadata_keys=[],
        relationships={},
        text=output_string,  # The actual content of the document
        mimetype='text/plain',  # MIME type for plain text
        start_char_idx=None,
        end_char_idx=None,
        text_template='{metadata_str}\n\n{content}',  # Template for text formatting
        metadata_template='{key}: {value}',  # Template for metadata formatting
        metadata_seperator='\n'  # Spelling kept as-is: this is the keyword the original call passes
    )
]

# Ensure the document has content
for doc in docs:
    if not doc.text:
        raise ValueError("Document has no content. Please ensure the document is properly created.")

print(docs)
# Create the index
index = VectorStoreIndex.from_documents(documents=docs)


[Document(id_='ff1a1691-ccc5-4622-bf77-e38a24cc0d28', embedding=None, metadata={'total_pages': 1, 'file_path': 'sample_paragraph.txt', 'source': '1'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='which use 16-bit BrainFloat. 4 QLoRA vs. Standard Finetuning We have discussed how QLoRA works and how it can significantly reduce the required memory for finetuning models. The main question now is whether QLoRA can perform as well as full-model finetuning. Furthermore, we want to analyze the components of QLoRA including the impact of NormalFloat4 over standard Float4. The following sections will discuss the experiments that aimed at answering these questions. 3https://docs.nvidia.com/cuda/cuda-c-programming-guide 5 full-finetuning in our experiments with 4-bit finetuning, this raises the question of where the performance-precision trade-off exactly lies for QLoRA tuning, which we leave to future work to explore. We proceed to investigate instructio

In [37]:
# The reranker's model and tokenizer are given the same identifier.
colbert_checkpoint = "colbert-ir/colbertv2.0"
colbert_reranker = ColbertRerank(
    top_n=5,
    model=colbert_checkpoint,
    tokenizer=colbert_checkpoint,
    keep_retrieval_score=True,
)

# Build a query engine over `index` with the reranker attached as a node
# postprocessor and similarity_top_k set to 10.
query_engine = index.as_query_engine(
    node_postprocessors=[colbert_reranker],
    similarity_top_k=10,
)



In [39]:
# Display the DSPy-generated text that is passed to `query_engine.query`.
DSPYQuery

'I will read and analyze the research paper to determine its main topic, research question, and findings. Based on my analysis, I will provide a summary of the paper and its key points.'

In [38]:
# Run the DSPy-generated text through the query engine and print its response.
response = query_engine.query(DSPYQuery)
print(response)

The research paper discusses the implementation and evaluation of QLoRA, a method aimed at reducing memory requirements for finetuning models. The main research question revolves around whether QLoRA can match the performance of full-model finetuning. The paper delves into experiments analyzing the components of QLoRA, particularly the impact of NormalFloat4 over standard Float4. It also explores the performance-precision trade-off for QLoRA tuning. Additionally, the paper highlights the significance of instruction tuning and its scalability compared to full 16-bit finetuning. The study concludes by showcasing that 4-bit QLoRA can achieve performance levels comparable to 16-bit models across various scales, tasks, and datasets. Furthermore, it emphasizes the importance of qualitative analysis alongside quantitative evaluations, pointing out potential biases in automated evaluation systems and the need to consider subjective preferences in assessing model performance.
