# Pronova LLM Train Model #
## Use this notebook to do the following ##
- Create Qdrant collections
- Chunk text files into smaller chunks
- Create embeddings for data chunks and queries
- Delete Qdrant vectors

In [2]:
# Load required libraries
import os
from qdrant_client import QdrantClient
from qdrant_client.http import models
from openai import OpenAI
from dotenv import load_dotenv
from IPython.display import Markdown, display

# Load environment variables from .env file
load_dotenv()

True

### Setup Qdrant connection ###

In [3]:
# Read the Qdrant credentials from the environment and fail fast when either
# one is missing, so later cells never run against a half-configured client.
Qdrant_api_key = os.getenv('Qdrant_API_KEY')
if not Qdrant_api_key:
    raise ValueError("No Qdrant API key found in environment variables")
Qdrant_url = os.getenv('Qdrant_URL')
if not Qdrant_url:
    raise ValueError("No Qdrant URL found in environment variables")


# Initialize Qdrant client
try:
    Qclient = QdrantClient(
        url=Qdrant_url,
        api_key=Qdrant_api_key
    )
    # Building the client object does not by itself prove that the URL is
    # reachable or that the key is accepted, so make one lightweight request
    # before reporting success. Any failure is reported and re-raised below.
    Qclient.get_collections()
    print("Successfully connected to Qdrant")
except Exception as e:
    print(f"Failed to connect to Qdrant: {e}")
    raise

Successfully connected to Qdrant


### Setup OpenAI connection ###

In [4]:
# Look up the OpenAI API key in the environment and stop immediately if it is
# absent or empty.
OpenAI_api_key = os.environ.get('OPENAI_API_KEY')
if OpenAI_api_key is None or OpenAI_api_key == "":
    raise ValueError("No OpenAI API key found in environment variables")

# Store the key as an attribute on the OpenAI class itself.
# NOTE(review): this sets a class attribute, not an instance attribute; clients
# created later with `OpenAI()` presumably pick up OPENAI_API_KEY from the
# environment on their own, so this assignment may have no effect - confirm.
setattr(OpenAI, "api_key", OpenAI_api_key)

### Creating a Qdrant Collection (function) ###

In [5]:
# Create collection
def create_qdrant_collection(collection_name, vector_size=1536, distance=None):
    """Create a Qdrant collection with a single vector configuration.

    Args:
        collection_name: Name of the collection to create.
        vector_size: Length of the vectors the collection will store. The
            default of 1536 is the value this notebook previously hard-coded;
            it must match the length of the embeddings that get upserted.
        distance: Distance metric (a ``models.Distance`` member). ``None``
            (the default) selects ``models.Distance.COSINE``.

    Raises:
        Exception: Any error raised by ``Qclient.create_collection`` is printed
            and then re-raised unchanged.
    """
    # Resolve the default lazily so the signature does not depend on `models`
    # at definition time.
    if distance is None:
        distance = models.Distance.COSINE

    try:
        Qclient.create_collection(
            collection_name=collection_name,
            vectors_config=models.VectorParams(size=vector_size, distance=distance),
        )
        print(f"Collection '{collection_name}' created successfully")
    except Exception as e:
        print(f"Failed to create collection '{collection_name}': {e}")
        raise

### Get an OpenAI embedding from a text segment (Function) ###

In [6]:
# Helper that lazily builds one OpenAI client and hands it back on every call
def _get_openai_client():
    """Return a shared OpenAI client, creating it on first use.

    The instance is cached on this function object so that embedding many
    chunks does not build a new client for each one. Re-running the cell
    redefines the function and therefore discards the cached client.
    """
    client = getattr(_get_openai_client, "_client", None)
    if client is None:
        # Constructed without arguments, so the client uses its own default
        # credential lookup (presumably the OPENAI_API_KEY environment variable).
        client = OpenAI()
        _get_openai_client._client = client
    return client


# Function to get the embedding of a text
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding for ``text``.

    Args:
        text: The input passed to the embeddings endpoint.
        model: Name of the embedding model. Defaults to the model this notebook
            previously hard-coded. The vector length produced by the model must
            match the size of the target Qdrant collection.

    Returns:
        The ``embedding`` field of the first item in the response data.
    """
    response = _get_openai_client().embeddings.create(
        model=model,
        input=text
    )
    return response.data[0].embedding

### Turn a file into chunks (Function) ###

In [7]:
# Function to read a text file and chunk its content
def chunk_text_from_file(file_path, chunk_size=400):
    """Read a UTF-8 text file and split it into consecutive fixed-size chunks.

    Chunks are plain character slices: every chunk has ``chunk_size``
    characters except possibly the last one, which holds the remainder.
    An empty file yields an empty list.

    Args:
        file_path: Path of the text file to read.
        chunk_size: Number of characters per chunk; must be positive.

    Returns:
        A list of strings that, concatenated in order, reproduce the file text.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop the whole
    # file; a zero step makes range() fail with an unrelated-looking message.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]

### Get embedding from a list of chunks (Function) ###

In [8]:
# Function to get embeddings for a list of text chunks
def get_embeddings_for_chunks(chunks):
    """Embed every chunk, in order, and return the results as a list.

    One ``get_embedding`` call is made per chunk, so the item at position
    ``i`` of the returned list is the embedding of ``chunks[i]``.
    """
    return [get_embedding(piece) for piece in chunks]

### Get largest Qdrant ID (function) ###

In [9]:
def get_qdrant_next_id(collection_name):
    """Return the number of points currently stored in a collection.

    The exact count API is used rather than the ``points_count`` field of the
    collection info, because this value is used as an ID offset and must not be
    an estimate or ``None``.

    NOTE(review): ``upsert_embeddings`` adds 1 to this value to pick new point
    IDs. That stays collision-free only while IDs are assigned sequentially and
    points are never deleted individually - confirm this holds for the data.

    Args:
        collection_name: Name of the collection to inspect.

    Returns:
        The point count as an int (0 for an empty collection).

    Raises:
        Exception: Any error raised by the Qdrant client is printed and then
            re-raised unchanged.
    """
    try:
        result = Qclient.count(collection_name=collection_name, exact=True)
        point_count = result.count
    except Exception as e:
        print(f"Failed to get next ID from collection '{collection_name}': {e}")
        raise

    # Keep the original contract of always handing back an int.
    return 0 if point_count is None else point_count

### Upsert lists of documents to Qdrant (Function) ###
#### Mind the parameters ####

In [10]:
# function to upsert embeddings into Qdrant
def upsert_embeddings(collection_name, embeddings, chunks, filename):
    """Upsert one point per (embedding, chunk) pair into a Qdrant collection.

    Point IDs continue after the value returned by ``get_qdrant_next_id``:
    the first new point gets that value plus 1, the second plus 2, and so on.
    Each point carries the chunk text and the source file name as payload.

    Args:
        collection_name: Target collection.
        embeddings: Sequence of vectors, one per chunk.
        chunks: Sequence of text chunks, aligned with ``embeddings``.
        filename: Stored in every payload under ``"source_file"``.

    Raises:
        ValueError: If ``embeddings`` and ``chunks`` differ in length.
        Exception: Any error raised by ``Qclient.upsert`` is printed and then
            re-raised unchanged.
    """
    # Misaligned inputs would either index past the end of `chunks` or quietly
    # leave some chunks out, so reject them up front.
    if len(embeddings) != len(chunks):
        raise ValueError(
            f"embeddings and chunks must have the same length "
            f"(got {len(embeddings)} and {len(chunks)})"
        )

    # Nothing to store (for example an empty source file): skip the request.
    if len(embeddings) == 0:
        print(f"No embeddings to upsert for '{filename}'")
        return

    largest_id = get_qdrant_next_id(collection_name)
    points = [
        {
            "id": largest_id + offset,
            "vector": vector,
            "payload": {
                "text": chunk,   # Attach the chunk as payload
                "source_file": filename  # Add source file
            }
        }
        for offset, (vector, chunk) in enumerate(zip(embeddings, chunks), start=1)
    ]

    try:
        Qclient.upsert(
            collection_name=collection_name,
            points=points
        )
        print("Embeddings upserted successfully")
    except Exception as e:
        print(f"Failed to upsert embeddings: {e}")
        raise

### Delete all entries in a collection, effectively resetting a model (Function) ###
**BE CAREFUL WITH THIS.** *(It is secured by a verification process)*

In [11]:
def reset_model(collection_name):
    """Delete a collection and recreate it empty, after interactive confirmation.

    The user must type exactly ``DELETE`` at the prompt; any other input aborts
    without touching the collection. After a successful delete the collection
    is recreated through ``create_qdrant_collection`` using that function's
    default settings.

    Args:
        collection_name: Name of the collection to reset.

    Raises:
        Exception: Errors from deleting the collection are printed and then
            re-raised. Errors from recreating it propagate from
            ``create_qdrant_collection``, which reports them itself.
    """
    confirmation = input("Type 'DELETE' to confirm deletion of entries in " + collection_name + ". This cannot be undone. Type anything else to abort")
    if confirmation != "DELETE":
        print("Deletion aborted")
        return

    try:
        Qclient.delete_collection(collection_name=collection_name)
    except Exception as e:
        print(f"Failed to delete collection '{collection_name}': {e}")
        raise
    print(f"Collection '{collection_name}' deleted successfully")

    # Kept outside the try block above: a failure here happens after the delete
    # already succeeded, so it must not be reported as a failed deletion.
    create_qdrant_collection(collection_name)

### Process Files From Folder (function) ###

In [12]:
def process_files_from_folder(collection_name, file_path, limit=None):
    """Chunk, embed and upsert the files found directly inside a folder.

    Only regular files are processed; sub-directories and other entries are
    skipped. Files are handled in sorted name order so repeated runs with the
    same ``limit`` pick the same files.

    Args:
        collection_name: Qdrant collection that receives the points.
        file_path: Path of the folder to scan (not of a single file).
        limit: Maximum number of files to process. ``None`` (the default)
            processes every file; zero or a negative value processes none.
    """
    # Build the list of regular files once and use it for both the count and
    # the loop, so directories can neither consume the limit nor reach open().
    file_names = sorted(
        name for name in os.listdir(file_path)
        if os.path.isfile(os.path.join(file_path, name))
    )
    if limit is None:
        limit = len(file_names)

    # max(..., 0) keeps a negative limit meaning "process nothing" instead of
    # letting it act as a slice-from-the-end.
    for current_file, filename in enumerate(file_names[:max(limit, 0)], start=1):
        curr_file_path = os.path.join(file_path, filename)
        print(f"Processing file {current_file} of max {limit}: {filename}")

        # Chunk the file
        chunks = chunk_text_from_file(curr_file_path)
        # Get embeddings for the chunks
        embeddings = get_embeddings_for_chunks(chunks)
        # Upsert the embeddings into Qdrant
        upsert_embeddings(collection_name, embeddings, chunks, filename)
