In [11]:
import os
import sys
# Put the parent directory (resolved to an absolute path) first on the import
# search path so that modules living one level above this notebook can be imported.
sys.path.insert(0, os.path.abspath(os.pardir))

In [12]:
import configparser
import glob
import json
import logging
import os

import torch
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings, HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import Qdrant
from qdrant_client import QdrantClient
from torch import cuda
from transformers import AutoTokenizer

from reports import files, report_list


# read all the necessary variables
# Choose the torch device for the embedding model: CUDA when present, then the
# Apple-silicon MPS backend, and finally plain CPU. Without the CPU fallback a
# machine that has neither accelerator would be handed 'mps' and fail later
# when the model is moved to that device.
_mps_backend = getattr(torch.backends, "mps", None)  # attribute is absent on older torch builds
if cuda.is_available():
    device = 'cuda'
elif _mps_backend is not None and _mps_backend.is_available():
    device = 'mps'
else:
    device = 'cpu'

# Root folder of the reports; load_chunks() reads <path_to_data><name>/<name>.chunks.json
path_to_data = "../../reports/"


In [52]:
##--------------------- functions ------------------------------------------##
def getconfig(configfile_path:str):
    """
    Load an INI-style ``.cfg`` file into a ``configparser.ConfigParser``.

    configfile_path: file path of .cfg file

    Returns the populated parser. When the file cannot be opened or cannot be
    parsed, a warning is logged and ``None`` is returned, so callers must check
    the result before calling ``.get`` on it.
    """

    config = configparser.ConfigParser()

    try:
        # the context manager guarantees the handle is closed, even if parsing fails
        with open(configfile_path) as config_file:
            config.read_file(config_file)
    except OSError:
        # missing file, permission problem, path is a directory, ...
        logging.warning("config file not found")
        return None
    except (configparser.Error, UnicodeDecodeError) as err:
        # the file exists but is not a valid config file
        logging.warning("config file could not be parsed: %s", err)
        return None
    return config
        
def open_file(filepath):
    """
    Read the JSON file at ``filepath`` and return the parsed Python object.

    filepath: path of the JSON file to load

    Errors from opening or decoding the file (``OSError``,
    ``json.JSONDecodeError``) propagate to the caller.
    """
    # JSON is UTF-8 encoded; do not depend on the platform's default encoding
    with open(filepath, encoding="utf-8") as file:
        return json.load(file)

def _chunks_to_documents(chunks, category, subtype, file):
    """Wrap the raw chunks of one report in ``Document`` objects carrying the filter metadata."""
    return [
        Document(page_content=chunk['content'],
                 metadata={"source": category,
                           "subtype": subtype,
                           # the year is the last four characters of the report name
                           "year": file[-4:],
                           "filename": file,
                           "page": chunk['metadata']['page'],
                           "headings": chunk['metadata']['headings']})
        for chunk in chunks
    ]


def load_chunks():
    """
    this method reads through the ``files`` mapping to create the vector database

    ``files`` is walked as ``files[category][subtype] -> iterable of report names``.
    For every report the pre-computed chunks are read from
    ``<path_to_data><name>/<name>.chunks.json``; each chunk is expected to provide
    ``content`` plus ``metadata.page`` and ``metadata.headings``. The category
    ('source') and subtype are stored as metadata so they can be used later for
    document selection in the UI and for filtering the database.

    A report whose chunk file cannot be read or decoded is reported and skipped.

    Returns a tuple ``(all_documents, qdrant_collections, embeddings)``:
      * all_documents: dict mapping each category to its flat list of documents,
        plus the key 'allreports' holding the documents of every category
      * qdrant_collections: dict with the key 'allreports' mapping to the Qdrant
        vector store built from all documents
      * embeddings: the HuggingFaceEmbeddings instance used for the store

    Raises RuntimeError when the model configuration cannot be loaded and
    ValueError when no chunk could be loaded at all.

    NOTE(review): the collection is written to the on-disk folder "../../qdrant/";
    re-running against an existing folder may add the same chunks again - confirm
    whether the folder should be cleared (or the collection recreated) first.
    """
    config = getconfig("../../model_params.cfg")
    if config is None:
        # getconfig() only logs and returns None on failure; stop here instead of
        # failing with an AttributeError after all reports have been processed
        raise RuntimeError("could not load ../../model_params.cfg")

    all_documents = {}
    # iterate through 'source'
    for category, subtypes in files.items():
        print("documents splitting in source:",category)
        all_documents[category] = []
        # iterate through 'subtype' within the source
        # example source/category == 'District', has subtypes which is district names
        for subtype, report_names in subtypes.items():
            print("document splitting for subtype:",subtype)
            for file in report_names:
                chunks_path = path_to_data + file + "/" + file + ".chunks.json"

                # load the chunks
                try:
                    doc_processed = open_file(chunks_path)
                except (OSError, ValueError) as e:
                    # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                    # Skip this report: carrying on would either hit an undefined
                    # name or re-add the previous report's chunks under this
                    # report's filename/year.
                    print("Exception: ", e)
                    continue
                print("chunks in subtype:",subtype, "are:",len(doc_processed))

                # add metadata information
                all_documents[category].extend(
                    _chunks_to_documents(doc_processed, category, subtype, file))

    for key, docs_processed in all_documents.items():
        print("length of chunks in source:",key, "are:",len(docs_processed))

    # the combined list is built before the 'allreports' key is added to the dict
    all_documents['allreports'] = [doc for docs in all_documents.values() for doc in docs]
    if not all_documents['allreports']:
        raise ValueError("no chunks were loaded; nothing to embed")

    # define embedding model
    embeddings = HuggingFaceEmbeddings(
        model_kwargs = {'device': device},
        encode_kwargs = {'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE')))},
        model_name=config.get('retriever','MODEL')
    )

    # only the combined 'allreports' list is embedded into the vector store
    print("emebddings for:","allreports")
    qdrant_collections = {
        "allreports": Qdrant.from_documents(
            all_documents['allreports'],
            embeddings,
            path="../../qdrant/",
            #location=":memory:",
            collection_name="uganda-documents",
        )
    }
    print(qdrant_collections)
    print("vector embeddings done")
    return all_documents, qdrant_collections, embeddings

In [53]:
# Run the full pipeline once: load_chunks() returns the per-source document lists,
# the dict of Qdrant vector stores (keyed by 'allreports') and the embedding model.
all_documents, vectorstores, embeddings = load_chunks()

documents splitting in source: Consolidated
document splitting for subtype: Consolidated Annnual Report
chunks in subtype: Consolidated Annnual Report are: 2382
chunks in subtype: Consolidated Annnual Report are: 3485
length of chunks in source: Consolidated are: 5867
emebddings for: allreports
{'allreports': <langchain_community.vectorstores.qdrant.Qdrant object at 0x1398415a0>}
vector embeddings done


In [62]:
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_qdrant import QdrantVectorStore
from langchain_community.vectorstores import Qdrant

# Re-open the collection persisted under "../../qdrant/" without re-embedding the
# reports. The embedding model is configured from the same 'retriever' settings
# (MODEL / NORMALIZE) that load_chunks() reads.
# NOTE(review): if `vectorstores` from the load_chunks() cell is still alive in this
# kernel, a second local client on the same folder may be rejected - confirm.
config = getconfig("../../model_params.cfg")

retriever_embeddings = HuggingFaceEmbeddings(
    model_kwargs={'device': device},
    encode_kwargs={
        # NORMALIZE is read as text; int() then bool() turns it into a flag
        'normalize_embeddings': bool(int(config.get('retriever','NORMALIZE'))),
    },
    model_name=config.get('retriever','MODEL'),
)

vector_store = Qdrant.from_existing_collection(
    embedding=retriever_embeddings,
    path="../../qdrant/",
    collection_name="uganda-documents",
)