In [0]:
# NOTE(review): `z` is not defined anywhere in this notebook; it is presumably
# injected by the hosting environment (the stack trace at the end of the
# notebook mentions Apache Zeppelin). What jup() does is not visible here — confirm.
z.jup()

In [0]:
# Host-provided helper (`z` is not defined in this notebook). Judging by the
# captured output below, it reports CPU, memory and GPU usage.
z.showUsage()

---CPU---


#0: 1.6%	#1: 0.0%	#2: 6.2%
#3: 1.6%	#4: 6.2%	#5: 0.0%
#6: 3.1%	#7: 0.0%	#8: 0.0%
#9: 1.6%	#10: 4.7%	#11: 3.1%
#12: 1.6%	#13: 4.7%	#14: 1.6%
#15: 0.0%	#16: 0.0%	#17: 0.0%
#18: 1.6%	#19: 1.5%



---Memory---


46360MiB (71%) free, 65237MiB total



---GPU---


#0. Tesla P100-PCIE-16GB


memory: 648MiB (3%) free, 16287MiB total
(46360, [648])


In [0]:
import os
import asyncio
import multiprocessing

from flask import Flask, request, send_file, jsonify
from langchain.load import dumps, loads
from langchain_core.documents.base import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableLambda

from langchain_community.chat_models import ChatLlamaCpp
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient

from tempfile import NamedTemporaryFile
from httpx import TimeoutException
from tenacity import retry, stop_after_attempt, wait_exponential

# Path to the GGUF weights loaded by llama.cpp.
GEMMA2_MODEL = "gemma-2-9b-it-Q5_K_L.gguf"
# GEMMA2_MODEL = r"C:\Users\wwhac\Downloads\gemma-2-9b-it-Q5_K_L.gguf"

# Sentence-embedding model used for indexing and querying.
EMBEDDINGS_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
# EMBEDDINGS_MODEL = "vinai/PhoGPT-4B-Chat"

# Deployment-specific endpoints and credentials are read from the environment
# so they are never stored in the notebook. Each value is None when the
# variable is not set.
DASHBOARD = os.environ.get("DASHBOARD")
QDRANT_URL = os.environ.get("QDRANT_URL")
QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY")

# Shared collection searched for every user, plus the payload layout used
# when reading from / writing to Qdrant.
COLLECTION_NAME = "DATA-SOC"
CONTENT_PAYLOAD_KEY = "content"
METADATA_PAYLOAD_KEY = "metadata"

# Uploaded PDFs are stored under SAVE_PATH/<user_id>/ and pushed to Qdrant
# in batches of BATCH_SIZE_UPLOAD chunks.
SAVE_PATH = '../data-store'
BATCH_SIZE_UPLOAD = 10

# Retrieval sizing: TOP_K hits per query per collection, MAX_SAME_QUERY
# paraphrases requested from the LLM in addition to the original query.
TOP_K = 5
MAX_SAME_QUERY = 1
MAX_DOCS_FOR_CONTEXT = (MAX_SAME_QUERY + 1) * TOP_K

# Keyword arguments forwarded to the embedding model wrapper.
model_kwargs = {"device": "cuda"}
encode_kwargs = {"normalize_embeddings": True}

# Shared embedding model: passed to the semantic chunker and to every Qdrant
# store created in this notebook.
# NOTE(review): HuggingFaceBgeEmbeddings is the BGE-specific wrapper, while
# EMBEDDINGS_MODEL is a MiniLM sentence-transformers model. The wrapper may
# prepend a BGE query instruction to queries — confirm this pairing is intended.
embeddings = HuggingFaceBgeEmbeddings(
    model_name=EMBEDDINGS_MODEL,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Local chat model loaded from GEMMA2_MODEL; llms_process_template pipes every
# prompt through it.
gemma_2_model = ChatLlamaCpp(
    model_path=GEMMA2_MODEL,
    verbose=False, 
    temperature=0.5,
    n_gpu_layers=-1,  # NOTE(review): in llama-cpp-python, -1 offloads ALL layers to the GPU (0 keeps them on CPU) — confirm intent
    n_ctx=4096,  # Context window size, in tokens
    max_tokens=4096,  # Generation cap; NOTE(review): equal to n_ctx, so the prompt and the output share one 4096-token budget
    f16_kv=False,  # NOTE(review): presumably turns off the fp16 key/value cache; the memory effect is not verified here
    n_threads=multiprocessing.cpu_count()-1,  # One thread fewer than the reported CPU count
)

# Flask application object; every route in this notebook is registered on it.
app = Flask("SOC-API-CHATBOT")

In [0]:
#---Start---llms response----
async def llms_process_template(template: str):
    """Send an already-rendered prompt to the chat model and return its text.

    ``template`` is treated as literal text: callers build it with f-strings,
    so it may contain arbitrary user or document content. Curly braces are
    escaped before the text is handed to ``PromptTemplate``; otherwise any
    ``{...}`` in that content would be parsed as a placeholder and the chain
    would fail, because no input variables are ever supplied.

    Args:
        template: The complete prompt text.

    Returns:
        The model response as a plain string.

    Raises:
        TypeError: If ``template`` is not a string.
    """
    if not isinstance(template, str):
        raise TypeError("template must be a string")

    # Double the braces so the f-string formatter emits them verbatim.
    literal_text = template.replace("{", "{{").replace("}", "}}")

    prompt = PromptTemplate(
        template=literal_text,
        input_variables=[]
    )

    chain = (
        prompt
        | gemma_2_model
        | StrOutputParser()
    )

    return await chain.ainvoke({})

#---End---llms response----

In [0]:
#---Start---query----
def collection_exists(client: QdrantClient, collection_name: str) -> bool:
    """Tell whether ``collection_name`` is among the collections listed by ``client``."""
    for entry in client.get_collections().collections:
        if entry.name == collection_name:
            return True
    return False

def existing_collection(collection_name: str) -> Qdrant:
    """Open the vector store for ``collection_name``.

    Returns a ``Qdrant`` store bound to the shared embedding model, or
    ``None`` when the collection is not present on the server.
    """
    connection = {"url": QDRANT_URL, "api_key": QDRANT_API_KEY}

    if collection_exists(QdrantClient(**connection), collection_name):
        return Qdrant.from_existing_collection(
            embedding=embeddings,
            collection_name=collection_name,
            content_payload_key=CONTENT_PAYLOAD_KEY,
            metadata_payload_key=METADATA_PAYLOAD_KEY,
            **connection,
        )

    return None

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked result lists with reciprocal rank fusion (RRF).

    Each entry of ``results`` is one ranked list. Its items are either bare
    documents or ``(document, score)`` pairs such as those returned by
    ``similarity_search_with_score``. Items are identified by the serialized
    *document only*, so a document retrieved by several queries (each with a
    different similarity score) accumulates a single fused score instead of
    being counted as unrelated entries.

    Args:
        results: Ranked lists to merge, best hit first in each list.
        k: RRF smoothing constant; each occurrence adds ``1 / (rank + k)``
            where ``rank`` is the 0-based position within its list.

    Returns:
        At most ``MAX_DOCS_FOR_CONTEXT`` items ordered by descending fused
        score. Each item is the first occurrence seen for that document, in
        the shape it was passed in, so ``item[0]`` is the document when the
        input consisted of ``(document, score)`` pairs.
    """
    fused_scores = {}
    first_seen = {}

    for ranked_hits in results:
        for rank, hit in enumerate(ranked_hits):
            # Strip the per-query similarity score before building the key.
            document = hit[0] if isinstance(hit, (tuple, list)) else hit
            key = dumps(document)
            if key not in fused_scores:
                fused_scores[key] = 0
                first_seen[key] = hit
            fused_scores[key] += 1 / (rank + k)

    # sorted() is stable, so ties keep their first-seen order.
    ordered_keys = sorted(fused_scores, key=fused_scores.get, reverse=True)

    return [first_seen[key] for key in ordered_keys[:MAX_DOCS_FOR_CONTEXT]]

async def query_generator(original_query: dict) -> list[str]:
    """Ask the LLM for paraphrases of the user's question.

    Reads ``original_query["user_query"]`` and returns a list whose first
    element is the original question prefixed with ``"0. "``, followed by the
    non-empty, stripped lines of the model's answer.
    """
    user_query = original_query.get("user_query")

    template = f"""
        Đưa ra {MAX_SAME_QUERY} câu hỏi cùng sát với ý nghĩa câu sau: {user_query}. 
        Chỉ sinh bằng tiếng Việt Nam, không sinh thêm văn bản gì!
    """

    llm_output = await llms_process_template(template)

    # One paraphrase per line; drop blank lines and surrounding whitespace.
    paraphrases = [
        line.strip()
        for line in llm_output.split("\n")
        if line.strip() != ""
    ]

    return ["0. " + user_query, *paraphrases]

async def similarity_search(para: dict) -> list[Document]:
    """Retrieve and fuse hits for the user's question and its paraphrases.

    Searches the shared collection and the collection named after
    ``para["user_id"]`` (skipping whichever does not exist) once per generated
    query, then merges all result lists with ``reciprocal_rank_fusion``.
    """
    # Shared store first, then the per-user store: the per-query result order
    # below follows this order.
    stores = [
        existing_collection(COLLECTION_NAME),
        existing_collection(para["user_id"]),
    ]
    queries = await query_generator(para)

    result_lists = [
        store.similarity_search_with_score(q, k=TOP_K)
        for q in queries
        for store in stores
        if store
    ]

    return reciprocal_rank_fusion(result_lists)

async def query(user_query: str, user_id: str):
    """Answer ``user_query`` from documents retrieved out of the vector store.

    Returns a dict with the retrieved passages (``"context"``), the model
    answer (``"response"``) and the exact prompt that was sent (``"template"``).
    """
    retriever = RunnableLambda(similarity_search)
    ranked_hits = await retriever.ainvoke({'user_query': user_query, 'user_id': user_id})

    # Each hit is indexable and holds the document at position 0.
    context = [hit[0].page_content for hit in ranked_hits]
    question = user_query

    template = f"""
        Vui lòng trả lời [câu hỏi] chỉ bằng [thông tin] sau.
        Thông tin: {context}
        Câu hỏi: {question}
        Câu trả lời cuối cùng:
    """

    response = await llms_process_template(template)

    return {"context": context, "response": response, "template": template}


In [0]:
#---Start---upload----
def check_file_existed(folder, filename):
    """Return the stored path of ``filename`` inside ``SAVE_PATH/folder``, or None.

    ``filename`` comes from the client, so only its final path component is
    used; an empty or missing name never matches. The path must point at a
    regular file, so the user folder itself is never reported as an existing
    upload.

    Args:
        folder: Sub-folder of ``SAVE_PATH`` (the user id in this notebook).
        filename: Name of the uploaded file as sent by the client.

    Returns:
        The path of the stored file, or ``None`` when there is none.
    """
    if not filename:
        return None

    # Normalise Windows separators, then drop any directory components.
    safe_name = os.path.basename(filename.replace("\\", "/"))
    if safe_name in ("", ".", ".."):
        return None

    file_path = os.path.join(SAVE_PATH, folder, safe_name)
    return file_path if os.path.isfile(file_path) else None

async def save_pdf(file, user_id):
    """Store an uploaded file under ``SAVE_PATH/<user_id>/`` and return its path.

    The content is first written to a temporary file in the destination folder
    and then moved into place, so a partially written file never appears under
    the final name.

    Args:
        file: Uploaded file object exposing ``read()`` and ``filename``.
        user_id: Name of the per-user sub-folder.

    Returns:
        Absolute path of the stored file.

    Raises:
        ValueError: If the upload carries no usable filename.
        OSError: If the file cannot be written or moved into place.
    """
    # The filename is client-controlled: keep only the last path component so
    # it cannot point outside the user's folder.
    raw_name = file.filename or ""
    file_name = os.path.basename(raw_name.replace("\\", "/"))
    if file_name in ("", ".", ".."):
        raise ValueError("Uploaded file has no usable filename")

    folder = os.path.join(SAVE_PATH, user_id)
    # Creates SAVE_PATH as well when needed; no error if it already exists.
    os.makedirs(folder, exist_ok=True)

    pdf_content = file.read()

    with NamedTemporaryFile(delete=False, dir=folder, suffix='.pdf') as temp_file:
        temp_file.write(pdf_content)
        temp_path = temp_file.name

    file_abs_path = os.path.abspath(os.path.join(folder, file_name))
    try:
        # os.replace overwrites an existing destination on every platform.
        os.replace(temp_path, file_abs_path)
    except OSError:
        # Do not leave the orphaned temporary file behind.
        os.remove(temp_path)
        raise

    return file_abs_path
    

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    # Re-raise the last real exception instead of tenacity.RetryError so that
    # callers catching TimeoutException actually see it.
    reraise=True,
)
def upload_to_qdrant(docs, user_id):
    """Embed ``docs`` and write them to the Qdrant collection named ``user_id``.

    The call is attempted up to three times with exponential back-off; any
    exception triggers a retry. After the last attempt the underlying
    exception is propagated unchanged.

    Args:
        docs: Documents to index.
        user_id: Name of the target collection.

    Raises:
        TimeoutException: If the final attempt timed out.
        Exception: Whatever else the final attempt raised.
    """
    try:
        Qdrant.from_documents(
            documents=docs,
            embedding=embeddings,
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
            collection_name=user_id,
            content_payload_key=CONTENT_PAYLOAD_KEY,
            metadata_payload_key=METADATA_PAYLOAD_KEY,
        )
    except TimeoutException as e:
        # Logged on every attempt; the retry decorator decides whether to go on.
        print(f"Timeout occurred: {e}")
        raise

async def upload_pdf(file_path, user_id):
    """Split the PDF at ``file_path`` into chunks and index them for ``user_id``.

    Chunks are uploaded in batches of ``BATCH_SIZE_UPLOAD``. The upload stops
    at the first batch that fails; batches sent before the failure are not
    rolled back.

    Args:
        file_path: Path of the PDF to load.
        user_id: Name of the target Qdrant collection.

    Returns:
        True when every batch was uploaded, False as soon as one batch fails.
    """
    text_splitter = SemanticChunker(embeddings=embeddings, breakpoint_threshold_type="percentile")
    raw_documents = PyPDFLoader(file_path).load()
    docs = text_splitter.split_documents(raw_documents)

    for start in range(0, len(docs), BATCH_SIZE_UPLOAD):
        batch = docs[start:start + BATCH_SIZE_UPLOAD]
        try:
            upload_to_qdrant(batch, user_id)
        except TimeoutException:
            # The message now matches the behaviour: the upload is abandoned
            # here, not continued with the next batch.
            print(f"Failed to upload batch {start // BATCH_SIZE_UPLOAD + 1}. Aborting the remaining batches.")
            return False
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return False

    return True

#---End---upload----

In [0]:
#---Start---API----
@app.route("/ask", methods=["POST"])
async def post_ask():
    """Answer a question submitted as form data.

    Expects the form fields ``user_id`` and ``user_query``. A ``file`` part
    may be present in the request but is not used by this handler.

    Returns:
        A JSON object ``{"message": <question>, "answer": <model answer>}``
        with HTTP status 200.
    """
    user_id = request.form['user_id']
    user_query = request.form['user_query']

    answer = await query(user_query, user_id)

    # The status code belongs outside jsonify(); passed inside, it is
    # serialized into the body as the second element of a JSON array.
    return jsonify({
        "message": user_query,
        "answer": answer['response']
    }), 200

@app.route("/query", methods=["POST"])
async def post_answer():
    """Answer a question sent as JSON (``user_query`` and ``user_id`` keys).

    Responds with the question, the model answer, the retrieved context and
    the prompt that was used, with HTTP status 200.
    """
    payload = request.get_json()
    user_query, user_id = payload.get("user_query"), payload.get("user_id")

    answer = await query(user_query, user_id)

    body = {
        "message": user_query,
        "answer": answer['response'],
        "context": answer['context'],
        "template": answer['template'],
    }
    return jsonify(body), 200

@app.route("/upload", methods=["POST"])
async def post_upload():
    """Store and index an uploaded PDF for a user.

    Expects the form field ``user_id`` and a ``file`` part. If a file with
    the same name is already stored for that user, the stored copy is sent
    back as an attachment. Otherwise the upload is saved and indexed.

    Returns:
        The stored file, or a JSON message describing success or failure.
        Both messages are sent with HTTP status 200, as before.
    """
    user_id = request.form.get('user_id')
    uploaded = request.files.get('file')

    # `is not None` rather than truthiness: presence of the part is what the
    # original `'file' in request.files` check tested.
    if uploaded is not None and user_id:
        existing_path = check_file_existed(user_id, uploaded.filename)

        if existing_path:
            # send_file opens the path itself. Pass an absolute path so it is
            # resolved against the same working directory used for the
            # existence check, and expose only the bare file name to the client.
            return send_file(
                os.path.abspath(existing_path),
                as_attachment=True,
                download_name=os.path.basename(existing_path),
            )

        file_path = await save_pdf(uploaded, user_id)
        # NOTE(review): if indexing fails the saved file stays on disk, so a
        # retry with the same name takes the "already stored" branch above.
        if await upload_pdf(file_path, user_id):
            return {
                'response': 'Tải lên thành công!'
            }, 200

    return {
        'response': 'Tải lên file không thành công, vui lòng kiểm tra lại file pdf của bạn!'
    }, 200

@app.route("/llms", methods=["POST"])
async def post_llms():
    """Forward the JSON field ``template`` to the LLM and return its answer."""
    payload = request.get_json()
    llm_answer = await llms_process_template(payload.get("template"))
    return jsonify({"response": llm_answer}), 200
    
@app.route("/test", methods=["POST"])
async def post_test():
    """Echo back the ``template`` field of the JSON request body."""
    return request.get_json().get("template")
    
@app.route("/test", methods=["GET"])
async def get_test():
    """Liveness check: always answers with a fixed greeting."""
    greeting = "This is a hello world page"
    return greeting
    

In [0]:
# Start the Flask development server. app.run() blocks until the server stops
# and returns None, so it must not be wrapped in asyncio.run(): that call
# would raise once the server shuts down because None is not a coroutine.
# host="0.0.0.0" makes the server listen on all network interfaces.
app.run(debug=False, host="0.0.0.0", port=7733, use_reloader=False)

java.net.SocketException: Connection reset
	at java.net.SocketInputStream.read(SocketInputStream.java:210)
	at java.net.SocketInputStream.read(SocketInputStream.java:141)
	at java.io.BufferedInputStream.fill(BufferedInputStream.java:246)
	at java.io.BufferedInputStream.read1(BufferedInputStream.java:286)
	at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
	at org.apache.thrift.transport.TIOStreamTransport.read(TIOStreamTransport.java:127)
	at org.apache.thrift.transport.TTransport.readAll(TTransport.java:86)
	at org.apache.thrift.protocol.TBinaryProtocol.readAll(TBinaryProtocol.java:429)
	at org.apache.thrift.protocol.TBinaryProtocol.readI32(TBinaryProtocol.java:318)
	at org.apache.thrift.protocol.TBinaryProtocol.readMessageBegin(TBinaryProtocol.java:219)
	at org.apache.thrift.TServiceClient.receiveBase(TServiceClient.java:69)
	at org.apache.zeppelin.interpreter.thrift.RemoteInterpreterService$Client.recv_interpret(RemoteInterpreterService.java:274)
	at org.apache.zeppel