In [9]:
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from fastapi.middleware.cors import CORSMiddleware
from pyngrok import ngrok
import nest_asyncio
import uvicorn
from unsloth import FastLanguageModel
import torch
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate, PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
import langchain
from operator import itemgetter
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.prompts import ChatMessagePromptTemplate, PromptTemplate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from langchain.embeddings import HuggingFaceBgeEmbeddings

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [10]:
# --- HTTP application ---------------------------------------------------------
app = FastAPI()
received_prompt = ""  # module-level placeholder; retained so existing references keep working

# Browser origins allowed to call the API: a comma-separated list taken from the
# CORS_ALLOWED_ORIGINS environment variable, defaulting to every origin ("*").
cors_allowed_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOWED_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=cors_allowed_origins,
    # Credentialed requests must not be combined with a wildcard origin, so
    # credentials are only enabled when explicit origins have been configured.
    allow_credentials="*" not in cors_allowed_origins,
    allow_methods=['*'],
    allow_headers=['*'],
)

# --- Local model ----------------------------------------------------------------
# Model configuration
max_seq_length = 2048  # maximum sequence length passed to the loader
dtype = None           # no explicit dtype: the choice is left to the loader
load_in_4bit = True    # request 4-bit loading of the weights

# Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Innovent/trained_model",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Switch the loaded model to Unsloth's inference mode before serving requests.
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: NVIDIA RTX A4500. Max memory: 19.696 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [11]:
# Prompt format sent to the locally loaded model by the /generate handler.
# It has two positional `{}` slots: the first receives the user's question and
# the second is filled with "" so that the model writes the response itself.
# NOTE(review): presumably this mirrors the format the model was fine-tuned on,
# in which case the wording must not be edited — confirm before changing it.
prompt_template = """Below is a prompt that describes any question a user has or a problem being faced by the user. Write a response that appropriately helps the user answer his question or give the steps to troubleshoot his problem.

### userPrompt:
{}

### Response:
{}"""

In [12]:
# The Gemini API key is read from the environment so that it is never stored in
# the notebook (or in its saved outputs / version control history).
# SECURITY: any key that was previously hard-coded in this cell must be treated
# as compromised and revoked in the Google Cloud / AI Studio console.
google_api_key = os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before creating the Gemini client."
    )

# Gemini client shared by the query-scoring, query-rewriting and RAG chains.
gemini = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    verbose=True,
    temperature=0,
    google_api_key=google_api_key,
)

I0000 00:00:1722337611.881950 1218290 config.cc:230] gRPC experiments enabled: call_status_override_on_cancellation, event_engine_dns, event_engine_listener, http2_stats_fix, monitoring_experiment, pick_first_new, trace_record_callops, work_serializer_clears_time_cache


In [13]:
def extract_score(response):
  """Extract an integer score from an LLM reply.

  Args:
    response: Text returned by the model (may be empty or None).

  Returns:
    The score as an int, or None when the reply contains no number.

  A line that consists solely of a number wins (the format the scoring prompt
  asks for). If no such line exists, the first run of digits anywhere in the
  reply is used, so answers such as "Score: 8/10" are still understood.
  """
  if not response:
    return None

  # Preferred form: a line holding nothing but the number.
  # `isdecimal` is used instead of `isdigit` because the latter also accepts
  # characters such as "²" that `int()` cannot parse.
  for line in response.split("\n"):
    candidate = line.strip()
    if candidate.isdecimal():
      return int(candidate)

  # Fallback: the first contiguous run of decimal digits in the reply.
  digits = ""
  for char in response:
    if char.isdecimal():
      digits += char
    elif digits:
      break
  return int(digits) if digits else None

In [None]:
# Chunking parameters used when splitting the PDF manuals.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 500  # half of CHUNK_SIZE

# Shared splitter instance used by `load_pdf`.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
def load_pdf(path):
  """Load the PDF at `path` and return it split into chunks.

  The documents produced by `PyPDFLoader` are passed through the module-level
  `text_splitter`; the resulting list of chunk documents is returned.
  """
  pages = PyPDFLoader(path).load()
  return text_splitter.split_documents(pages)

In [None]:
def generate_queries(original_query):
  """Ask Gemini for several search queries derived from `original_query`.

  Args:
    original_query: The user's troubleshooting question.

  Returns:
    A list of non-empty, whitespace-stripped query strings (one per line of the
    model's reply). The list may be empty if the model returns no text.
  """
  system_message = SystemMessagePromptTemplate(
    prompt=PromptTemplate(
      input_variables=[],
      template='You are a helpful assistant that generates multiple search queries based on a single input query which is related to troubleshooting a car.',
    )
  )
  # The placeholder name must match the declared input variable; the original
  # template referenced `{question}` while declaring `original_query`.
  human_message = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
      input_variables=['original_query'],
      template='Generate multiple search queries related to: {original_query} \n OUTPUT (4 queries):',
    )
  )
  prompt = ChatPromptTemplate(
    input_variables=['original_query'],
    messages=[system_message, human_message],
  )

  def split_lines(text):
    # Blank lines in the reply would otherwise become empty search queries.
    return [line.strip() for line in text.split("\n") if line.strip()]

  query_chain = prompt | gemini | StrOutputParser() | split_lines
  return query_chain.invoke({"original_query": original_query})

def vector_search(query, all_documents):
    """Rank documents against `query` using TF-IDF cosine similarity.

    Args:
        query: The search string.
        all_documents: Mapping of document name -> document text.

    Returns:
        A dict of document name -> similarity score (a plain float rounded to
        two decimals), ordered from most to least similar. An empty mapping of
        documents yields an empty dict.
    """
    # Without any document there is nothing to compare against, and
    # `cosine_similarity` rejects an empty right-hand matrix.
    if not all_documents:
        return {}

    doc_names = list(all_documents.keys())
    documents = list(all_documents.values())

    # Row 0 of the matrix is the query; the remaining rows are the documents.
    tfidf_matrix = TfidfVectorizer().fit_transform([query] + documents)
    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()

    # Convert numpy scalars to plain floats so the scores print and serialise cleanly.
    score_dict = {name: round(float(score), 2) for name, score in zip(doc_names, scores)}
    return dict(sorted(score_dict.items(), key=lambda item: item[1], reverse=True))


def reciprocal_rank_fusion_docs(search_results_dict, k=60):
    """Fuse several per-query rankings into one ranking.

    Args:
        search_results_dict: Mapping of query -> {document name: score}.
        k: Constant added to the zero-based rank before taking the reciprocal.

    Returns:
        A dict of document name -> fused score, ordered from highest to lowest.
        Each document receives ``1 / (rank + k)`` from every query in which it
        appears, where ``rank`` is its zero-based position when that query's
        results are sorted by score in descending order.

    Progress is reported on stdout.
    """
    print("Initial individual search result ranks:")
    for query_text in search_results_dict:
        print(f"For query '{query_text}': {search_results_dict[query_text]}")

    fused = {}
    for query_text, per_doc_scores in search_results_dict.items():
        ranking = sorted(per_doc_scores.items(), key=lambda pair: pair[1], reverse=True)
        for position, (doc_name, _) in enumerate(ranking):
            before = fused.get(doc_name, 0)
            after = before + 1 / (position + k)
            fused[doc_name] = after
            print(f"Updating score for {doc_name} from {before} to {after} based on rank {position} in query '{query_text}'")

    reranked = dict(sorted(fused.items(), key=lambda pair: pair[1], reverse=True))
    print("Final reranked results:", reranked)
    return reranked


# Embedding model used to index manual chunks in the vector store.
model_name = "BAAI/bge-small-en-v1.5"
# Normalised embeddings are required to compute cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

embedding_function = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device='cuda'),
)

In [None]:
def generate_fusion_two(original_query, selected_car, manuals_root='/content/drive/MyDrive/Innovent/manuals'):
    """Answer `original_query` from the most relevant manual of `selected_car`.

    Steps:
      1. Load and chunk every PDF in ``<manuals_root>/<selected_car>``.
      2. Generate several search queries and rank the manuals for each of them
         with TF-IDF (`vector_search`).
      3. Fuse the rankings (`reciprocal_rank_fusion_docs`) and keep the top manual.
      4. Index that manual's chunks in Chroma, retrieve the 5 closest chunks and
         let Gemini answer the question from them.

    Args:
        original_query: The user's question.
        selected_car: Name of the sub-folder holding that car's manuals.
        manuals_root: Directory containing one sub-folder per car.

    Returns:
        The answer text produced by the chain.

    Raises:
        ValueError: `selected_car` is empty or is not a plain folder name.
        FileNotFoundError: the folder is missing or holds no readable PDF text.
    """
    # `selected_car` comes from the request body: refuse anything that could
    # escape `manuals_root` (path separators, "." or "..").
    if (
        not isinstance(selected_car, str)
        or not selected_car
        or selected_car in ('.', '..')
        or os.path.basename(selected_car) != selected_car
    ):
        raise ValueError(f"Invalid car name: {selected_car!r}")
    folder_path = os.path.join(manuals_root, selected_car)

    normal_docs = {}  # filename -> list of chunk documents (used for retrieval)
    str_docs = {}     # filename -> plain text of the manual (used for TF-IDF ranking)
    # Sorted so that ties in the ranking are resolved the same way on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.pdf'):
            continue
        chunks = load_pdf(os.path.join(folder_path, filename))
        if not chunks:
            # No extractable text: nothing to rank or index for this file.
            continue
        normal_docs[filename] = chunks
        # Join the chunk texts rather than using str(list), whose repr adds
        # metadata and escape sequences that pollute the TF-IDF vocabulary.
        str_docs[filename] = "\n".join(chunk.page_content for chunk in chunks)
    if not normal_docs:
        raise FileNotFoundError(f"No PDF manuals with readable text found in {folder_path}")

    # Fall back to the user's own wording if no usable query was generated.
    queries = [query for query in generate_queries(original_query) if query.strip()] or [original_query]
    all_results = {query: vector_search(query, str_docs) for query in queries}
    reranked_results = reciprocal_rank_fusion_docs(all_results)
    required_doc = next(iter(reranked_results))  # Takes the top ranked doc from all of them
    print("SEARCHING IN: ", required_doc)

    db = Chroma.from_documents(normal_docs[required_doc], embedding_function)
    try:
        # The number of results is configured through `search_kwargs`;
        # a bare `k=` keyword is not how the retriever is sized.
        retriever = db.as_retriever(search_kwargs={"k": 5})
        template = """You have been provided with the context, the user is asking to troubleshoot, you have to use the context to answer the query
    {context}

    Question: {question}
    """
        prompt = ChatPromptTemplate.from_template(template)

        def join_chunks(docs):
            # Give the model the chunk text itself instead of the repr of Document objects.
            return "\n\n".join(doc.page_content for doc in docs)

        chain = (
            {"context": retriever | join_chunks, "question": RunnablePassthrough()}
            | prompt
            | gemini
            | StrOutputParser()
        )
        return chain.invoke(original_query)
    finally:
        # The default in-memory collection is shared between calls; drop it so
        # chunks from this request do not leak into the next one.
        db.delete_collection()


In [14]:
def optimize_query(input_text):
  """Return `input_text`, rewritten by Gemini when its grammar scores 7 or less.

  Gemini is first asked to grade the query from 1 to 10. If a score can be
  extracted and it is at most 7, Gemini is asked for an improved version, which
  is returned; otherwise the original text is returned unchanged. The score
  and any rewritten query are printed.
  """
  scoring_prompt = ChatPromptTemplate(
    input_variables=['query'],
    messages=[
      SystemMessagePromptTemplate(prompt=PromptTemplate(template='You are a helpful assistant that evaluates the grammatical quality of car troubleshooting queries strictly, returning only a numerical score between 1 and 10.')),
      HumanMessagePromptTemplate(prompt=PromptTemplate(template='Evaluate the grammatical quality of this query: {query}\nOnly return the score (1-10):')),
    ],
  )
  scoring_chain = scoring_prompt | gemini | StrOutputParser()
  score = extract_score(scoring_chain.invoke(input_text))
  print(score)

  # Leave the query untouched when it scored well or the score is unknown.
  needs_rewrite = score is not None and score <= 7
  if not needs_rewrite:
    return input_text

  print("Query not good with a score of: ", score)
  rewrite_prompt = ChatPromptTemplate(
    input_variables=['query'],
    messages=[
      SystemMessagePromptTemplate(prompt=PromptTemplate(template='You are a helpful assistant that improves car troubleshooting queries by correcting grammar and making them more clear and elaborate.')),
      HumanMessagePromptTemplate(prompt=PromptTemplate(template='Improve the grammar and clarity of this query: {query}\nOutput only the improved query:')),
    ],
  )
  rewrite_chain = rewrite_prompt | gemini | StrOutputParser()
  improved_query = rewrite_chain.invoke(input_text)
  print(improved_query)
  return improved_query

In [15]:
@app.post('/generate')
async def generate(prompt_data: dict):
    """Answer a car-troubleshooting question.

    Expects a JSON body with the keys ``selected_car`` and ``prompt``.

    Returns:
        A dict with ``generated_text`` (decoded output of the local model),
        ``input_text`` (the possibly rewritten query) and ``rag`` (the answer
        produced from the car's manuals).

    Raises:
        HTTPException(400): a key is missing, the prompt is empty, or
            ``selected_car`` is not a plain folder name.
    """
    # Only the payload lookups are guarded: a KeyError raised deeper in the
    # pipeline is a server fault and must not be reported as a client error.
    try:
        selected_car = prompt_data['selected_car']
        input_text = prompt_data['prompt']
    except KeyError:
        raise HTTPException(status_code=400, detail="Prompt not found in request body")

    # Reject unusable input before spending any LLM calls on it.
    if not isinstance(input_text, str) or not input_text.strip():
        raise HTTPException(status_code=400, detail="Prompt is required")
    # `selected_car` ends up in a filesystem path, so it must be a bare name.
    if (
        not isinstance(selected_car, str)
        or not selected_car.strip()
        or selected_car in ('.', '..')
        or os.path.basename(selected_car) != selected_car
    ):
        raise HTTPException(status_code=400, detail="Invalid selected_car")

    print(selected_car)
    print(input_text)

    # NOTE(review): everything below is blocking work executed inside an
    # `async def` handler, so requests are processed one at a time.
    optimized_query = optimize_query(input_text)
    optimized_query = optimized_query.strip() if optimized_query else ""
    if not optimized_query:
        raise HTTPException(status_code=400, detail="Prompt is required")

    # Make the first letter of the optimized query lowercase
    optimized_query = optimized_query[0].lower() + optimized_query[1:]
    final_prompt = f"In {selected_car}, {optimized_query}"
    rag = generate_fusion_two(optimized_query, selected_car)
    inputs = tokenizer(
        [
            prompt_template.format(
                final_prompt,  # instruction
                "",  # output - leave this blank for generation!
            )
        ], return_tensors="pt").to("cuda")

    outputs = model.generate(**inputs, max_new_tokens=200, use_cache=True)
    generated_text = tokenizer.batch_decode(outputs)

    return {"generated_text": generated_text[0], "input_text": optimized_query, "rag": rag}

# Get your authtoken from https://dashboard.ngrok.com/get-started/your-authtoken
# and expose it through the NGROK_AUTHTOKEN environment variable; it must never
# be stored in the notebook.
# SECURITY: any token that was previously hard-coded in this cell must be
# treated as compromised and reset in the ngrok dashboard.
auth_token = os.environ.get("NGROK_AUTHTOKEN")
if not auth_token:
    raise RuntimeError(
        "Set the NGROK_AUTHTOKEN environment variable before starting the tunnel."
    )

# Set the authtoken
ngrok.set_auth_token(auth_token)

# Local port served by uvicorn and forwarded by the tunnel.
SERVER_PORT = 8000
# Reserved ngrok domain; override with NGROK_DOMAIN when deploying elsewhere.
ngrok_domain = os.environ.get("NGROK_DOMAIN", "peaceful-personally-tadpole.ngrok-free.app")

ngrok_tunnel = ngrok.connect(SERVER_PORT, domain=ngrok_domain)
print('Public URL:', ngrok_tunnel.public_url)
# Allow uvicorn to start its event loop inside the notebook's running loop.
nest_asyncio.apply()
try:
    uvicorn.run(app, port=SERVER_PORT)
finally:
    # Close the tunnel when the server stops so the cell can be re-run cleanly.
    ngrok.disconnect(ngrok_tunnel.public_url)

                                                                                                    

I0000 00:00:1722337625.950196 1218290 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722337625.951041 1218290 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
I0000 00:00:1722337626.003197 1218290 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722337626.004033 1218290 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
INFO:     Started server process [1218290]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Public URL: https://peaceful-personally-tadpole.ngrok-free.app
Tata Punch
how activate wind shield wiper fluid
6
Query not good with a score of:  6
How do I activate the windshield wiper fluid in my car? 



I0000 00:00:1722337660.389722 1218290 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722337660.390629 1218290 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
I0000 00:00:1722337661.257867 1218290 work_stealing_thread_pool.cc:320] WorkStealingThreadPoolImpl::PrepareFork
I0000 00:00:1722337661.258622 1218290 fork_posix.cc:77] Other threads are currently calling into gRPC, skipping fork() handlers
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment v

INFO:     103.86.182.226:0 - "POST /generate HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1218290]
