In [1]:
import random
import gradio as gr
import time
from pypdf import PdfReader
import google.generativeai as genai

from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction



In [11]:
def helper_read_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    pdf_texts = [p.extract_text().strip() for p in reader.pages]  # length of pdf_texts is the number of pages of upload pdf.
    pdf_texts = [text for text in pdf_texts if text]
    
    print (f"the pdf text length: {len(pdf_texts)}, '\n', the content of pdf_texts[0]: {pdf_texts[0][:100]}")
    time.sleep(1)

    # text splitter
    character_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ". ", " ", ""],
        chunk_size=1000,
        chunk_overlap=0
    )
    character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))
    print(f"Character split into {len(character_split_texts)} chunks.")

    token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
    token_split_texts = [token_split for text in character_split_texts for token_split in token_splitter.split_text(text)]
    print(f"Token split into {len(token_split_texts)} chunks.")

    print('the following are the chunks from this upload pdf: ')
    for i in range(len(token_split_texts)):
        print(token_split_texts[i])
        print('\n')
        
    return token_split_texts


In [18]:

def augment_example_answer(query):
    messages = [
        {
            "role": "model",
            "parts": "I have gived you a query about resume pdf files. Provide an example answer to the given question, that might be found in a document like a resume. "
        },
        {"role": "user", "parts": query}
    ] 

    response = model.generate_content(messages)
    contents = response.text
    # print('contents from augment_example_answer: \n ', contents)
    
    # response = openai_client.chat.completions.create(
    #     model=model,
    #     messages=messages,
    # )
    # contents = response.choices[0].message.content
    return contents

In [19]:

def augment_multiple_query(query):
    messages = [
        {
            "role": "model",
            "parts": "You are an experienced recruiter. Your users are asking questions about resumes. "
            "Suggest up to five additional related questions to help them find the information they need, for the provided question. "
            "Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic."
            "Make sure they are complete questions, and that they are related to the original question."
            "Output one question per line. Do not number the questions."
        },
        {"role": "user", "parts": query}
    ]

    response = model.generate_content(messages)
    contents = response.text
    # print('contents from augment_multiple_query: ', contents)
    contents = contents.split("\n") # return list
    
    # response = openai_client.chat.completions.create(
    #     model=model,
    #     messages=messages,
    # )
    # content = response.choices[0].message.content
    # contents = content.split("\n")
    return contents
    

In [20]:
def count_files(message, history):
    
    text_message = message['text']  # get text from message
    file_paths = message['files']
    num_files = len(file_paths)

    yield f"You uploaded {num_files} files, the message is: {message}."
    time.sleep(1)
    if not chroma_collection.get()['ids'] and not message['files']:  # 没有上传file，并且chroma没有存储之前上传的pdf
        yield f'there are no data in chroma collection. \n {chroma_collection.get()}, and no upload pdf!'
        return

    print('the ids of chroma_collection: ', chroma_collection.get()['ids'])

    # read the first file
    # if upload file from message:
    if message['files']:
        pdf_path = file_paths[0]

        token_split_texts = helper_read_pdf(pdf_path)
        
        # reader = PdfReader(pdf_path)
        # pdf_texts = [p.extract_text().strip() for p in
        #              reader.pages]  # length of pdf_texts is the number of pages of upload pdf.
        # pdf_texts = [text for text in pdf_texts if text]
        # yield f"the pdf text length: {len(pdf_texts)}, '\n', the content of pdf_texts[0]: {pdf_texts[0][:100]}"
        # time.sleep(1)

        # # text splitter
        # character_splitter = RecursiveCharacterTextSplitter(
        #     separators=["\n\n", "\n", ". ", " ", ""],
        #     chunk_size=1000,
        #     chunk_overlap=0
        # )
        # character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))
        # print(f"Character split into {len(character_split_texts)} chunks.")

        # token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
        # token_split_texts = [token_split for text in character_split_texts for token_split in
        #                      token_splitter.split_text(text)]
        # print(f"Token split into {len(token_split_texts)} chunks.")

        max_id = max(map(int, chroma_collection.get()['ids']), default=0)
        ids = [str(i) for i in range(max_id + 1, max_id + 1 + len(token_split_texts))]
        chroma_collection.add(ids=ids, documents=token_split_texts)
        print(f"Added {len(ids)} chunks to Chroma collection.")

    ids_str = str(chroma_collection.get()['ids']) # get all ids from collection

    original_query = text_message

    # method 1: get hypothetical answer from the original query
    hypothetical_answer = augment_example_answer(original_query) # get example answer
    print('\n hypothetical answer: ', hypothetical_answer)
    
    joint_query = f"{original_query} {hypothetical_answer}"
    ##

    # method 2: get multi-querys from original query
    augmented_queries = augment_multiple_query(original_query)
    print('\n multi_querys: ', augmented_queries)

    joint_query = [original_query] + augmented_queries
    ##
    
    results = chroma_collection.query(query_texts=joint_query, n_results=5)
    # print(results)
    
    # results = chroma_collection.query(query_texts=[query], n_results=5)
    retrieved_documents = results['documents'][0]

    information = "\n\n".join(retrieved_documents)
    # information = pdf_texts

    prompt = [
        {
            "role": "model",
            "parts": "answer the question based on the information."
        },
        {"role": "user", "parts": f"Question: {original_query}. \n Information: {information}"}
    ]
    response = model.generate_content(prompt)
    contents = response.text
    yield f"the ids_str: {ids_str},\n the original_query is: {original_query}, \n the information is: {information}, \n\n the contents: {contents}"
    # time.sleep(3)
    # yield contents


In [21]:
def create_collections(coll_name):
    embedding_function = SentenceTransformerEmbeddingFunction()
    chroma_client = chromadb.Client()
    
    collections = chroma_client.list_collections()
    collection_names = [collection.name for collection in collections]

    # if coll_name not exist in the collection_names
    if coll_name not in collection_names:
        chroma_coll = chroma_client.create_collection(coll_name, embedding_function=embedding_function)
        print(f"Collection {coll_name} created.")
    else: # already exists this collection
        chroma_coll = chroma_client.get_collection(coll_name)
        print(f"Collection {coll_name} already exists.")

    return chroma_coll

In [22]:
if __name__ == "__main__":
    Google_API = "AIzaSyBvTMTgsk9jF5oCbdMBiEUfPEl5F2dZM68"
    genai.configure(api_key=Google_API)

    # chroma
    collection_name = "test1"
    chroma_collection = create_collections(collection_name)

    model = genai.GenerativeModel(model_name="gemini-pro")

    demo = gr.ChatInterface(fn=count_files, examples=[{"text": "Hello", "files": []}], title="RAG Chat Bot",
                            multimodal=True)

    demo.launch()

Collection test1 already exists.
Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.


the ids of chroma_collection:  ['1', '2', '3', '4', '5', '6', '7']

 hypothetical answer:  **Example Answer:**

**Objective:**

Seeking a challenging and rewarding position in the field of [field] where I can utilize my [skills] and [experience] to contribute to the success of the organization.

**Skills:**

* [Skill 1]
* [Skill 2]
* [Skill 3]

**Experience:**

**Position**, [Company Name], [Dates]

* [Responsibility 1]
* [Responsibility 2]
* [Responsibility 3]

**Education:**

**Degree**, [University Name], [Dates]

**Certifications:**

* [Certification 1]
* [Certification 2]

**Awards and Recognition:**

* [Award 1]
* [Award 2]

**References:**

Available upon request.

 multi_querys:  ['- What are the most important sections to include on a resume?', '- How long should a resume be?', '- What is the best font to use for a resume?', '- Should I include a photo on my resume?', '- What are some common mistakes to avoid when writing a resume?']
the ids of chroma_collection:  ['1', '2', '



Token split into 5 chunks.
the following are the chunks from this upload pdf: 
kangyi qiu / [UNK] @ gatech. edu [UNK] - 436 - 9016 / linkedinlinkedin. com / kangyi - qiu education georgia institute of technology, college of computing aug 2022 - dec 2024 master of computer science, school of computer science atlanta, ga peking university, school of electronics engineering and computer science sep 2018 - july 2022 bachelor of science, major in machine intelligence beijing, china relevant coursework : computational science & engineering algorithms, artiﬁcial intelligence, data & visual analytics, introduction to computer systems, the brain and cognitive science, computer vision and deep learning. work experience sony china co., ltd. research center jul. 2023 – nov. 2023 computer vision intern beijing, china • trained person re - identiﬁcation ( reid ) model based on resnet using the momentum contrast ( moco ) method with the pass and imagenet datasets, leveraging python and tensorflow 2. 