In [1]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment (python-dotenv),
# so that later os.getenv(...) lookups in this notebook can see them.
load_dotenv()

True

# 1. Indexing

Text embedding

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings

# Alternative embedding backend, currently disabled:
# from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings
# embedder = SpacyEmbeddings(model_name="en_core_web_sm")

# Embedding model shared by the rest of the notebook.
embedder = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# Smoke test: embed a handful of short sentences and report
# (number of vectors, length of the first vector).
sample_texts = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = embedder.embed_documents(sample_texts)
(len(embeddings), len(embeddings[0]))

  from .autonotebook import tqdm as notebook_tqdm


Document Loader

In [9]:
import json
import os
import re
from langchain.schema import Document

# Location of the corpus file. Despite the .txt extension, its contents are parsed as JSON below.
datapath = '../data/data_info.txt'

# Read with an explicit encoding: without it, open() falls back to the platform's
# locale encoding, which can garble or reject non-ASCII text in the JSON payload.
with open(datapath, "r", encoding="utf-8") as file:
    raw_data = file.read()

# The parsed corpus is indexed and iterated as a sequence of dict records.
corpus = json.loads(raw_data)

# Peek at the fields available on the first record.
corpus[0].keys()


dict_keys(['title', 'abstract', 'keywords', 'year', 'doi', 'authors', 'full text', 'pages', 'content'])

In [17]:
# Wrap every corpus record in a LangChain Document, carrying its bibliographic
# fields along as metadata.
#
# NOTE(review): page_content is still a fixed placeholder, so every Document has
# identical text and similarity search cannot tell them apart. The records also
# expose 'abstract', 'full text' and 'content' keys -- TODO: confirm which one
# holds the body text and use it here.
docs = []

for thesis in corpus:
    document = Document(
        page_content="This is the text content of the document.",
        metadata={
            "title": thesis['title'],
            "authors": thesis['authors'],
            "year": thesis['year'],
            "keywords": thesis['keywords'],
        },
    )
    docs.append(document)

# Report the number of documents built. This previously read len(doc), a name that is
# not defined in this cell and only exists if a later cell's loop has already run.
print(len(docs))
print(docs[0].metadata)

100
{'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'authors': ['Joao P. Carvalho', 'Fernando Batista', 'Luisa Coheur'], 'year': 2012, 'keywords': ['Fuzzy Sets', 'Natural Language Processing', 'Speech processing']}


Splitter

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters handed to the splitter.
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split every Document in `docs` into chunks; these are what gets indexed later.
splits = text_splitter.split_documents(docs)

splits

[Document(metadata={'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'authors': ['Joao P. Carvalho', 'Fernando Batista', 'Luisa Coheur'], 'year': 2012, 'keywords': ['Fuzzy Sets', 'Natural Language Processing', 'Speech processing']}, page_content='This is the text content of the document.'),
 Document(metadata={'title': 'A Memory-Based Approach to Anti-Spam Filtering for Mailing Lists', 'authors': ['GEORGIOS SAKKIS', 'ION ANDROUTSOPOULOS', 'GEORGIOS PALIOURAS', 'VANGELIS KARKALETSIS', 'CONSTANTINE D. SPYROPOULOS', 'PANAGIOTIS STAMATOPOULOS'], 'year': 2003, 'keywords': ['text categorization', 'machine learning', 'unsolicited commercial e-mail', 'spam']}, page_content='This is the text content of the document.'),
 Document(metadata={'title': 'A Probabilistic Generative Model for Mining Cybercriminal Networks from Online Social Media', 'authors': ['Raymond Y.K. Lau', 'Yunqing Xia', 'Yunming Ye'], 'year': 2014, 'keywords': ['cybercriminal netw

In [9]:
# Disabled toy dataset: if uncommented, this cell replaces `splits` with five
# hand-written Documents about pets (each tagged with a "source" metadata field)
# instead of the chunks produced from the corpus.

# from langchain_core.documents import Document

# splits = [
#     Document(
#         page_content="Dogs are great companions, known for their loyalty and friendliness.",
#         metadata={"source": "mammal-pets-doc"},
#     ),
#     Document(
#         page_content="Cats are independent pets that often enjoy their own space.",
#         metadata={"source": "mammal-pets-doc"},
#     ),
#     Document(
#         page_content="Goldfish are popular pets for beginners, requiring relatively simple care.",
#         metadata={"source": "fish-pets-doc"},
#     ),
#     Document(
#         page_content="Parrots are intelligent birds capable of mimicking human speech.",
#         metadata={"source": "bird-pets-doc"},
#     ),
#     Document(
#         page_content="Rabbits are social animals that need plenty of space to hop around.",
#         metadata={"source": "mammal-pets-doc"},
#     ),
# ]

Vectorstore

In [10]:
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document


def _flatten_metadata(metadata):
    """Return a copy of ``metadata`` with list/tuple values joined into one string.

    Args:
        metadata: Mapping of metadata field names to values.

    Returns:
        A new dict in which every list or tuple value is replaced by its items
        joined with ", "; all other values are passed through unchanged.
    """
    return {
        key: ", ".join(map(str, value)) if isinstance(value, (list, tuple)) else value
        for key, value in metadata.items()
    }


# The corpus documents carry list-valued metadata ('authors', 'keywords'). The Chroma
# integration only accepts scalar metadata values (str, int, float, bool) and raises
# on lists, so flatten them into strings before indexing. New Document objects are
# built so that `splits` itself is left untouched.
chroma_ready_splits = [
    Document(
        page_content=chunk.page_content,
        metadata=_flatten_metadata(chunk.metadata),
    )
    for chunk in splits
]

# NOTE(review): re-running this cell in the same kernel presumably adds the same
# chunks to the collection again -- confirm, and reset the store if duplicates appear.
vectorstore = Chroma.from_documents(documents=chroma_ready_splits,
                                    embedding=embedder)  # HuggingFace embedder from the embedding cell

retriever = vectorstore.as_retriever()

# 2. Retrieval

In [11]:
# Similarity-search retriever configured with k=5.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Run a sample query; note that this rebinds `docs` to the retrieved documents.
docs = retriever.invoke("What is Task Decomposition?")


In [12]:
# Print each retrieved document's text, followed by two blank lines as a separator.
for doc in docs:
    print(f"{doc.page_content}\n\n")

(2) Model selection: LLM distributes the tasks to expert models, where the request is framed as a multiple-choice question. LLM is presented with a list of models to choose from. Due to the limited context length, task type based filtration is needed.
Instruction:

Given the user request and the call command, the AI assistant helps the user to select a suitable model from a list of models to process the user request. The AI assistant merely outputs the model id of the most appropriate model. The output must be in a strict JSON format: "id": "id", "reason": "your detail reason for the choice". We have a list of models for you to choose from {{ Candidate Models }}. Please select one model from the list.

(3) Task execution: Expert models execute on the specific tasks and log results.
Instruction:

With the input and the inference results, the AI assistant needs to describe the process and results. The previous stages can be formed as - User Input: {{ User Input }}, Task Planning: {{ Task

# 3. Generation

Prompt Template

In [13]:
from langchain.prompts import ChatPromptTemplate

# Prompt that restricts the answer to the supplied context.
# Placeholders: {context} and {question}.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

Custom LLM

In [14]:
from tools.customllm import RedPillLLM

# Project-local LLM wrapper. The API key is read from the RED_PILL_API_KEY
# environment variable rather than being hard-coded in the notebook.
llm = RedPillLLM(
    model="gpt-4o",
    api_key=os.getenv("RED_PILL_API_KEY"),
    temperature=0,
)

RAG prompt (LangChain Hub)

In [16]:
from langchain import hub
# Pull the prompt published as "rlm/rag-prompt" on the LangChain Hub.
# NOTE(review): the chain built later in this notebook uses `prompt`, not
# `prompt_hub_rag` -- confirm which of the two is intended.
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [17]:
# Display the pulled prompt to inspect its template text and input variables.
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

RAG chains

In [20]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(retrieved):
    """Join the text of retrieved documents into a single context string.

    Args:
        retrieved: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in retrieved)


# Pipeline: retrieve -> format context -> fill prompt -> call LLM -> plain string.
# The retriever output is piped through format_docs so that {context} receives the
# documents' text; passing the retriever alone would hand the prompt a list of
# Document objects, which is rendered via its repr.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

rag_chain.invoke("What is Task Decomposition?")

'Task Decomposition is a process used to break down complicated tasks into smaller and simpler steps. It involves using techniques like Chain of Thought (CoT) and Tree of Thoughts to enhance model performance on complex tasks. CoT prompts the model to "think step by step," transforming big tasks into multiple manageable tasks and providing insight into the model\'s thinking process. Tree of Thoughts extends this by exploring multiple reasoning possibilities at each step, creating a tree structure of thoughts. Task decomposition can be achieved through simple prompting, task-specific instructions, or human inputs.'