In [3]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment, so that
# the later os.getenv("RED_PILL_API_KEY") lookups can find the API key.
load_dotenv()

True

# 1. Indexing

Text embedding

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

# from langchain_community.embeddings.spacy_embeddings import SpacyEmbeddings

# embedder = SpacyEmbeddings(model_name="en_core_web_sm")
# Sentence-embedding model used for indexing the corpus.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Smoke test: embed a handful of short sentences and look at the result shape
# (number of vectors, length of one vector).
sample_sentences = [
    "Hi there!",
    "Oh, hello!",
    "What's your name?",
    "My friends call me World",
    "Hello World!",
]
embeddings = embedder.embed_documents(sample_sentences)
(len(embeddings), len(embeddings[0]))

(5, 768)

Document Loader

In [5]:
import json
import os
from langchain.schema import Document

datapath = '../data/data_info.txt'

# The corpus file holds a JSON list of thesis records. Decode it explicitly as
# UTF-8: without `encoding=` Python falls back to the platform locale, which can
# mis-decode (or fail on) the non-ASCII characters present in the thesis texts.
with open(datapath, "r", encoding="utf-8") as file:
    raw_data = file.read()

corpus = json.loads(raw_data)

# Show which fields each thesis record provides.
corpus[0].keys()

dict_keys(['title', 'abstract', 'keywords', 'year', 'doi', 'authors', 'full text', 'pages', 'content'])

In [6]:
# Wrap every thesis record in a LangChain Document: the thesis text becomes the
# page content, and only title and year are attached as metadata (authors and
# keywords exist in the corpus records but are not included here).
docs = [
    Document(
        page_content=thesis["content"],
        metadata={"title": thesis["title"], "year": thesis["year"]},
    )
    for thesis in corpus
]

# Sanity check: number of documents and the metadata of the first one.
print(len(docs))
print(docs[0].metadata)

100
{'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'year': 2012}


Splitter

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split every thesis into chunks of at most 1500 characters, with 100 characters
# of overlap between neighbouring chunks; each chunk keeps its source metadata.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
splits = text_splitter.split_documents(docs)

# Preview only the first few chunks: displaying the whole list would write every
# chunk of every thesis into the notebook output.
splits[:3]

[Document(metadata={'title': 'A Critical Survey on the use of Fuzzy Sets in Speech and Natural Language Processing', 'year': 2012}, page_content='A Critical Survey on the use of Fuzzy Sets in Speech \nand Natural Language Processing \n \nJoao P. Carvalho \nTULisbon – Instituto Superior \nTécnico \nL2F - INESC -ID \nR. Alves Redol 9, 1000 -029 Lisboa  \njoao.carvalho@inesc -id.pt Fernando  Batista  \nISCTE -IUL – Lisbon University \nInstitute  \nL2F - INESC -ID \nR. Alves Redol 9, 1000- 029 Lisboa  \nfernando .batista@inesc -id.pt Luisa Coheur  \nTULisbon – Instituto Superior \nTécnico \nL2F - INESC -ID \nR. Alves Redol 9, 1000 -029 Lisboa  \nluisa.coheur@inesc -id.pt \n \n \nAbstract  — This paper  shows how the use and applications of \nFuzzy Sets (FS) in S peech and Natural Language Processing \n(SNLP) have  seen a steady decline to a point where  FS are \nvirtually unknown or unappealing for most of the researchers \ncurrently working in the SNLP  field, trie s to find t he reasons 

Vectorstore

In [9]:
from langchain_community.vectorstores import Chroma

# Embed every chunk in `splits` with `embedder` and index the vectors in Chroma.
# NOTE(review): no persist_directory is passed, so the index presumably lives
# only in this kernel session and is rebuilt on every run — confirm that this
# is intended.
vectorstore = Chroma.from_documents(documents=splits, 
                                    embedding=embedder) # HuggingFace all-mpnet-base-v2 embedder defined above (not spaCy)

# Retriever with default search settings; a later cell rebuilds it with
# explicit settings (search_type="similarity", k=5).
retriever = vectorstore.as_retriever()

# 2. Retrieval

In [23]:
import datetime
from typing import Literal, Optional, Tuple
from pydantic import BaseModel, Field

class ThesisSearch(BaseModel):
    """Search and summarize over a corpus of thesis about a certain field or knowledge."""

    # NOTE: the class docstring and the Field descriptions below are runtime
    # text — they become part of the schema handed to the LLM when this model is
    # used for structured output — so they are intentionally left verbatim.

    # Free-text search query (required).
    query: str = Field(
        ...,
        description="The user's search query for academic content summarization.",
    )

    # Optional publish-date window: lower bound inclusive, upper bound exclusive.
    min_publish_date: Optional[datetime.date] = Field(
        None,
        description="Earliest publish date filter, inclusive.",
    )
    max_publish_date: Optional[datetime.date] = Field(
        None,
        description="Latest publish date filter, exclusive.",
    )

    def pretty_print(self) -> None:
        """Print every field that has a value, one ``name: value`` line each.

        Fields whose value is ``None`` (optional filters that were not set)
        are skipped.
        """
        model_cls = type(self)
        # Pydantic v2 exposes the declared fields as the class attribute
        # `model_fields`; `__fields__` is the deprecated v1 spelling and is kept
        # only as a fallback for environments still on Pydantic v1.
        declared_fields = getattr(model_cls, "model_fields", None)
        if declared_fields is None:
            declared_fields = model_cls.__fields__

        for field in declared_fields:
            value = getattr(self, field)
            if value is not None:
                print(f"{field}: {value}")


In [34]:
from tools.customllm import RedPillLLM
from langchain_core.prompts import ChatPromptTemplate

# System prompt for the query-analysis step. The trailing backslashes suppress
# the line breaks, so the first three source lines form one line of the prompt.
system = """You are an expert at converting user questions into database queries. \
You have access to a database of corpus containing academic thesis. \
Given a question, return a database query optimized to retrieve the most relevant results.

If there are acronyms or words you are not familiar with, do not try to rephrase them."""
# NOTE: `prompt` and `llm` are rebound here; the Generation section assigns the
# same notebook-level names again for the RAG chain.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# RedPillLLM is the project-local wrapper from tools.customllm; the API key is
# read from the RED_PILL_API_KEY environment variable (None if it is not set).
llm = RedPillLLM(model="gpt-4o", 
                 api_key=os.getenv("RED_PILL_API_KEY"),
                 temperature = 0)
# Ask the model to answer with a ThesisSearch object via function calling.
# NOTE(review): the recorded run of this cell ended in NotImplementedError —
# presumably RedPillLLM does not implement with_structured_output / tool
# binding. Confirm in tools/customllm before relying on `query_analyzer`.
structured_llm = llm.with_structured_output(ThesisSearch, method="function_calling", strict=True)
query_analyzer = prompt | structured_llm

NotImplementedError: 

In [10]:
# Rebuild the retriever with explicit settings: plain similarity search that
# returns the 5 closest chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# NOTE(review): this rebinds `docs`, which earlier held the full list of corpus
# Documents fed to the splitter; re-running the splitter cell after this one
# would split only these retrieved chunks. A distinct name would be safer, but
# the following cell reads `docs`, so the name is kept here.
docs = retriever.invoke("What is Task Decomposition?")


In [11]:
# Print the text of each retrieved chunk, separated by blank lines
# (same output as printing the content and then a lone '\n').
for doc in docs:
    print(doc.page_content, end="\n\n\n")

In this way, the model is trained more evenly for different tasks towards the end of the training
processto reduceinter-taskinterference.Wangetal. [ 138]d e fi n eαas
α(e)=min/parenleftBig
αm,(e−1)αm−α0
M+α0/parenrightBig
,
whereα0andαmdenote initial and maximum values of α. The noise level of the self-supervised
denoising autoencoding task is scheduled similarly, increasing difficulty after a warm-up period.
In both works, temperature αincreases during training which encourages up-sampling of low-
resourcetasksand alleviates overfitting.
3.4 Task Scheduling
Task scheduling determines the order of tasks on which an MTL model is trained. A naive way
is to train all tasks together. Zhang et al. [ 161] take this way to train an MTL model, where data
batches are organized as four-dimensional tensors of size N×M×T×d,w h e r eNdenotes the
number of samples, Mdenotes the number of tasks, Tdenotes sequence length, and drepresents
embedding dimensions. Similarly, Zalmout and Habash [ 156] putla

# 3. Generation

Prompt Template

In [12]:
from langchain.prompts import ChatPromptTemplate

# Prompt for the generation step: the model is told to answer using only the
# retrieved context. `{context}` and `{question}` are filled in by the chain.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

# Display the resulting template object for inspection.
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='Answer the question based only on the following context:\n{context}\n\nQuestion: {question}\n'), additional_kwargs={})])

Custom LLM

In [13]:
from tools.customllm import RedPillLLM

# LLM client used by the RAG chain: gpt-4o through the project-local RedPillLLM
# wrapper, temperature 0, API key taken from the RED_PILL_API_KEY environment
# variable (None when the variable is not set).
llm = RedPillLLM(
    model="gpt-4o",
    api_key=os.environ.get("RED_PILL_API_KEY"),
    temperature=0,
)

RAG Prompt (LangChain Hub)

In [14]:
from langchain import hub
# Pull the shared "rlm/rag-prompt" template from LangChain Hub.
# NOTE(review): the RAG chain further down is built with the local `prompt`,
# not with `prompt_hub_rag` — confirm which template is meant to be used.
prompt_hub_rag = hub.pull("rlm/rag-prompt")

In [15]:
# Display the pulled hub prompt to inspect its template text and input variables.
prompt_hub_rag

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

RAG chains

In [17]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(retrieved_docs):
    """Join the text of the retrieved chunks into one context string.

    Args:
        retrieved_docs: Documents returned by the retriever.

    Returns:
        The ``page_content`` of every document, separated by blank lines.
    """
    return "\n\n".join(doc.page_content for doc in retrieved_docs)


# RAG pipeline: the incoming question is sent to the retriever and, unchanged,
# to the prompt. The retrieved Documents are flattened to plain text first;
# without that step the prompt's {context} slot receives the Python repr of the
# Document list (metadata, escapes and all) instead of readable text.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

answer = rag_chain.invoke("What is Task Decomposition?")

In [18]:
# Print the generated answer as plain text (avoids the quoted string repr).
print(answer)

The provided context does not explicitly define "Task Decomposition." However, based on the context of multi-task learning in natural language processing, task decomposition generally refers to the process of breaking down a complex task into smaller, more manageable sub-tasks. This approach can help in organizing and scheduling tasks more effectively, allowing models to focus on specific aspects of a task at different stages of training. Task decomposition can also facilitate the integration of various tasks within a multi-task learning framework, potentially improving the overall performance by reducing inter-task interference and optimizing task scheduling.
