In [None]:
%%capture
!pip install ibm-watsonx-ai
!pip install langchain-ibm
!pip install langchain-community
!pip install langchain
!pip install gradio
!pip install pypdf
!pip install chromadb

In [None]:
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames
from ibm_watsonx_ai import Credentials
from langchain_ibm import WatsonxLLM, WatsonxEmbeddings
from langchain.text_splitter import Language, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.chains import RetrievalQA
import gradio as gr
# Silence every warning emitted while the notebook runs.
import warnings


def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``: accept any arguments, do nothing."""
    return None


# Route all ``warnings.warn`` calls to the no-op above, and additionally
# install a blanket "ignore" filter.
warnings.warn = warn
warnings.filterwarnings('ignore')

# Task 1: Load document using LangChain for different sources

In [None]:
def document_loader(file):
    """Load a PDF and return its content as a list of LangChain documents.

    Args:
        file: Location of the PDF, handed unchanged to ``PyPDFLoader``
            (the notebook passes a URL string).

    Returns:
        Whatever ``PyPDFLoader.load_and_split()`` produces for that file.
    """
    return PyPDFLoader(file).load_and_split()

In [None]:
# Location of the sample PDF used to exercise document_loader.
pdf_url = 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/WgM1DaUn2SYPcCg_It57tA/A-Comprehensive-Review-of-Low-Rank-Adaptation-in-Large-Language-Models-for-Efficient-Parameter-Tuning-1.pdf'
data = document_loader(pdf_url)

# Stitch the text of every loaded document together, separated by single spaces.
text = " ".join(page.page_content for page in data)

# Preview: only the first 1000 characters are displayed as the cell result.
first_1000_char = text[0:1000]
first_1000_char

'A Comprehensive Review of Low-Rank\nAdaptation in Large Language Models for\nEfficient Parameter Tuning\nSeptember 10, 2024\nAbstract\nNatural Language Processing (NLP) often involves pre-training large\nmodels on extensive datasets and then adapting them for specific tasks\nthrough fine-tuning. However, as these models grow larger, like GPT-3\nwith 175 billion parameters, fully fine-tuning them becomes computa-\ntionally expensive. We propose a novel method called LoRA (Low-Rank\nAdaptation) that significantly reduces the overhead by freezing the orig-\ninal model weights and only training small rank decomposition matrices.\nThis leads to up to 10,000 times fewer trainable parameters and reduces\nGPU memory usage by three times. LoRA not only maintains but some-\ntimes surpasses fine-tuning performance on models like RoBERTa, De-\nBERTa, GPT-2, and GPT-3. Unlike other methods, LoRA introduces\nno extra latency during inference, making it more efficient for practical\napplications. Al

# Task 2: Apply text splitting techniques

In [None]:
def text_splitter(data):
    """Split a LaTeX source string into small, slightly overlapping chunks.

    Args:
        data: The text to split (a single string).

    Returns:
        The list of string chunks produced by a LaTeX-aware
        ``RecursiveCharacterTextSplitter`` configured for at most 60
        characters per chunk with a 5-character overlap.
    """
    splitter_options = {
        "language": Language.LATEX,
        "chunk_size": 60,
        "chunk_overlap": 5,
        "length_function": len,  # sizes are measured in characters
    }
    latex_splitter = RecursiveCharacterTextSplitter.from_language(**splitter_options)
    return latex_splitter.split_text(data)

In [None]:
# Sample LaTeX document used to exercise the splitter.
# This must be a raw string: in a normal string literal "\b" (as in
# "\begin") is the backspace escape, which silently corrupts the markup,
# and "\d", "\m", "\s", "\e" are invalid escape sequences.
latex_text = r"""

    \documentclass{article}

    \begin{document}

    \maketitle

    \section{Introduction}

    Large language models (LLMs) are a type of machine learning model that can be trained on vast amounts of text data to generate human-like language. In recent years, LLMs have made significant advances in various natural language processing tasks, including language translation, text generation, and sentiment analysis.

    \subsection{History of LLMs}

The earliest LLMs were developed in the 1980s and 1990s, but they were limited by the amount of data that could be processed and the computational power available at the time. In the past decade, however, advances in hardware and software have made it possible to train LLMs on massive datasets, leading to significant improvements in performance.

\subsection{Applications of LLMs}

LLMs have many applications in the industry, including chatbots, content creation, and virtual assistants. They can also be used in academia for research in linguistics, psychology, and computational linguistics.

\end{document}

"""

# Run the LaTeX-aware splitter on the sample document and display the chunks.
chunks = text_splitter(latex_text)
chunks

['\\documentclass{article}\n\n    \x08egin{document}',
 '\\maketitle\n\n    \\section{Introduction}\n\n    Large',
 'language models (LLMs) are a type of machine learning model',
 'that can be trained on vast amounts of text data to',
 'to generate human-like language. In recent years, LLMs have',
 'have made significant advances in various natural language',
 'processing tasks, including language translation, text',
 'text generation, and sentiment analysis.',
 '\\subsection{History of LLMs}\n\nThe earliest LLMs were',
 'were developed in the 1980s and 1990s, but they were',
 'were limited by the amount of data that could be processed',
 'and the computational power available at the time. In the',
 'the past decade, however, advances in hardware and software',
 'have made it possible to train LLMs on massive datasets,',
 'leading to significant improvements in performance.',
 '\\subsection{Applications of LLMs}\n\nLLMs have many',
 'many applications in the industry, including chatbot

# Task 3: Embed documents

In [None]:
iam_token = "your_iam_token_here"  # Replace with the IAM token you just obtained

def watsonx_embedding(truncate_input_tokens=512):
    """Build the watsonx.ai embedding client used throughout the notebook.

    Args:
        truncate_input_tokens: Value sent as ``TRUNCATE_INPUT_TOKENS``, i.e.
            the cap on how many tokens of each input text are embedded.
            The previous hard-coded value of 3 meant only the first three
            tokens of every text influenced its vector, which makes
            similarity search close to meaningless. The default of 512 is
            intended to match the slate-125m model's input limit
            (TODO confirm against the model card).

    Returns:
        A ``WatsonxEmbeddings`` instance for ``ibm/slate-125m-english-rtrvr``.
    """
    embed_params = {
        EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: truncate_input_tokens,
        EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
    }

    # NOTE(review): no credentials are passed explicitly here (the module-level
    # `iam_token` is not used by this function); authentication presumably
    # comes from the runtime environment -- confirm for your deployment.
    embedding_model = WatsonxEmbeddings(
        model_id="ibm/slate-125m-english-rtrvr",
        url="https://us-south.ml.cloud.ibm.com",
        project_id="skills-network",
        params=embed_params,
    )

    return embedding_model

In [None]:
# Smoke-test the embedding helper on a single short sentence.
query = "How are you?"
embed = watsonx_embedding()

# embed_documents takes a list of texts, so the query is wrapped in a one-element list.
embedding_vec = embed.embed_documents([query])
# Display only the first five components of the first (and only) vector.
embedding_vec[0][:5]

[-0.06722455, -0.023730014, 0.017487874, -0.013195301, -0.03958462]

# Task 4: Create and configure vector databases to store embeddings

In [None]:
# Download the sample policy document; it is saved as new-Policies.txt in the working directory.
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt'

--2025-02-01 12:24:01--  https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/Ec5f3KYU1CpbKRp1whFLZw/new-Policies.txt
Resolving cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)... 169.63.118.104
Connecting to cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud (cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud)|169.63.118.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6363 (6.2K) [text/plain]
Saving to: ‘new-Policies.txt’


2025-02-01 12:24:01 (679 MB/s) - ‘new-Policies.txt’ saved [6363/6363]



In [None]:
# Load the downloaded policy file through LangChain's TextLoader.
loader = TextLoader('new-Policies.txt')
# NOTE: this rebinds `text`, which an earlier cell used for the joined PDF string.
text = loader.load()

In [None]:
# NOTE(review): the assignments to `text_splitter` and `watsonx_embedding`
# below rebind names that earlier cells define as functions, so re-running
# those earlier call cells after this one will no longer call the functions.
# The names are kept because the retriever cell that follows reads
# `watsonx_embedding` as an embeddings object.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=10,
    length_function=len
)

chunks = text_splitter.split_documents(text)

embed_params = {
    # Embed the whole chunk. The previous value of 3 kept only the first three
    # tokens of each text, so search results were matched on little more than
    # the opening words. 512 is intended to match the slate-125m model's input
    # limit (TODO confirm against the model card).
    EmbedTextParamsMetaNames.TRUNCATE_INPUT_TOKENS: 512,
    EmbedTextParamsMetaNames.RETURN_OPTIONS: {"input_text": True},
}

# Initialize WatsonxEmbeddings (no credentials are passed explicitly here).
watsonx_embedding = WatsonxEmbeddings(
    model_id="ibm/slate-125m-english-rtrvr",
    url="https://us-south.ml.cloud.ibm.com",
    project_id="skills-network",
    params=embed_params,
)

# Stable, position-based ids make this cell idempotent: without them every
# re-run appends the same chunks again to the in-memory collection, and
# similarity search then returns duplicate hits.
chunk_ids = [f"new-policies-{position}" for position in range(len(chunks))]
vectordb = Chroma.from_documents(chunks, watsonx_embedding, ids=chunk_ids)

query = "Smoking policy"
search_result = vectordb.similarity_search(query, k=5)
search_result

[Document(metadata={'source': 'new-Policies.txt'}, page_content='this policy. Regular reviews will ensure it remains relevant with changing technology and security'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='The policy is regularly reviewed to stay current with evolving technology and security best'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='This policy encourages the responsible use of mobile devices in line with legal and ethical')]

# Task 5: Develop a retriever to fetch document segments based on queries

In [None]:
query = "Email policy"
# Reuse the vector store built in the previous cell. Calling
# Chroma.from_documents again here would add every chunk to the same
# in-memory collection a second time, so the retriever would return
# duplicate documents.
retriever = vectordb.as_retriever(search_kwargs={'k': 2})
retriever.invoke(query)

[Document(metadata={'source': 'new-Policies.txt'}, page_content='and email use, including copyright and data protection laws.'),
 Document(metadata={'source': 'new-Policies.txt'}, page_content='and email use, including copyright and data protection laws.')]