# RAG with DIY-Search and FLAN-T5

In [4]:
import os

# Point the Hugging Face cache at /run/cache/ before anything is downloaded.
# Do not skip this: otherwise the disk cache runs out of space when the XL model is fetched.
os.environ.update({'HF_HOME': '/run/cache/'})

import json
import requests
import minsearch

In [15]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Download the FAQ documents. A timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() turns an HTTP error page into an explicit exception
# instead of a confusing JSON decoding failure.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
docs = docs_response.json()

In [16]:
# Flatten the per-course payload into one list of records, tagging each record with the
# course it belongs to so it can be filtered on later.
documents = []

for course_dict in docs:
    course_name = course_dict['course']
    for doc in course_dict['documents']:
        # Copy instead of mutating `doc`, so the raw `docs` payload stays exactly as downloaded
        # and re-running this cell never depends on earlier in-place edits.
        documents.append({**doc, 'course': course_name})

In [17]:
# Build the search index over the flattened FAQ records.
# "question", "text" and "section" are registered as text fields; "course" is registered
# as a keyword field (it is the field used for filtering in the search helper).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

# Kept as the last expression so the cell still displays the fitted index.
index.fit(documents)

<minsearch.Index at 0x7fcb26de8c70>

In [18]:
def search(q, course='data-engineering-zoomcamp', num_results=5):
    """Look up the documents in the module-level `index` that best match a query.

    Args:
        q: The user's question, passed to the index as the search query.
        course: Value the `course` field is filtered on. Defaults to the
            data engineering course, which was previously hard-coded.
        num_results: Maximum number of results requested from the index.
            Defaults to 5, the previously hard-coded value.

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    # The `question` field is weighted 3.0 and `section` 0.5 via `boost_dict`.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=q,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [19]:
def build_prompt(question, context):
    """Assemble the LLM prompt from a question and its retrieved context.

    Args:
        question: The user's question.
        context: Either a ready-made context string, or an iterable of retrieved
            documents. Dict documents are rendered from their `section`,
            `question` and `text` keys (missing keys render as empty strings);
            any other item is rendered with `str()`.

    Returns:
        The prompt string, with no source-code indentation leaking into it.
    """
    if isinstance(context, str):
        context_text = context
    else:
        # Render each document as readable text instead of interpolating the
        # Python repr of a list of dicts into the prompt.
        entries = []
        for doc in context:
            if isinstance(doc, dict):
                entries.append(
                    f"section: {doc.get('section', '')}\n"
                    f"question: {doc.get('question', '')}\n"
                    f"answer: {doc.get('text', '')}"
                )
            else:
                entries.append(str(doc))
        context_text = "\n\n".join(entries)

    # Built from separate string pieces so the function body's indentation is not
    # embedded in every line of the prompt, as it was with an indented triple-quoted string.
    prompt_template = (
        "You are a chatbot answering frequently asked questions for an online course.\n"
        "Provide ANSWER to the QUESTION based on the CONTEXT given below.\n"
        "Only state the facts from the CONTEXT else respond that information is not available.\n"
        "\n"
        "QUESTION = {question}\n"
        "CONTEXT = {context}\n"
        "ANSWER\n"
    )
    return prompt_template.format(question=question, context=context_text)

# Google's FLAN-T5-XL

In [1]:
# Show free disk space per mount, to check there is room before the model is downloaded.
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         100G   50G   51G  50% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
/dev/nvme0n1p1  100G   50G   51G  50% /run
tmpfs            14G     0   14G   0% /dev/shm
/dev/nvme2n1    9.8G  1.9G  7.9G  20% /home/jovyan
tmpfs            14G  120K   14G   1% /home/jovyan/.saturn
tmpfs            14G   12K   14G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           7.7G   12K  7.7G   1% /proc/driver/nvidia
tmpfs           7.7G  8.9M  7.7G   1% /run/nvidia-persistenced/socket
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


In [3]:
# pip install accelerate
# NOTE(review): presumably `accelerate` is what device_map="auto" relies on — confirm.
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different model ids by accident.
MODEL_NAME = "google/flan-t5-xl"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map="auto")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [1]:
def llm(prompt):
    """Generate a completion for `prompt` with the module-level tokenizer and model.

    This is the minimal version: it uses the model's default generation settings
    and decodes the first output sequence without stripping special tokens.
    A later cell in this notebook redefines `llm` with tunable generation
    parameters; whichever definition ran last is the one in effect.

    Args:
        prompt: Text to feed to the model.

    Returns:
        The decoded first generated sequence, special tokens included.
    """
    # Send the inputs to the device the model actually lives on instead of assuming "cuda";
    # the model is loaded with device_map="auto", so it is not guaranteed to be on a GPU.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0])

In [25]:
def llm(prompt, generate_params=None):
    """Generate a completion for `prompt` with tunable generation settings.

    Args:
        prompt: Text to feed to the model.
        generate_params: Optional dict of generation settings. Recognised keys
            and their defaults: `max_length` (100), `num_beams` (5),
            `do_sample` (False), `temperature` (1.0), `top_k` (50),
            `top_p` (0.95). Other keys are ignored.

    Returns:
        The decoded first generated sequence with special tokens stripped.
    """
    params = generate_params or {}
    do_sample = params.get("do_sample", False)

    generate_kwargs = {
        "max_length": params.get("max_length", 100),
        "num_beams": params.get("num_beams", 5),
        "do_sample": do_sample,
    }
    if do_sample:
        # temperature / top_k / top_p only matter when sampling, so they are passed
        # only in that mode; with do_sample=False the generation config may warn
        # about unused sampling flags.
        generate_kwargs.update(
            temperature=params.get("temperature", 1.0),
            top_k=params.get("top_k", 50),
            top_p=params.get("top_p", 0.95),
        )

    # Send the inputs to the device the model actually lives on instead of assuming "cuda";
    # the model is loaded with device_map="auto", so it is not guaranteed to be on a GPU.
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    outputs = model.generate(input_ids, **generate_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [2]:
def rag_response(q):
    """Answer `q` end to end: retrieve context, build the prompt, query the LLM.

    Args:
        q: The user's question.

    Returns:
        The value returned by `llm` for the prompt built from the search results.
    """
    # search -> build_prompt -> llm, composed inside-out.
    return llm(build_prompt(q, search(q)))

In [3]:
# Sample FAQ question used to exercise the whole pipeline.
# This cell needs the earlier cells to have run in the current kernel: `rag_response`
# calls `search`, `build_prompt` and `llm`. The recorded NameError for `search` is what
# happens when they have not been run; run the notebook top to bottom first.
query = "I just discovered the course, can i still join?"
rag_response(query)

NameError: name 'search' is not defined