In [3]:
# command-line utility is used for monitoring and managing NVIDIA GPU devices. This tool provides detailed information about the GPU, such as
# utilization, memory usage, temperature, and power consumption.
!nvidia-smi

Fri Jul 19 18:20:34 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:00:1E.0 Off |                    0 |
| N/A   30C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

### Libraries installed
!pip install -U transformers accelerate bitsandbytes sentencepiece
1. **bitsandbytes** - a lightweight and efficient library primarily designed for 8-bit optimizers and quantization - provides optimizers that use 8-bit arithmetic, significantly reducing memory consumption - useful for reducing memory usage and speeding up training and inference in deep learning models, without significant loss of accuracy.

2. **accelerate** - library by Hugging Face that provides a simple and efficient interface for distributed training and mixed precision in PyTorch - abstracts the complexities of multi-GPU or multi-node training and mixed precision training, making it easier to implement and run

3. **sentencepiece** - an unsupervised text tokenizer and detokenizer mainly used for natural language processing (NLP) tasks. Developed by Google, it is designed to handle text tokenization in a language-agnostic way, which is crucial for training NLP models on various languages and text corpora

In [2]:
# -f flag stands for "force." When used, it forces the deletion of the file. This means:
# 1. if the file is write-protected, it will be removed without prompting for confirmation.
# 2. if the file minsearch.py does not exist, no error message will be displayed.
!rm -f minsearch.py
# obtain the minsearch.py file - this would be used to search the and create the context to be used in llm prompt
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-19 18:19:21--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-19 18:19:21 (53.9 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [4]:
import requests 
import minsearch

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x7f7c79a3c1f0>

In [5]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [19]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
    outputs = model.generate(input_ids,max_length=100)
    result = tokenizer.decode(outputs[0])
    return result

In [7]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [8]:
# To display information about disk space usage in a human-readable format
# !df -h
# Specify the location where the model needs to be downloaded - by default, it is stored in the .cache folder in the home directory
import os
os.environ['HF_HOME'] = '/run/cache'

In [9]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# T5Tokenizer is a tokenizer specifically designed for the T5 model - performs tokenization, encoding (converts tokens into input IDs that can be fed into the model) 
# and decoding (converts model output IDs back into human-readable text).
# T5ForConditionalGeneration is a model class for the T5 model architecture used for tasks that involve generating text conditioned on some input (tasks like 
# translation, summarization, and text generation based on the input).

In [10]:
# We would use google/flan-t5-xl model from the hugging face library
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
# When device_map="auto" is used, the Transformers library will automatically determine the best way to distribute the model components across the available devices. 
# This helps in efficiently using the available hardware resources.
# device_map=None: The model will be loaded onto the default device (e.g., the GPU or CPU specified in your PyTorch setup).
# device_map="cpu": Forces the entire model to be loaded onto the CPU.
# device_map="cuda": Forces the entire model to be loaded onto the GPU.
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map="auto")

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [20]:
query = "I just discovered the course. Can I still join it?"
rag(query)

"<pad> Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.</s>"

In [11]:
# input_text = "translate English to German: How old are you?"
# -- return_tensors="pt": This argument specifies that the output should be returned as PyTorch tensors - it indicates
# -- that the tokenized output should be formatted as tensors compatible with PyTorch
# -- .input_ids: The tokenizer converts the input text into a dictionary containing tokenized representations. 
# -- Typically, this includes input_ids, attention_mask, and possibly other fields. input_ids are the tokenized 
# -- representations of your input text, where each token is mapped to an integer ID.
# -- The .to("cuda") method transfers the tensor to the GPU memory.
# input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

# outputs = model.generate(input_ids)
# result = tokenizer.decode(outputs[0])



<pad> Wie alt sind Sie?</s>
