In [1]:
! pip install flash-attn tiktoken einops triton python-dotenv pytest

Collecting flash-attn
  Using cached flash_attn-2.6.3-cp39-cp39-linux_x86_64.whl
Collecting tiktoken
  Using cached tiktoken-0.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting pytest
  Using cached pytest-8.3.2-py3-none-any.whl.metadata (7.5 kB)
Collecting iniconfig (from pytest)
  Using cached iniconfig-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pluggy<2,>=1.5 (from pytest)
  Using cached pluggy-1.5.0-py3-none-any.whl.metadata (4.8 kB)
Collecting tomli>=1 (from pytest)
  Using cached tomli-2.0.1-py3-none-any.whl.metadata (8.9 kB)
Using cached tiktoken-0.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
Using cached python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Using cached pytest-8.3.2-py3-none-any.whl (341 kB)
Using cached pluggy-1.5.0-py3-none-any.whl (20 kB)
Using cached tomli-2.0.1-py3-none-any.whl (12 kB)
Using cached i

In [2]:
import json
import os
from dotenv import load_dotenv
from minsearch import Index

# Load environment variables via python-dotenv
# (presumably this is where MISTRAL_TOKEN, read in a later cell, comes from — confirm).
load_dotenv()

# Set HF_HOME for this process; it is assigned before the Hugging Face
# libraries are imported in later cells.
# NOTE(review): "/run/cache/" is a hard-coded absolute path — confirm it exists on the target machine.
os.environ["HF_HOME"] = "/run/cache/"

In [4]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub with the token held in the
# MISTRAL_TOKEN environment variable; raises KeyError if that variable is unset.
login(token = os.environ['MISTRAL_TOKEN'])

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /run/cache/token
Login successful


In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Fail fast: check for a GPU before spending time loading the checkpoint.
assert torch.cuda.is_available(), "This model needs a GPU to run ..."
device = torch.cuda.current_device()

torch.random.manual_seed(0)
model_id = "mistralai/Mistral-7B-v0.1"

# 4-bit quantization is requested through `quantization_config`; passing
# `load_in_4bit=True` directly to from_pretrained() is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

# The tokenizer is configured to pad on the left.
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [10]:
# Sanity check: print whether CUDA is available and the index of the current CUDA device.
print(torch.cuda.is_available(), torch.cuda.current_device())

True 0


In [14]:
# Turn off PyTorch's memory-efficient and flash scaled-dot-product-attention backends.
# NOTE(review): the reason is not recorded in this notebook — presumably a
# GPU/kernel compatibility workaround; confirm before removing.
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_flash_sdp(False)

In [23]:
# Smoke test: one direct generate() call on the loaded model, bypassing the pipeline.
prompt = "Do you love me ?"


# Tokenize to PyTorch tensors and move them to the GPU.
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# max_length = 60 bounds the generated sequence
# (per the transformers docs this count includes the prompt tokens — confirm).
generate_ids = model.generate(**inputs, max_length = 60)

# Decode the batch and keep the first (only) sequence, dropping special tokens.
ans = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [24]:
# Display the decoded text produced by the smoke-test generation.
ans

'Do you love me ?\n\nI’m not sure if I’m a good person.\n\nI’m not sure if I’m a bad person.\n\nI’m not sure if I’m a good person.\n\nI’m not sure if I'

In [50]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline;
# chat() calls this module-level `generator`.
generator = pipeline("text-generation", model = model, tokenizer = tokenizer)

In [25]:
documents = []

# Read the raw FAQ dump. The encoding is given explicitly so decoding does not
# depend on the platform's locale default.
# `docs` is consumed by the next cell as a list of entries that each have a
# 'course' value and a 'documents' list.
with open('documents.json', 'r', encoding='utf-8') as file:
    docs = json.load(file)
    
    

In [26]:
# Flatten the per-course groups in `docs` into one list of FAQ entries,
# tagging each entry with the course it belongs to.
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of every entry.
documents = []

for doc in docs:
    for document in doc['documents']:
        # In-place update: the nested dicts inside `docs` gain the 'course' key too.
        document['course'] = doc['course']
        documents.append(document)

In [27]:
# Entry keys handed to minsearch as `text_fields`; 'course' is passed separately
# as a keyword field and is the key that search() filters on.
textfields = ["text", "section", "question"]


# Build the index over the flattened FAQ entries.
# fit() is the last expression of the cell, so its return value is displayed below.
indobject = Index(text_fields = textfields, keyword_fields = ['course'])
indobject.fit(documents)

<minsearch.Index at 0x7f4d324b1eb0>

In [28]:
def search(query, boost_dict = {"question": 3}, filter_dict =  {"course":"mlops-zoomcamp"}, num_results = 5 ):
    """Query the module-level `indobject` index and return its result unchanged.

    Every argument is forwarded by keyword to `indobject.search`:
    `query`, `boost_dict` (default boosts 'question' by 3), `filter_dict`
    (default restricts to course 'mlops-zoomcamp') and `num_results`
    (default 5).
    """
    return indobject.search(
        query=query,
        boost_dict=boost_dict,
        filter_dict=filter_dict,
        num_results=num_results,
    )

In [65]:
def build_prompt(query, related_docs):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    `related_docs` is iterated once; every entry must provide the keys
    'question' and 'text' (a missing key raises KeyError). Each entry is
    rendered as a "question: ... / answer: ..." pair and the pairs are
    concatenated into the context section of the prompt.

    Returns the prompt string with leading/trailing whitespace stripped.
    """
    # The template lines are separated by a newline followed by four spaces,
    # so the blank separator lines carry that indentation as well.
    template = "\n    ".join(
        [
            "Only generate the answer for the query based on the context given",
            "",
            "question:{query}",
            "",
            "context:{context}",
            "",
            "answer:",
        ]
    )

    rendered_entries = [
        f"question: {entry['question']}\nanswer: {entry['text']} \n\n"
        for entry in related_docs
    ]

    filled = template.format(query=query, context="".join(rendered_entries))
    return filled.strip()

In [66]:
def chat(prompt, generation_args = None):
    """Generate a completion for `prompt` with the module-level `generator`.

    Parameters
    ----------
    prompt : str
        Prompt text; surrounding whitespace is stripped before generation.
    generation_args : dict, optional
        Extra keyword arguments for the `generator` call. Entries override
        the defaults used here (max_length=900, temperature=0.7, top_p=0.95,
        num_return_sequences=1). `None` or an empty dict keeps the defaults.

    Returns
    -------
    str
        The 'generated_text' value of the first returned sequence.
    """
    # NOTE(review): temperature/top_p are sampling parameters and no do_sample
    # flag is passed, so whether they take effect depends on the model's
    # generation config — confirm the intended decoding mode.
    call_args = {
        "max_length": 900,
        "temperature": 0.7,
        "top_p": 0.95,
        "num_return_sequences": 1,
    }
    # Merge into a fresh dict so the caller's mapping is never mutated.
    call_args.update(generation_args or {})

    ans = generator(prompt.strip(), **call_args)
    # Echo the raw generator output (this includes the full prompt text).
    print(ans)

    return ans[0]['generated_text']
    
    
    
    

In [67]:
def rag(query, boost_dict = {"question": 3}, course_filter =  {"course": "data-engineering-zoomcamp"}, generation_args ={}):
    """Answer `query` in three steps: search(), build_prompt(), chat().

    `boost_dict` and `course_filter` are forwarded to search() (the latter as
    its `filter_dict`); `generation_args` is forwarded to chat().

    Returns the text produced by chat() with its first len(prompt) characters
    removed, i.e. the slice assumes the generated text starts with the prompt.
    """
    retrieved = search(query = query, boost_dict = boost_dict, filter_dict = course_filter)
    full_prompt = build_prompt(query, retrieved)
    generated = chat(full_prompt, generation_args = generation_args)

    # Drop the prompt-sized prefix so only the text after it is returned.
    completion_start = len(full_prompt)
    return generated[completion_start:]
    

In [68]:
# Example question used to exercise the rag() pipeline in the next cell.
query = "How to run spark engine ?"

In [69]:
# Run the full pipeline with rag()'s default boost and course filter.
# chat() prints the raw generator output, which is why this cell has output.
answer = rag(query)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[{'generated_text': 'Only generate the answer for the query based on the context given\n    \n    question:How to run spark engine ?\n    \n    context:question: How to spark standalone cluster is run on windows OS\nanswer: Change the working directory to the spark directory:\nif you have setup up your SPARK_HOME variable, use the following;\ncd %SPARK_HOME%\nif not, use the following;\ncd <path to spark installation>\nCreating a Local Spark Cluster\nTo start Spark Master:\nbin\\spark-class org.apache.spark.deploy.master.Master --host localhost\nStarting up a cluster:\nbin\\spark-class org.apache.spark.deploy.worker.Worker spark://localhost:7077 --host localhost \n\nquestion: Docker engine stopped_failed to fetch extensions\nanswer: The docker will keep on crashing continuously\nNot working after restart\ndocker engine stopped\nAnd failed to fetch extensions pop ups will on screen non-stop\nSolution :\nTry checking if latest version of docker is installed / Try updating the docker\nIf 

In [70]:
# Display the returned answer as a Python string repr (escapes shown literally).
answer

'\n    spark-submit --class org.apache.spark.sql.streaming.StreamingExamples \\\n    --master yarn \\\n    --deploy-mode cluster \\\n    --executor-memory 1G \\\n    --num-executors 1 \\\n    --executor-cores 1 \\\n    --queue default \\\n    --files hdfs:///user/spark/examples/jars/spark-examples_2.11-2.3.0.jar \\\n    --jars hdfs:///'

In [71]:
# Print the answer so its newlines and backslashes render as text.
# `answer` is referenced by name rather than through IPython's `_`, which only
# holds it when the previously executed cell happened to display it.
print(answer)


    spark-submit --class org.apache.spark.sql.streaming.StreamingExamples \
    --master yarn \
    --deploy-mode cluster \
    --executor-memory 1G \
    --num-executors 1 \
    --executor-cores 1 \
    --queue default \
    --files hdfs:///user/spark/examples/jars/spark-examples_2.11-2.3.0.jar \
    --jars hdfs:///
