# Implementation of a solution using Retrieval Augmented Generation with LLM based on LlamaIndex framework

In [5]:
import logging
import sys


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

In [6]:
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import ServiceContext, StorageContext

In [7]:
from llama_index.storage.docstore import SimpleDocumentStore
from llama_index.storage.index_store import SimpleIndexStore
from llama_index.vector_stores import SimpleVectorStore

# create storage context using default stores
storage_context = StorageContext.from_defaults(
    docstore=SimpleDocumentStore(),
    vector_store=SimpleVectorStore(),
    index_store=SimpleIndexStore(),
)

In [8]:
# from llama_index.vector_stores import ChromaVectorStore
# import chromadb


# chroma_client = chromadb.PersistentClient()
# chroma_collection = chroma_client.create_collection("quickstart")
# vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
# storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [9]:
from llama_index.llms import Ollama


llm = Ollama(model="mistral")

In [10]:
# Modify context (documents splitter, llm)
service_context = ServiceContext.from_defaults(chunk_size=1000, llm=llm)

******
Could not load OpenAIEmbedding. Using HuggingFaceBgeEmbeddings with model_name=BAAI/bge-small-en. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

******
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
Starting new HTTPS connection (1): huggingface.co:443
Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-small-en/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /BAAI/bge-small-en/resolve/main/config.json HTTP/1.1" 200 0
https://huggingface.co:443 "HEAD /BAAI/bge-small-en/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 140358059793600 on /tmp/llama_index/models/models--BAA

Downloading config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140358059793600 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e610cfadfe462add405a5495e10c55a6a11e15f4.lock
Attempting to release lock 140358059793600 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e610cfadfe462add405a5495e10c55a6a11e15f4.lock
Attempting to release lock 140358059793600 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e610cfadfe462add405a5495e10c55a6a11e15f4.lock
DEBUG:filelock:Lock 140358059793600 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e610cfadfe462add405a5495e10c55a6a11e15f4.lock
Lock 140358059793600 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e610cfadfe462add405a5495e10c55a6a11e15f4.lock
Lock 140358059793600 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e610cfadfe462add405a5495e10c55a6a11e15f4.lock
INFO:torch.distributed.nn.jit.instantiator:Created a temporary directory at /tmp/tmpqe4m6wds
Created a temporary di

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140357427670240 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e3f93baaef0f082d01814223d65942241664d215849a5b76a95f9d390da379b6.lock
Attempting to release lock 140357427670240 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e3f93baaef0f082d01814223d65942241664d215849a5b76a95f9d390da379b6.lock
Attempting to release lock 140357427670240 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e3f93baaef0f082d01814223d65942241664d215849a5b76a95f9d390da379b6.lock
DEBUG:filelock:Lock 140357427670240 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e3f93baaef0f082d01814223d65942241664d215849a5b76a95f9d390da379b6.lock
Lock 140357427670240 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e3f93baaef0f082d01814223d65942241664d215849a5b76a95f9d390da379b6.lock
Lock 140357427670240 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/e3f93baaef0f082d01814223d65942241664d2158

Downloading tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140360624828624 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/37fca74771bc76a8e01178ce3a6055a0995f8093.lock
Attempting to release lock 140360624828624 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/37fca74771bc76a8e01178ce3a6055a0995f8093.lock
Attempting to release lock 140360624828624 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/37fca74771bc76a8e01178ce3a6055a0995f8093.lock
DEBUG:filelock:Lock 140360624828624 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/37fca74771bc76a8e01178ce3a6055a0995f8093.lock
Lock 140360624828624 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/37fca74771bc76a8e01178ce3a6055a0995f8093.lock
Lock 140360624828624 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/37fca74771bc76a8e01178ce3a6055a0995f8093.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-small-en/resolve/main/vocab.txt HTTP/1.1" 2

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140357372962768 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
Attempting to release lock 140357372962768 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
Attempting to release lock 140357372962768 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
DEBUG:filelock:Lock 140357372962768 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
Lock 140357372962768 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
Lock 140357372962768 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-small-en/resolve/main/tokenizer.json HTTP/1

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140357372967712 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/688882a79f44442ddc1f60d70334a7ff5df0fb47.lock
Attempting to release lock 140357372967712 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/688882a79f44442ddc1f60d70334a7ff5df0fb47.lock
Attempting to release lock 140357372967712 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/688882a79f44442ddc1f60d70334a7ff5df0fb47.lock
DEBUG:filelock:Lock 140357372967712 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/688882a79f44442ddc1f60d70334a7ff5df0fb47.lock
Lock 140357372967712 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/688882a79f44442ddc1f60d70334a7ff5df0fb47.lock
Lock 140357372967712 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/688882a79f44442ddc1f60d70334a7ff5df0fb47.lock
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /BAAI/bge-small-en/resolve/main/added_tokens.json HTT

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140357372963152 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/a8b3208c2884c4efb86e49300fdd3dc877220cdf.lock
Attempting to release lock 140357372963152 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/a8b3208c2884c4efb86e49300fdd3dc877220cdf.lock
Attempting to release lock 140357372963152 on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/a8b3208c2884c4efb86e49300fdd3dc877220cdf.lock
DEBUG:filelock:Lock 140357372963152 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/a8b3208c2884c4efb86e49300fdd3dc877220cdf.lock
Lock 140357372963152 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/a8b3208c2884c4efb86e49300fdd3dc877220cdf.lock
Lock 140357372963152 released on /tmp/llama_index/models/models--BAAI--bge-small-en/blobs/a8b3208c2884c4efb86e49300fdd3dc877220cdf.lock


In [16]:
import os
import shutil


def copy_text_files(source_dir, destination_dir):
    # Create the destination directory if it doesn't exist
    if not os.path.exists(destination_dir):
        os.makedirs(destination_dir)

    # Traverse the source directory
    for root, dirs, files in os.walk(source_dir):
        for file in files:
            # Check if the file has a .txt extension (you can modify this condition)
            if file.endswith(".txt") or file.endswith(".md"):
                source_path = os.path.join(root, file)
                destination_path = os.path.join(destination_dir, 'raw_' + root.split('/')[-1] + '_' + file)

                # Copy the file to the destination directory
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file}")

# Replace 'source_directory' and 'destination_directory' with your actual paths
copy_text_files('../parsed', '../parsed/raw')

Copied: 11. Volunteering & Innopoints.md
Copied: 2. Accommodation & Meal Plan.md
Copied: 5. How to Order References?.md
Copied: 12. Migration Control for International Students.md
Copied: 9.5. Exchange Opportunities.md
Copied: 13. Library.md
Copied: 1. Academic Plan.md
Copied: 9. Rules of registration for the military service:.md
Copied: 8. Scholarships & Financial Support.md
Copied: 10. Your Health.md
Copied: Extracurricular.md
Copied: Admin.md
Copied: Academic.md
Copied: Household.md
Copied: apply.innopolis.university_master.txt
Copied: apply.innopolis.university_grant.txt
Copied: innopolis.university_sveden_apply.txt
Copied: apply.innopolis.university_olymp-math.txt
Copied: apply.innopolis.ru_get-in.txt
Copied: apply.innopolis.university_olympiad-bonus.txt
Copied: apply.innopolis.university_faq.txt
Copied: career.innopolis.university.txt
Copied: itproductdevelopment.md
Copied: venturecapitalhacksfromzerotonegotiatinganinvestmentdeal.tex.md
Copied: introductiontoitentrepreneurship.te

In [17]:
docs_dir = "../parsed/raw"
documents = SimpleDirectoryReader(docs_dir).load_data()

DEBUG:llama_index.readers.file.base:> [SimpleDirectoryReader] Total files added: 1230
> [SimpleDirectoryReader] Total files added: 1230
> [SimpleDirectoryReader] Total files added: 1230


In [18]:
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context, storage_context=storage_context
)

DEBUG:llama_index.node_parser.node_utils:> Adding chunk: All:Schedule






Current Schedul...
> Adding chunk: All:Schedule






Current Schedul...
> Adding chunk: All:Schedule






Current Schedul...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: ALL:StudyPlan






Current Study...
> Adding chunk: ALL:StudyPlan






Current Study...
> Adding chunk: ALL:StudyPlan






Current Study...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Key concepts of the class


* Identification of...
> Adding chunk: Key concepts of the class


* Identification of...
> Adding chunk: Key concepts of the class


* Identification of...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: What is the purpose of this course?


After the...
> Adding chunk: What is the purpose of this course?


After the...
> Adding chunk: What is the purpose of this course?


After the...
DEBUG:llama_index.node_parser.node_utils:> Adding chunk: Course objectives based on Bloom’s taxonomy
> Adding chu

In [20]:
query_engine = index.as_query_engine(similarity_top_k=2, response_mode="compact", streaming=True)  # Modify retriever
response = query_engine.query("Who is 319?")

DEBUG:llama_index.indices.utils:> Top 2 nodes:
> [Node 25bd83ee-ee51-4ccd-992d-09d014848f22] [Similarity score:             0.824961] What are the working hours of 319 office?
Monday to Friday from 14:00 to 18:00
> [Node cb1b6a2f-b4e1-444c-9e32-9ee0267ce4d2] [Similarity score:             0.824961] What are the working hours of 319 office?
Monday to Friday from 14:00 to 18:00
> Top 2 nodes:
> [Node 25bd83ee-ee51-4ccd-992d-09d014848f22] [Similarity score:             0.824961] What are the working hours of 319 office?
Monday to Friday from 14:00 to 18:00
> [Node cb1b6a2f-b4e1-444c-9e32-9ee0267ce4d2] [Similarity score:             0.824961] What are the working hours of 319 office?
Monday to Friday from 14:00 to 18:00
> Top 2 nodes:
> [Node 25bd83ee-ee51-4ccd-992d-09d014848f22] [Similarity score:             0.824961] What are the working hours of 319 office?
Monday to Friday from 14:00 to 18:00
> [Node cb1b6a2f-b4e1-444c-9e32-9ee0267ce4d2] [Similarity score:             0.824961] What a

In [None]:
# query_engine = index.as_chat_engine(similarity_top_k=2, response_mode="compact", streaming=True)  # Modify retriever
# response = query_engine.chat("What did the author do growing up?")

In [21]:
response.print_response_stream()

I'm sorry, but I need more information to provide a response to your query. Can you please provide additional context or clarify what you are asking about?