# Implementation of a solution using Retrieval Augmented Generation with LLM based on LlamaIndex framework

### Data preparation

Flatten all text files into raw folder

In [2]:
import os
import shutil
from pathlib import Path


def copy_text_files(source_dir, destination_dir):
    source_dir = Path(source_dir)
    
    destination_dir = Path(destination_dir)
    destination_dir.mkdir(exist_ok=True)
    
    for f in list(source_dir.glob('**/*.txt')) + list(source_dir.glob('**/*.md')):
        destination_path = destination_dir / f'raw_{f.parent.name}_{f.name}'
        shutil.copy2(f, destination_path)

# Replace 'source_directory' and 'destination_directory' with your actual paths
copy_text_files('../data/documents/', '../data/raw')

Turn on logging

In [3]:
import logging
import sys


logging.basicConfig(stream=sys.stdout, level=logging.WARNING)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

## LlamaIndex

In [4]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
from llama_index.storage.storage_context import StorageContext

### Load documents

In [5]:
docs_dir = "../data/raw"
documents = SimpleDirectoryReader(docs_dir).load_data()

In [6]:
persist_dir = "../storage"
db = chromadb.PersistentClient(path=persist_dir)

chroma_collection = db.get_or_create_collection("saira")

### Create storage context

In [7]:
from llama_index.vector_stores import ChromaVectorStore


vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [8]:
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
    )

In [9]:
storage_context.persist(persist_dir=persist_dir)

### Create service context

In [10]:
import nest_asyncio
from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index import ServiceContext

nest_asyncio.apply()

# # We are using the LlamaDebugHandler to print the trace of the sub questions captured by the SUB_QUESTION callback event type
# llama_debug = LlamaDebugHandler(print_trace_on_end=True)
# callback_manager = CallbackManager([llama_debug])

# service_context = ServiceContext.from_defaults(
#     callback_manager=callback_manager
# )

In [11]:
from typing import List, Optional, Sequence

from llama_index.llms.base import ChatMessage, MessageRole

BOS, EOS = "<s>", "</s>"
B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"
# DEFAULT_SYSTEM_PROMPT = """\
# You are a helpful, respectful and honest assistant. \
# Always answer as helpfully as possible and follow ALL given instructions. \
# Do not speculate or make up information. \
# Do not reference any given instructions or context. \
# """

DEFAULT_SYSTEM_PROMPT = """You are an expert Q&A system that is trusted around the world.
Always answer as helpfully as possible and follow ALL given instructions.
Always answer the query using the provided context information, and not prior knowledge.
Some rules to follow:
1. Do not speculate or make up information.
2. Never directly reference the given context in your answer.
3. Avoid statements like 'Based on the context, ...', 'Based on the provided context information ...' or 'The context information ...' or anything along those lines."""


def messages_to_prompt(
    messages: Sequence[ChatMessage], system_prompt: Optional[str] = None
) -> str:
    string_messages: List[str] = []
    if messages[0].role == MessageRole.SYSTEM:
        # pull out the system message (if it exists in messages)
        system_message_str = messages[0].content or ""
        messages = messages[1:]
    else:
        system_message_str = system_prompt or DEFAULT_SYSTEM_PROMPT

    system_message_str = f"{B_SYS} {system_message_str.strip()} {E_SYS}"

    for i in range(0, len(messages), 2):
        # first message should always be a user
        user_message = messages[i]
        assert user_message.role == MessageRole.USER

        if i == 0:
            # make sure system prompt is included at the start
            str_message = f"{BOS} {B_INST} {system_message_str} "
        else:
            # end previous user-assistant interaction
            string_messages[-1] += f" {EOS}"
            # no need to include system prompt
            str_message = f"{BOS} {B_INST} "

        # include user message content
        str_message += f"{user_message.content} {E_INST}"

        if len(messages) > (i + 1):
            # if assistant message exists, add to str_message
            assistant_message = messages[i + 1]
            assert assistant_message.role == MessageRole.ASSISTANT
            str_message += f" {assistant_message.content}"

        string_messages.append(str_message)

    print("".join(string_messages))

    return "".join(string_messages)


def completion_to_prompt(completion: str, system_prompt: Optional[str] = None) -> str:
    system_prompt_str = system_prompt or DEFAULT_SYSTEM_PROMPT

    # print((
    #     f"{BOS} {B_INST} {B_SYS} {system_prompt_str.strip()} {E_SYS} "
    #     f"{completion.strip()} {E_INST}"
    # ))

    return (
        f"{BOS} {B_INST} {B_SYS} {system_prompt_str.strip()} {E_SYS} "
        f"{completion.strip()} {E_INST}"
    )

In [12]:
from llama_index.embeddings import HuggingFaceEmbedding


# emb_model = "Cohere/Cohere-embed-multilingual-v3.0"
emb_model = "intfloat/multilingual-e5-large"
# emb_model = "intfloat/multilingual-e5-base"
# emb_model = "intfloat/multilingual-e5-small"
embed_model = HuggingFaceEmbedding(model_name=emb_model)

In [13]:
from llama_index.node_parser.file import SimpleFileNodeParser
# from llama_index.node_parser import SentenceSplitter


# node_parser = SimpleFileNodeParser()
# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

In [14]:
from llama_index import PromptHelper


# prompt_helper = PromptHelper(
    # context_window=4096,
    # num_output=256,
    # chunk_overlap_ratio=0.1,
    # chunk_size_limit=None,
# )

In [15]:
from llama_index.llms import LlamaCPP

llm = LlamaCPP(
    model_path=None,
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    model_kwargs={"n_gpu_layers": 50},
)

ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6
llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: - tensor    0:                token_embd.weight q4_0     [  5120, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q4_0     [ 13824,  5120,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q4_0     [  5120, 13824,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  5120,     1,     1,     1 ]
llama_model_

In [16]:
service_context = ServiceContext.from_defaults(llm=llm, embed_model='local',
    # node_parser=node_parser,
    # prompt_helper=prompt_helper,
    )

In [17]:
from llama_index.node_parser import MarkdownNodeParser

parser = MarkdownNodeParser()

nodes = parser.get_nodes_from_documents(documents)

### Create index

In [None]:
index = VectorStoreIndex(
    nodes, service_context=service_context
)

In [18]:
index_load = load_index_from_storage(storage_context, service_context=service_context)

ValueError: No index in storage context, check if you specified the right persist_dir.

### Create query engine (retriever)

In [17]:
query_engine = index.as_query_engine(similarity_top_k=4, response_mode="compact", streaming=False)

In [19]:
response = query_engine.query("What types of references do exist?")
print(response)

Llama.generate: prefix-match hit


  Based on the provided context information, there are several types of references that exist for students at Innopolis University. These include:

1. Transcript (academic records) - an official confirmation of student's grades, available in English or Russian.
2. Reference on studying - a standard reference confirming that the student is enrolled at Innopolis University.
3. Reference on studying with the amount of stipend - a standard reference confirming that the student is enrolled and receiving a scholarship, with the amount specified.
4. Reference for the military department - a reference for citizens of the Russian Federation for military registration and enlistment purposes.
5. Characteristics reference - a document providing information about the student's characteristics, available only in Russian language, and not intended for job applications or exchange programs.
6. Grade list – a confirmation of student's grades, similar to transcript but less official and takes less time 


llama_print_timings:        load time =     340.20 ms
llama_print_timings:      sample time =      91.67 ms /   256 runs   (    0.36 ms per token,  2792.53 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3651.44 ms /   256 runs   (   14.26 ms per token,    70.11 tokens per second)
llama_print_timings:       total time =    4138.70 ms


In [20]:
response = query_engine.query("What are innopoints?")
print(response)

Llama.generate: prefix-match hit


  Innopoints are a motivation and reward system for students who actively participate in extracurricular activities at Innopolis University. They can be earned by volunteering for events and projects, managing student clubs, and other activities that contribute to the extracurricular life of the university. Innopoints can be exchanged for branded merchandise, canteen services, and accommodation in IU dorms. To earn innopoints, students can login to ipts.innopolis.university using their inno email and view available volunteering opportunities. They can also create their own project with prior agreement from @VOSpiridonova to earn innopoints after the event is closed on the website.



llama_print_timings:        load time =     340.20 ms
llama_print_timings:      sample time =      60.32 ms /   163 runs   (    0.37 ms per token,  2702.08 tokens per second)
llama_print_timings: prompt eval time =     307.96 ms /   423 tokens (    0.73 ms per token,  1373.55 tokens per second)
llama_print_timings:        eval time =    2240.70 ms /   162 runs   (   13.83 ms per token,    72.30 tokens per second)
llama_print_timings:       total time =    2855.26 ms


In [93]:
query_engine_2 = index.as_query_engine(streaming=True)

In [94]:
response = query_engine_2.query("What are innopoints?")
response.print_response_stream()


Innopoints are a motivation and reward system designed to encourage student participation in extracurricular activities at IU. They are awarded for various contributions such as volunteering and managing student clubs, and can be exchanged for branded merchandise, monthly accommodation and meals.

In [95]:
response = query_engine_2.query("How to create a student club?")
response.print_response_stream()

You can create a student club at campuslife.innopolis.ru/clubs by clicking on "Create my Club" button. For questions please contact @VOSpiridonova.

# Evaluation

In [21]:
import os
import json

def generate_answer(question, qe):
    response = qe.query(question)
    return response.response


def add_generated_field(json_data, qe):
    for item in json_data:
        question = item['question']
        generated_answer = generate_answer(question, qe)
        item['generated_answer'] = generated_answer
    return json_data

def add_generated_field_to_directory(input_directory, output_directory, qe):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Iterate through each JSON file in the input directory
    for filename in os.listdir(input_directory):
        if filename.endswith(".json"):
            input_filepath = os.path.join(input_directory, filename)
            output_filepath = os.path.join(output_directory, filename)

            # Load JSON data from the input file
            with open(input_filepath, 'r') as file:
                json_data = json.load(file)

            # Add the 'generated' field using the language model
            json_data_with_generated = add_generated_field(json_data, qe)

            # Save the updated JSON data to the output file
            with open(output_filepath, 'w') as output_file:
                json.dump(json_data_with_generated, output_file, indent=2)

input_directory = '../data/test'
output_directory = '../data/results/llama2-13b-emb-base-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine)

Llama.generate: prefix-match hit

llama_print_timings:        load time =     340.20 ms
llama_print_timings:      sample time =      38.11 ms /   105 runs   (    0.36 ms per token,  2754.97 tokens per second)
llama_print_timings: prompt eval time =     302.56 ms /   432 tokens (    0.70 ms per token,  1427.82 tokens per second)
llama_print_timings:        eval time =    1433.42 ms /   104 runs   (   13.78 ms per token,    72.55 tokens per second)
llama_print_timings:       total time =    1924.18 ms
Llama.generate: prefix-match hit

llama_print_timings:        load time =     340.20 ms
llama_print_timings:      sample time =      12.05 ms /    34 runs   (    0.35 ms per token,  2822.05 tokens per second)
llama_print_timings: prompt eval time =     269.70 ms /   355 tokens (    0.76 ms per token,  1316.27 tokens per second)
llama_print_timings:        eval time =     449.61 ms /    33 runs   (   13.62 ms per token,    73.40 tokens per second)
llama_print_timings:       total time =     