# Implementation of a Retrieval-Augmented Generation (RAG) solution with an LLM, based on the LlamaIndex framework

### Data preparation

Flatten all text files into the `raw` folder

In [1]:
import os
import shutil


def copy_text_files(source_dir, destination_dir):
    """Flatten every ``.txt`` / ``.md`` file under ``source_dir`` into ``destination_dir``.

    The source tree is walked recursively. Each matching file is copied with
    ``shutil.copy2`` to ``destination_dir`` under the name
    ``raw_<parent directory name>_<file name>``.

    Files that share both their parent directory name and their file name map
    to the same destination path; the copy made last overwrites earlier ones.

    Args:
        source_dir: Root of the tree to scan. A trailing path separator is allowed.
        destination_dir: Flat output directory; created if it does not exist.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(destination_dir, exist_ok=True)

    for root, _dirs, files in os.walk(source_dir):
        # normpath strips a trailing separator, so the parent name is never the
        # empty string (which produced 'raw__<file>' names), and basename works
        # with the platform's own separator rather than a hard-coded '/'.
        parent_name = os.path.basename(os.path.normpath(root))
        for file in files:
            # Only plain-text and Markdown sources are collected.
            if file.endswith((".txt", ".md")):
                source_path = os.path.join(root, file)
                destination_path = os.path.join(
                    destination_dir, 'raw_' + parent_name + '_' + file
                )
                shutil.copy2(source_path, destination_path)

# Flatten both document trees into the shared raw folder.
# Replace the source paths and the destination path with your actual paths.
# NOTE(review): the second path lies inside the first, which os.walk already
# descends into, so the second pass may only re-copy the same files - confirm.
for source_directory in ('../data/documents/', '../data/documents/edu-wiki'):
    copy_text_files(source_directory, '../data/raw')

Turn on logging

In [1]:
import logging
import sys


# Send WARNING-and-above log records to stdout so they show up in the cell output.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger, leaving exactly one stdout handler. Attaching a second StreamHandler
# in addition to basicConfig's own would print every record twice.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING, force=True)

## LlamaIndex

In [2]:
import chromadb
import os
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
from llama_index.vector_stores import ChromaVectorStore, SimpleVectorStore
from pathlib import Path

### Load documents

In [3]:
# Load the contents of the flattened raw folder as LlamaIndex documents.
docs_dir = "../data/raw"
reader = SimpleDirectoryReader(docs_dir)
documents = reader.load_data()

### Create service context

In [4]:
from llama_index.llms import Ollama


# Name of the Ollama model to use; keep exactly one assignment uncommented.
# NOTE(review): the result directory names in the evaluation cells encode the
# model name by hand - keep them in sync with the choice made here.
# llm_model = "llama2:13b"
llm_model = "mistral"
# llm_model = "orca2:13b"
# llm_model = "vicuna:13b-16k"
llm = Ollama(model=llm_model)

In [5]:
from llama_index.node_parser.file import SimpleFileNodeParser
# from llama_index.node_parser import SentenceSplitter


# node_parser = SimpleFileNodeParser()
# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

In [6]:
from llama_index import PromptHelper


# prompt_helper = PromptHelper(
    # context_window=4096,
    # num_output=256,
    # chunk_overlap_ratio=0.1,
    # chunk_size_limit=None,
# )

In [5]:
from llama_index.embeddings import HuggingFaceEmbedding


# Embedding model handed to the service context below as `embed_model`.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-large-en-v1.5")  # Best model for retrieval according to MTEB

[2023-11-27 16:06:59,648] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [6]:
# Bundle the LLM, the embedding model and the chunking parameters that the
# index-building and query cells share.
service_context = ServiceContext.from_defaults(
    chunk_size=1024,
    chunk_overlap=20,
    # Alternative embedding configurations:
    # embed_model='local:BAAI/bge-large-en-v1.5',  # Best model for retrieval according to MTEB
    # embed_model='local',
    embed_model=embed_model,
    llm=llm,
    # Optional components, currently disabled:
    # node_parser=node_parser,
    # prompt_helper=prompt_helper,
)



### Create storage context (for ChromaDB)

In [40]:
# Absolute location of the persistent Chroma database.
persist_dir = os.path.abspath("../storage/chromadb")

# allow_reset=True is set so that the reset call in the next cell is permitted.
chroma_settings = chromadb.Settings(allow_reset=True)
db = chromadb.PersistentClient(path=persist_dir, settings=chroma_settings)
db

<chromadb.api.client.Client at 0x7f59742ae860>

In [41]:
# Reset the Chroma database (the client above was created with allow_reset=True).
# NOTE(review): presumably this discards all stored collections - run it only
# when a full rebuild of the index is intended.
db.reset()

True

In [30]:
# Hugging Face embedding function for Chroma. It is not referenced by any other
# cell visible in this notebook.
# The API token is read from the HUGGINGFACE_API_KEY environment variable so
# that no credential is stored in the notebook; a KeyError is raised if unset.
# NOTE(review): the token that used to be hard-coded here was committed in
# plain text and should be revoked.
huggingface_ef = chromadb.utils.embedding_functions.HuggingFaceEmbeddingFunction(
    api_key=os.environ["HUGGINGFACE_API_KEY"],
    model_name="BAAI/bge-large-en-v1.5"
)

In [42]:
# Fetch the "saira" collection (creating it when missing) and preview its contents.
chroma_collection = db.get_or_create_collection("saira")
chroma_collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None}

In [43]:
# Wrap the Chroma collection in a LlamaIndex vector store and show its configuration.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
vector_store.to_dict()

{'stores_text': True,
 'is_embedding_query': True,
 'flat_metadata': True,
 'collection_name': None,
 'host': None,
 'port': None,
 'ssl': False,
 'headers': None,
 'persist_dir': None,
 'collection_kwargs': {},
 'class_name': 'ChromaVectorStore'}

In [44]:
# Build a storage context around the Chroma vector store and list its vector stores.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
storage_context.vector_stores

{'default': ChromaVectorStore(stores_text=True, is_embedding_query=True, flat_metadata=True, collection_name=None, host=None, port=None, ssl=False, headers=None, persist_dir=None, collection_kwargs={})}

In [45]:
# Build the vector index from the loaded documents, using the shared service
# context and the Chroma-backed storage context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    storage_context=storage_context
)

In [46]:
# Display the index object to confirm that it was created.
index

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x7f5ad85d6a40>

In [47]:
# Persist the index's storage context into the Chroma persist_dir.
index.storage_context.persist(persist_dir=persist_dir)

In [54]:
# Recreate the index object from the existing vector store; no documents are
# passed here, unlike the from_documents call used to build it.
index = VectorStoreIndex.from_vector_store(
    vector_store,
    service_context=service_context,
)

In [13]:
# Reuse the index when an index store file already exists in persist_dir;
# otherwise build it from the documents and persist it.
index_store_path = f"{persist_dir}/index_store.json"
if not os.path.exists(index_store_path):  # Create
    print('Creating index')
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        service_context=service_context,
    )
    index.storage_context.persist(persist_dir=persist_dir)
else:  # Load
    print('Loading index')
    index = VectorStoreIndex.from_vector_store(
        vector_store,
        service_context=service_context,
    )

Creating index


### Create (or load) the storage context (SimpleVectorStore) and index

In [56]:
# Location of the on-disk SimpleVectorStore index, resolved to an absolute path.
persist_dir = "../storage/simple/"
persist_dir = str(Path(persist_dir).resolve())

if os.path.exists(persist_dir + '/index_store.json'):  # Load
    # A previously persisted vector store may be named 'default__vector_store.json';
    # it is renamed to 'vector_store.json' before loading.
    # NOTE(review): presumably SimpleVectorStore.from_persist_dir looks for the
    # un-prefixed name in the installed llama_index version - confirm on upgrade.
    old_name = '/default__vector_store.json'
    new_name = '/vector_store.json'
    if os.path.exists(persist_dir + old_name):
        os.rename(persist_dir + old_name, persist_dir + new_name)
        print(f"File renamed from '{old_name}' to '{new_name}'.")
    print('Loading storage context')
    storage_context = StorageContext.from_defaults(
        vector_store=SimpleVectorStore.from_persist_dir(persist_dir=persist_dir),
        persist_dir=persist_dir,
    )
    print('Loading index')
    index = load_index_from_storage(storage_context, service_context=service_context)
else:  # Create
    storage_context = StorageContext.from_defaults(
        vector_store=SimpleVectorStore(),
    )
    print('Creating index')
    index = VectorStoreIndex.from_documents(
        documents,
        service_context=service_context,
        storage_context=storage_context
    )
    print('Persisting index')
    # One persist call is enough: the index was built on `storage_context`, so
    # persisting both it and `index.storage_context` wrote the same data twice.
    index.storage_context.persist(persist_dir=persist_dir)

Loading storage context
Loading index


### Create storage context (FAISS) and index

In [23]:
import faiss
from llama_index.vector_stores.faiss import FaissVectorStore


# The flat L2 FAISS index must have the same dimensionality as the embeddings.
d = 1024  # BAAI/bge-large-en-v1.5 embedding size
faiss_index = faiss.IndexFlatL2(d)

# Location of the on-disk FAISS index, resolved to an absolute path.
persist_dir = str(Path("../storage/faiss/").resolve())

index_already_persisted = os.path.exists(f"{persist_dir}/index_store.json")
if not index_already_persisted:  # Create
    vector_store = FaissVectorStore(faiss_index=faiss_index)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    print('Creating index')
    index = VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        service_context=service_context,
    )
    print('Persisting index')
    index.storage_context.persist(persist_dir=persist_dir)
else:  # Load
    # Rename a vector store file persisted under the 'default__' prefix to the
    # plain name before loading it.
    old_name = '/default__vector_store.json'
    new_name = '/vector_store.json'
    prefixed_path = persist_dir + old_name
    if os.path.exists(prefixed_path):
        os.rename(prefixed_path, persist_dir + new_name)
        print(f"File renamed from '{old_name}' to '{new_name}'.")
    print('Loading storage context')
    vector_store = FaissVectorStore.from_persist_dir(persist_dir)
    storage_context = StorageContext.from_defaults(
        persist_dir=persist_dir,
        vector_store=vector_store,
    )
    print('Loading index')
    index = load_index_from_storage(storage_context, service_context=service_context)

File renamed from '/default__vector_store.json' to '/vector_store.json'.
Loading storage context
Loading index


### Create query engine (retriever)

In [24]:
# Query engine with the index's default settings.
query_engine_default = index.as_query_engine()

In [15]:
# Query engine that retrieves the 10 most similar nodes and answers in
# "tree_summarize" response mode, without streaming.
query_engine_sum = index.as_query_engine(similarity_top_k=10, response_mode="tree_summarize", streaming=False)

In [None]:
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import KeywordNodePostprocessor

# Retrieve the 10 most similar nodes for each query.
retriever = VectorIndexRetriever(index=index, similarity_top_k=10)

# Keyword filter applied to the retrieved nodes.
# NOTE(review): the required keyword "Combinator" and the empty exclude entry
# look like leftovers from an example - confirm they are intended for this corpus.
required_keywords = ["Combinator"]
exclude_keywords = [
    "Based on the information provided",
    "Without any prior knowledge",
    "Based on the information provided from the multiple sources",
    "",
]
node_postprocessors = [
    KeywordNodePostprocessor(
        required_keywords=required_keywords,
        exclude_keywords=exclude_keywords,
    )
]

# Query engine combining the retriever with the keyword post-processing step.
query_engine_post = RetrieverQueryEngine.from_args(
    retriever,
    node_postprocessors=node_postprocessors,
)

## Retrieve

In [27]:
# Ask the default query engine and print only the answer text.
question = "How to create a student club?"
response = query_engine_default.query(question)
print(response.response)

To create a student club, you can follow these steps:

1. Visit the campuslife.innopolis.ru/clubs page.
2. Click on the "Create my Club" button.
3. Fill out the necessary information about your club, such as its name, purpose, and activities.
4. Provide any additional details or requirements for membership.
5. Submit the form to create your student club.

For questions or further assistance, you can contact Evgeniia Dyuzhakova at @janedyuzha_dyuzha.


In [13]:
# Ask the default query engine and print the response.
question = "What are Innopoints?"
response = query_engine_default.query(question)
print(response)

Innopoints are a motivation and reward system for students who actively participate in extracurricular activities at IU. They can be earned through volunteering and managing student clubs, among other activities. Innopoints can be exchanged for various rewards such as branded merchandise, monthly accommodation, and meals. The InnoStore is a platform where students can redeem their innopoints for these rewards.


In [14]:
# Ask the default query engine and print the response.
question = "Who is dean and how to contact their?"
response = query_engine_default.query(question)
print(response)

The provided context information does not mention anything about a dean or their contact information.


In [16]:
# Ask the summarizing query engine and print the response.
question = "What are innopoints?"
response = query_engine_sum.query(question)
print(response)


It appears that "innopoints" may refer to points assigned to students as part of an educational system or program at Innopolis University. However, without more context, it is difficult to provide a specific definition or explanation for what these points represent or how they are awarded. If you could provide more information about the system in question, I would be happy to try and help answer your query.


In [17]:
# Ask the summarizing query engine and print the response.
question = "How to create a student club?"
response = query_engine_sum.query(question)
print(response)

To create a student club at Innopolis University, you can follow these steps:
1. Go to the campuslife.innopolis.ru/clubs website.
2. Click on the "Create my Club" button.
3. Fill out the necessary information for your club, such as its name, description, and purpose.
4. Submit your application and wait for approval from the university administration.
5. Once approved, you can start organizing activities and events for your club.
6. To promote your club, you can use various resources provided by the university, such as audience, basic inventory/equipment, and promotion support.
7. You can also participate in Club League, where students earn points for various criteria. The top 20 clubs at the end of each semester receive a budget that they can spend on relevant purchases or services, and the leaders of these clubs can claim points for a Higher Scholarship every semester and innopoints in the end of academic year.
8. Joining a student club will give you the opportunity to learn more abou

# Evaluation

In [18]:
import os
import json

def generate_answer(question, qe):
    """Return the answer text that query engine ``qe`` produces for ``question``.

    ``qe`` must provide a ``query(question)`` method; the ``response``
    attribute of the object it returns is handed back to the caller.
    """
    return qe.query(question).response


def add_generated_field(json_data, qe):
    """Attach a generated answer to every item of ``json_data``.

    Each item must have a ``'question'`` key. The answer obtained from
    ``generate_answer`` is stored under ``'generated_answer'``. The items are
    modified in place and the same ``json_data`` object is returned.
    """
    for entry in json_data:
        entry['generated_answer'] = generate_answer(entry['question'], qe)
    return json_data

def add_generated_field_to_directory(input_directory, output_directory, qe):
    """Answer the questions of every JSON file in a directory and save the results.

    For each ``*.json`` file in ``input_directory`` the data is loaded, passed
    through ``add_generated_field`` with query engine ``qe``, and written to a
    file of the same name in ``output_directory``. Files whose output already
    exists are skipped, so an interrupted run can be resumed.

    Args:
        input_directory: Directory containing the question files.
        output_directory: Directory for the answered files; created if missing.
        qe: Query engine forwarded to ``add_generated_field``.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_directory, exist_ok=True)

    # os.listdir returns entries in arbitrary order; sorting makes runs reproducible.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".json"):
            continue

        input_filepath = os.path.join(input_directory, filename)
        output_filepath = os.path.join(output_directory, filename)
        # Already answered in an earlier run.
        if os.path.isfile(output_filepath):
            continue

        # Explicit UTF-8 keeps reading and writing independent of the platform locale.
        with open(input_filepath, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        json_data_with_generated = add_generated_field(json_data, qe)

        # ensure_ascii=False stores non-ASCII answers readably instead of as \u escapes.
        with open(output_filepath, 'w', encoding='utf-8') as output_file:
            json.dump(json_data_with_generated, output_file, indent=2, ensure_ascii=False)

In [19]:
# Directory holding the JSON question files used by the evaluation cells.
input_directory = '../data/test'

In [20]:
# Answer every test file with the default query engine and store the results.
output_directory = '../data/results/faiss-mistral-7b-emb-large-default'
add_generated_field_to_directory(
    input_directory=input_directory,
    output_directory=output_directory,
    qe=query_engine_default,
)

In [21]:
# Answer every test file with the summarizing query engine and store the results.
output_directory = '../data/results/faiss-mistral-7b-emb-large-summarize'
add_generated_field_to_directory(
    input_directory=input_directory,
    output_directory=output_directory,
    qe=query_engine_sum,
)

In [25]:
# Answer every test file with the default query engine and store the results.
output_directory = '../data/results/simple-vicuna-13b-16k-emb-large-default'
add_generated_field_to_directory(
    input_directory=input_directory,
    output_directory=output_directory,
    qe=query_engine_default,
)

In [26]:
# Answer every test file with the summarizing query engine and store the results.
output_directory = '../data/results/simple-vicuna-13b-16k-emb-large-summarize'
add_generated_field_to_directory(
    input_directory=input_directory,
    output_directory=output_directory,
    qe=query_engine_sum,
)

In [26]:
# Summarize-mode run. The engine was previously referenced as `query_engine`,
# a name no cell in this notebook defines; `query_engine_sum` is the
# tree_summarize engine created in the query-engine section.
output_directory = '../data/results/simple-llama2-13b-emb-large-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine_sum)

In [None]:
# Default-mode run. The engine was previously referenced as `query_engine_2`,
# a name no cell in this notebook defines; `query_engine_default` is the
# default engine created in the query-engine section.
output_directory = '../data/results/simple-llama2-13b-emb-large-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_default)

In [26]:
# Summarize-mode run. The engine was previously referenced as `query_engine`,
# a name no cell in this notebook defines; `query_engine_sum` is the
# tree_summarize engine created in the query-engine section.
output_directory = '../data/results/mistral-instruct-7b-emb-large-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine_sum)

In [27]:
# Default-mode run. The engine was previously referenced as `query_engine_2`,
# a name no cell in this notebook defines; `query_engine_default` is the
# default engine created in the query-engine section.
output_directory = '../data/results/mistral-instruct-7b-emb-large-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_default)

In [19]:
# Summarize-mode run. The engine was previously referenced as `query_engine`,
# a name no cell in this notebook defines; `query_engine_sum` is the
# tree_summarize engine created in the query-engine section.
output_directory = '../data/results/simple-orca2-13b-emb-large-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine_sum)

In [20]:
# Default-mode run. The engine was previously referenced as `query_engine_2`,
# a name no cell in this notebook defines; `query_engine_default` is the
# default engine created in the query-engine section.
output_directory = '../data/results/simple-orca2-13b-emb-large-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_default)