# Implementation of a solution using Retrieval-Augmented Generation (RAG) with an LLM, based on the LlamaIndex framework

### Data preparation

Flatten all text and Markdown files from the documents tree into a single `raw` folder.

In [112]:
import os
import shutil


def copy_text_files(source_dir, destination_dir, extensions=(".txt", ".md")):
    """Flatten text files from a directory tree into a single folder.

    Walks ``source_dir`` recursively and copies every file whose name ends
    with one of ``extensions`` into ``destination_dir``. Each copy is named
    ``raw_<parent-folder>_<file-name>`` so that equally named files living in
    different folders do not overwrite each other (folders that share the
    same base name can still collide).

    Args:
        source_dir: Root of the directory tree to scan.
        destination_dir: Flat output folder; created if it does not exist.
        extensions: File-name suffix, or iterable of suffixes, to copy.
            Defaults to ``.txt`` and ``.md``.

    Returns:
        None. Prints one ``Copied: <file>`` line per copied file.
    """
    os.makedirs(destination_dir, exist_ok=True)

    # A bare string would otherwise be split into single characters by tuple().
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(extensions)

    destination_abs = os.path.abspath(destination_dir)

    for root, dirs, files in os.walk(source_dir):
        # Do not descend into the destination when it lives inside the source
        # tree; otherwise a re-run would copy the already-flattened files again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != destination_abs
        ]

        # normpath drops a trailing separator, so 'docs/' yields 'docs' rather
        # than the empty string that splitting on '/' would produce; basename
        # also handles the platform's own path separator.
        parent_name = os.path.basename(os.path.normpath(root))

        for file in files:
            if not file.endswith(suffixes):
                continue

            source_path = os.path.join(root, file)
            destination_path = os.path.join(
                destination_dir, 'raw_' + parent_name + '_' + file
            )

            # copy2 preserves file metadata (e.g. modification time) as well.
            shutil.copy2(source_path, destination_path)
            print(f"Copied: {file}")

# Flatten the documents tree into ../data/raw (paths are relative to the notebook).
# NOTE(review): copy_text_files walks its source recursively via os.walk, so the
# second call appears to re-copy files the first call already handled — confirm
# whether it is still needed.
copy_text_files('../data/documents/', '../data/raw')
copy_text_files('../data/documents/edu-wiki', '../data/raw')

Copied: 11. Volunteering & Innopoints.md
Copied: 2. Accommodation & Meal Plan.md
Copied: 5. How to Order References?.md
Copied: 12. Migration Control for International Students.md
Copied: 9.5. Exchange Opportunities.md
Copied: 13. Library.md
Copied: 1. Academic Plan.md
Copied: 9. Rules of registration for the military service:.md
Copied: 8. Scholarships & Financial Support.md
Copied: 10. Your Health.md
Copied: Extracurricular.md
Copied: Admin.md
Copied: Academic.md
Copied: Household.md
Copied: apply.innopolis.university_master.txt
Copied: apply.innopolis.university_grant.txt
Copied: innopolis.university_sveden_apply.txt
Copied: apply.innopolis.university_olymp-math.txt
Copied: apply.innopolis.ru_get-in.txt
Copied: apply.innopolis.university_olympiad-bonus.txt
Copied: apply.innopolis.university_faq.txt
Copied: career.innopolis.university.txt
Copied: itproductdevelopment.md
Copied: venturecapitalhacksfromzerotonegotiatinganinvestmentdeal.tex.md
Copied: introductiontoitentrepreneurship.te

Turn on logging

In [2]:
import logging
import sys


# Send WARNING-and-above log records to stdout so they show up in cell output.
# basicConfig already attaches a StreamHandler for the given stream to the root
# logger; adding a second stdout handler on top of it would print every record
# twice, so no extra handler is registered here.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

## LlamaIndex

In [1]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
from llama_index.storage.storage_context import StorageContext

### Load documents

In [3]:
# Folder that the data-preparation step fills with the flattened text files.
docs_dir = "../data/raw"
# Read every file found in docs_dir into the `documents` collection that is
# indexed further below.
documents = SimpleDirectoryReader(docs_dir).load_data()

In [4]:
# On-disk location used by the Chroma client (and reused later for
# storage_context.persist).
persist_dir = "../storage"
db = chromadb.PersistentClient(path=persist_dir)

# Reuse the "saira" collection when it already exists, otherwise create it.
chroma_collection = db.get_or_create_collection("saira")

### Create storage context

In [5]:
from llama_index.vector_stores import ChromaVectorStore


# Wrap the Chroma collection so LlamaIndex can use it as its vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [6]:
# Storage context backed by the Chroma vector store; the remaining stores use
# the library defaults.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [7]:
# Write the storage context to persist_dir.
# NOTE(review): this cell sits before the index is built, so on a top-to-bottom
# run it presumably persists stores that are still empty — confirm whether it
# should run after index creation instead.
storage_context.persist(persist_dir=persist_dir)

### Create service context

In [13]:
from llama_index.llms import Ollama


# Model name handed to the Ollama LLM wrapper. The commented-out lines are the
# alternatives that were tried; enable exactly one assignment.
# llm_model = "llama2:13b"
llm_model = "mistral"
# llm_model = "yarn-mistral:7b-64k"
# llm_model = "orca2:13b"
llm = Ollama(model=llm_model)

In [9]:
from llama_index.embeddings import HuggingFaceEmbedding


# Hugging Face embedding model name; the commented-out lines are larger
# variants of the same multilingual-e5 family.
# NOTE(review): the service-context cell passes embed_model='local' and keeps
# `embed_model=embed_model` commented out, so this object is currently unused
# there — confirm which embedding model is intended.
# emb_model = "intfloat/multilingual-e5-large"
# emb_model = "intfloat/multilingual-e5-base"
emb_model = "intfloat/multilingual-e5-small"
embed_model = HuggingFaceEmbedding(model_name=emb_model)

[2023-11-24 22:12:33,443] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [8]:
from llama_index.node_parser.file import SimpleFileNodeParser
# from llama_index.node_parser import SentenceSplitter


# node_parser = SimpleFileNodeParser()
# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

In [9]:
from llama_index import PromptHelper


# prompt_helper = PromptHelper(
    # context_window=4096,
    # num_output=256,
    # chunk_overlap_ratio=0.1,
    # chunk_size_limit=None,
# )

In [15]:
# Bundle the LLM, the embedding model and the chunking settings; this context
# is passed to index creation and index loading below.
# NOTE(review): embed_model='local' is used here, so the `embed_model` object
# built in the embeddings cell only takes effect if the commented line below is
# enabled instead — confirm which one is intended.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model='local',
    # embed_model=embed_model,
    # node_parser=node_parser,
    # prompt_helper=prompt_helper,
    chunk_size=1024,
    chunk_overlap=20
)

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


### Create index

In [12]:
# Build the vector index from the loaded documents, using the configured
# service context and the Chroma-backed storage context.
index = VectorStoreIndex.from_documents(
    documents, service_context=service_context, storage_context=storage_context
)

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [682,

RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling cublasLtMatmul with transpose_mat1 1 transpose_mat2 0 m 1024 n 5140 k 1024 mat1_ld 1024 mat2_ld 1024 result_ld 1024 abcType 0 computeType 68 scaleType 0

In [26]:
# Load an index back from the storage context.
# NOTE(review): `index_load` is not referenced by any later cell in this
# notebook; the query engines below are built from `index`.
index_load = load_index_from_storage(storage_context, service_context=service_context)

### Create query engine (retriever)

In [11]:
# Query engine that retrieves the 10 most similar nodes and answers with the
# "tree_summarize" response mode, without streaming.
query_engine = index.as_query_engine(similarity_top_k=10, response_mode="tree_summarize", streaming=False)

In [35]:
# Baseline query engine using the library's default settings, for comparison
# with `query_engine`.
query_engine_2 = index.as_query_engine()

In [None]:
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import KeywordNodePostprocessor

# Experimental engine: retrieve the 10 most similar nodes, then filter them by
# keyword before the answer is synthesized.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)
node_postprocessors = [
    KeywordNodePostprocessor(
        # Only retrieved nodes that contain this keyword are kept, so queries
        # about unrelated topics will be left without any context.
        required_keywords=["Combinator"],
        # NOTE(review): these phrases look like LLM answer boilerplate, while
        # the postprocessor filters retrieved source nodes rather than the
        # generated answer — confirm they are useful here. The former empty
        # string entry was dropped because it cannot match anything meaningful.
        exclude_keywords=[
            "Based on the information provided",
            "Without any prior knowledge",
            "Based on the information provided from the multiple sources",
        ],
    )
]

# Bound to its own name: rebinding `query_engine` here would silently replace
# the tree_summarize engine that the retrieval and evaluation cells rely on
# when the notebook is run top to bottom.
# The service context is passed explicitly so that this engine uses the same
# LLM and embedding settings as the engines created via index.as_query_engine.
keyword_query_engine = RetrieverQueryEngine.from_args(
    retriever,
    service_context=service_context,
    node_postprocessors=node_postprocessors,
)

## Retrieve

In [25]:
# Ask `query_engine` a sample question and print only the answer text.
# NOTE(review): the saved output mentions a missing `.text` attribute, which this
# code does not access — the output looks stale and should be regenerated.
response = query_engine.query("How to create a student club?")
print(response.response)

AttributeError: 'Response' object has no attribute 'text'

In [15]:
# Same engine, different sample question; printing the response object shows
# its textual form.
response = query_engine.query("What are Innopoints?")
print(response)

 Based on the information provided from multiple sources, there is no mention of "Innopoints" in any of the documents. Therefore, I cannot provide an answer to the query.


In [20]:
# Repeat the Innopoints question with the default-settings engine for comparison.
response = query_engine_2.query("What are innopoints?")
print(response)

 Based on the provided context, I'm afraid there is no mention of "innopoints" in any of the topics covered. The given context only discusses geometric shapes such as circles, ellipses, hyperbolas, parabolas, and vector equations related to these shapes. 

Therefore, it appears that "innopoints" is not a recognized term or concept in analytic geometry or linear algebra.

In [18]:
# Repeat the student-club question with the default-settings engine for comparison.
response = query_engine_2.query("How to create a student club?")
print(response)

  Sure! Based on the given context information, here is the answer to the query "How to create a student club?":

To create a student club, you can click on the "Create my Club" button at campuslife.innopolis.ru/clubs. For questions or assistance, you can contact Evgeniia Dyuzhakova, the Student Clubs Advisor, by writing to her via email or social media (handle: @janedyuzha_dyuzha).

Please note that creating a student club provides numerous benefits, such as self-awareness, soft skill development, networking opportunities, practical experience, and the ability to engage with diverse groups of people. Additionally, joining or starting a student club can be an enjoyable experience that contributes to university development and enhances your CV.

# Evaluation

In [31]:
import os
import json

def generate_answer(question, qe):
    """Run ``question`` through the query engine and return the answer text.

    Args:
        question: The question to send to the engine.
        qe: Object exposing ``query(question)`` whose result has a
            ``response`` attribute.

    Returns:
        The ``response`` attribute of the object returned by ``qe.query``.
    """
    return qe.query(question).response


def add_generated_field(json_data, qe):
    """Attach a generated answer to every record, in place.

    Args:
        json_data: Iterable of dict-like records, each with a ``'question'`` key.
        qe: Query engine forwarded to ``generate_answer``.

    Returns:
        The same ``json_data`` object, whose records now also carry a
        ``'generated_answer'`` key.
    """
    for record in json_data:
        # The right-hand side is evaluated first, so the question is read and
        # answered before the record is modified.
        record['generated_answer'] = generate_answer(record['question'], qe)
    return json_data

def add_generated_field_to_directory(input_directory, output_directory, qe):
    """Generate answers for every JSON question file in a directory.

    Each ``*.json`` file in ``input_directory`` is loaded, passed through
    ``add_generated_field`` and written under the same file name into
    ``output_directory``.

    Args:
        input_directory: Folder containing the JSON files with the questions.
        output_directory: Folder for the enriched copies; created if missing.
        qe: Query engine forwarded to ``add_generated_field``.

    Returns:
        None. The results are written to disk.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so that files are processed in the same order on every run;
    # os.listdir itself returns entries in arbitrary order.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".json"):
            continue

        input_filepath = os.path.join(input_directory, filename)
        output_filepath = os.path.join(output_directory, filename)

        # Explicit UTF-8: without it the platform's default encoding is used,
        # which can fail on or garble non-ASCII text on some systems.
        with open(input_filepath, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        # Add the 'generated_answer' field using the query engine.
        json_data_with_generated = add_generated_field(json_data, qe)

        with open(output_filepath, 'w', encoding='utf-8') as output_file:
            json.dump(json_data_with_generated, output_file, indent=2)

In [32]:
# Folder with the JSON question files used for evaluation.
input_directory = '../data/test'

In [33]:
# Generate answers with `query_engine` and store them in their own results folder.
# NOTE(review): the folder name mentions llama2-13b and a "base" embedding model,
# while the configuration cells currently select "mistral" and embed_model='local'
# — confirm the label matches the run that produced these results.
output_directory = '../data/results/llama2-13b-emb-base-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine)

In [36]:
# Generate answers with the default-settings engine into a separate results folder.
# NOTE(review): as above, the folder label (llama2-13b, emb-base) does not match
# the models currently configured in this notebook — confirm before comparing runs.
output_directory = '../data/results/llama2-13b-emb-base-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_2)