# Retrieval-Augmented Generation (RAG) with an LLM, implemented with the LlamaIndex framework

### Data preparation

Flatten all text files into the `raw` folder

In [9]:
import os
import shutil


def copy_text_files(source_dir, destination_dir):
    """Flatten every ``.txt``/``.md`` file under *source_dir* into *destination_dir*.

    Each copy is named ``raw_<parent>_<file>``, where ``<parent>`` is the name
    of the directory the file was found in, so files coming from different
    sub-directories are less likely to collide in the flat output folder.

    Args:
        source_dir: Root directory that is walked recursively.
        destination_dir: Directory receiving the copies; created if missing.
            It may live inside *source_dir*: it is excluded from the walk so
            that earlier copies are never copied again.
    """
    os.makedirs(destination_dir, exist_ok=True)
    destination_abs = os.path.abspath(destination_dir)

    for root, dirs, files in os.walk(source_dir):
        # Prune the destination in place so os.walk never descends into it;
        # otherwise its contents would be re-copied as "raw_raw_...".
        dirs[:] = [
            name for name in dirs
            if os.path.abspath(os.path.join(root, name)) != destination_abs
        ]

        # normpath drops a trailing separator, so a root such as "docs/" still
        # contributes "docs" instead of an empty string, on any platform.
        parent_name = os.path.basename(os.path.normpath(root))

        for file in files:
            # Only plain-text and Markdown files are collected.
            if file.endswith((".txt", ".md")):
                source_path = os.path.join(root, file)
                destination_path = os.path.join(
                    destination_dir, 'raw_' + parent_name + '_' + file
                )

                # copy2 also preserves file metadata such as timestamps.
                shutil.copy2(source_path, destination_path)
                print(f"Copied: {file}")

# Flatten the whole documents tree, then the edu-wiki subtree, into the shared raw folder
for source_directory in ('../data/documents/', '../data/documents/edu-wiki'):
    copy_text_files(source_directory, '../data/documents/raw')

Copied: 11. Volunteering & Innopoints.md
Copied: 2. Accommodation & Meal Plan.md
Copied: 5. How to Order References?.md
Copied: 12. Migration Control for International Students.md
Copied: 9.5. Exchange Opportunities.md
Copied: 13. Library.md
Copied: 1. Academic Plan.md
Copied: 9. Rules of registration for the military service:.md
Copied: 8. Scholarships & Financial Support.md
Copied: 10. Your Health.md
Copied: Extracurricular.md
Copied: Admin.md
Copied: Academic.md
Copied: Household.md
Copied: apply.innopolis.university_master.txt
Copied: apply.innopolis.university_grant.txt
Copied: innopolis.university_sveden_apply.txt
Copied: apply.innopolis.university_olymp-math.txt
Copied: apply.innopolis.ru_get-in.txt
Copied: apply.innopolis.university_olympiad-bonus.txt
Copied: apply.innopolis.university_faq.txt
Copied: career.innopolis.university.txt
Copied: itproductdevelopment.md
Copied: venturecapitalhacksfromzerotonegotiatinganinvestmentdeal.tex.md
Copied: introductiontoitentrepreneurship.te

Turn on logging

In [15]:
import logging
import sys


# Send log records of level WARNING and above to stdout so they show up in the cell output.
# basicConfig already attaches a stdout StreamHandler to the root logger; adding a
# second StreamHandler by hand made every record print twice, so it is not added here.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

## LlamaIndex

In [16]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
from llama_index.storage.storage_context import StorageContext

### Load documents

In [17]:
# Folder that the data-preparation step above copies the flattened text files into.
docs_dir = "../data/documents/raw"
# Load the contents of that folder as LlamaIndex documents.
documents = SimpleDirectoryReader(docs_dir).load_data()

In [18]:
# Directory handed to the persistent Chroma client (and reused for persist() below).
persist_dir = "../storage"
db = chromadb.PersistentClient(path=persist_dir)

# Fetch the "saira" collection, creating it on first use.
chroma_collection = db.get_or_create_collection("saira")

### Create storage context

In [19]:
from llama_index.vector_stores import ChromaVectorStore


# Wrap the Chroma collection in LlamaIndex's vector-store adapter.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [20]:
# Default storage context, with the Chroma-backed vector store plugged in.
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [21]:
# Persist the storage context into persist_dir (the same directory the Chroma client uses).
# NOTE(review): in file order this runs before any index is built — confirm this early persist is intended.
storage_context.persist(persist_dir=persist_dir)

### Create service context

In [84]:
from llama_index.llms import Ollama


# Ollama model tag used for generation; the commented-out lines are alternatives.
llm_model = "llama2:13b"
# llm_model = "mistral"
# llm_model = "yarn-mistral:7b-64k"
# llm_model = "orca2:13b"
# NOTE(review): presumably requires a running Ollama server with this model available — confirm.
llm = Ollama(model=llm_model)

In [72]:
from llama_index.embeddings import HuggingFaceEmbedding


# Hugging Face embedding model name; the commented-out lines are alternatives.
# (The e5-base assignment was immediately overwritten by the e5-small one, so it
# is kept as a commented alternative like the others.)
# emb_model = "Cohere/Cohere-embed-multilingual-v3.0"
# emb_model = "intfloat/multilingual-e5-large"
# emb_model = "intfloat/multilingual-e5-base"
emb_model = "intfloat/multilingual-e5-small"
embed_model = HuggingFaceEmbedding(model_name=emb_model)

In [8]:
from llama_index.node_parser.file import SimpleFileNodeParser
# from llama_index.node_parser import SentenceSplitter


# node_parser = SimpleFileNodeParser()
# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

In [9]:
from llama_index import PromptHelper


# prompt_helper = PromptHelper(
    # context_window=4096,
    # num_output=256,
    # chunk_overlap_ratio=0.1,
    # chunk_size_limit=None,
# )

In [88]:
# Bundle the LLM, the embedding model and the chunking settings; this context is
# passed to the index construction below.
# NOTE(review): embed_model is the string 'local', not the `embed_model` object built
# above from `emb_model` — confirm which embedding model is actually intended.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model='local',
    # node_parser=node_parser,
    # prompt_helper=prompt_helper,
    chunk_size=1024,
    chunk_overlap=20
)

### Create index

In [89]:
# Build the vector index from the loaded documents, using the Chroma-backed
# storage context and the service context configured above.
index = VectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
    service_context=service_context,
)

In [26]:
# Load an index back from the storage context instead of rebuilding it from documents.
# NOTE(review): index_load is not used by any cell visible here; the queries below use `index`.
index_load = load_index_from_storage(storage_context, service_context=service_context)

### Create query engine (retriever)

In [90]:
# Non-streaming query engine: top-10 similarity retrieval, "tree_summarize" response mode.
query_engine = index.as_query_engine(
    similarity_top_k=10,
    response_mode="tree_summarize",
    streaming=False,
)

In [91]:
# Ask the non-streaming engine a question and print the full answer.
response = query_engine.query("What types of references do exist?")
print(response)

Based on the provided context information, there are two types of references that exist in academic writing: APA (American Psychological Association) and IEEE (Institute of Electrical and Electronics Engineers). These referencing styles have specific requirements for citing sources in a research paper. Additionally, there are resources available to help students avoid plagiarism, such as guidelines on paraphrasing, summarizing, quotations, and reference lists. The course covers these topics in detail, along with examples of exercises and tutorials to help students practice and improve their writing skills.


In [92]:
# Second sample question against the same non-streaming engine.
response = query_engine.query("What are innopoints?")
print(response)

Innopoints is a motivation and reward system designed to recognize students for their outstanding contribution to extracurricular activities at Innopolis University. Students can earn innopoints by volunteering, managing student clubs, and participating in other extracurricular events. These points can be exchanged for branded merchandise, monthly accommodation, and meals at the InnoStore. To volunteer and earn innopoints, students must log in to ipts.innopolis.university using their inno email, agree with @VOSpiridonova to create their own project, and close their event on the website after it is complete.


In [93]:
# Second query engine on the same index, with streaming enabled and all other settings left at their defaults.
query_engine_2 = index.as_query_engine(streaming=True)

In [94]:
# Same question as above, this time via the streaming engine.
response = query_engine_2.query("What are innopoints?")
# Print the answer through the streaming response's own print helper.
response.print_response_stream()


Innopoints are a motivation and reward system designed to encourage student participation in extracurricular activities at IU. They are awarded for various contributions such as volunteering and managing student clubs, and can be exchanged for branded merchandise, monthly accommodation and meals.

In [95]:
# Another sample question for the streaming engine.
response = query_engine_2.query("How to create a student club?")
response.print_response_stream()

You can create a student club at campuslife.innopolis.ru/clubs by clicking on "Create my Club" button. For questions please contact @VOSpiridonova.

# Evaluation