# Implementation of a Retrieval-Augmented Generation (RAG) solution with an LLM, based on the LlamaIndex framework

### Data preparation

Flatten all text files into the `raw` folder.

In [112]:
import os
import shutil


def copy_text_files(source_dir, destination_dir):
    """Copy every ``.txt``/``.md`` file under *source_dir* into one flat folder.

    The tree rooted at ``source_dir`` is walked recursively. Each matching
    file is copied with ``shutil.copy2`` to ``destination_dir`` under the name
    ``raw_<containing folder name>_<file name>``, and a ``Copied: <file>``
    line is printed for it.

    Args:
        source_dir: Root directory to scan (a trailing separator is allowed).
        destination_dir: Flat output directory; created if it does not exist.
    """
    os.makedirs(destination_dir, exist_ok=True)
    destination_abs = os.path.abspath(destination_dir)

    for root, dirs, files in os.walk(source_dir):
        # Do not descend into the output folder if it lives under source_dir,
        # otherwise freshly written copies would be copied again.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != destination_abs
        ]

        # normpath strips a trailing separator, so "docs/" still yields "docs"
        # instead of an empty folder name, and basename honours the platform's
        # path separator.
        folder_name = os.path.basename(os.path.normpath(root))

        for file in files:
            if not file.endswith((".txt", ".md")):
                continue

            source_path = os.path.join(root, file)
            destination_path = os.path.join(
                destination_dir, f"raw_{folder_name}_{file}"
            )

            # copy2 also carries over file metadata such as timestamps.
            shutil.copy2(source_path, destination_path)
            print(f"Copied: {file}")

# Flatten the document trees into ../data/raw; adjust the paths to your layout.
# NOTE(review): copy_text_files walks its source recursively, so if edu-wiki
# is a sub-folder of ../data/documents/ (as the path suggests) the second
# source only re-copies files the first one already handled - confirm.
for source_directory in ('../data/documents/', '../data/documents/edu-wiki'):
    copy_text_files(source_directory, '../data/raw')

Copied: 11. Volunteering & Innopoints.md
Copied: 2. Accommodation & Meal Plan.md
Copied: 5. How to Order References?.md
Copied: 12. Migration Control for International Students.md
Copied: 9.5. Exchange Opportunities.md
Copied: 13. Library.md
Copied: 1. Academic Plan.md
Copied: 9. Rules of registration for the military service:.md
Copied: 8. Scholarships & Financial Support.md
Copied: 10. Your Health.md
Copied: Extracurricular.md
Copied: Admin.md
Copied: Academic.md
Copied: Household.md
Copied: apply.innopolis.university_master.txt
Copied: apply.innopolis.university_grant.txt
Copied: innopolis.university_sveden_apply.txt
Copied: apply.innopolis.university_olymp-math.txt
Copied: apply.innopolis.ru_get-in.txt
Copied: apply.innopolis.university_olympiad-bonus.txt
Copied: apply.innopolis.university_faq.txt
Copied: career.innopolis.university.txt
Copied: itproductdevelopment.md
Copied: venturecapitalhacksfromzerotonegotiatinganinvestmentdeal.tex.md
Copied: introductiontoitentrepreneurship.te

Turn on logging

In [1]:
import logging
import sys


# Send log records of level WARNING and above to stdout so they appear in the
# notebook cell output. basicConfig already attaches a stdout handler to the
# root logger; registering a second StreamHandler on the same stream would
# print every record twice, so only this single call is kept.
logging.basicConfig(stream=sys.stdout, level=logging.WARNING)

## LlamaIndex

In [2]:
import chromadb
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, StorageContext, load_index_from_storage
from llama_index.storage.storage_context import StorageContext

### Load documents

In [3]:
# Folder with the flattened corpus; every file in it is loaded as documents.
docs_dir = "../data/raw"
documents = SimpleDirectoryReader(input_dir=docs_dir).load_data()

### Create storage context (for ChromaDB)

In [4]:
# Directory handed to the persistent Chroma client below.
persist_dir = "../storage"
db = chromadb.PersistentClient(path=persist_dir)

# Reuse the "saira" collection if it already exists, otherwise create it.
chroma_collection = db.get_or_create_collection("saira")

In [5]:
from llama_index.vector_stores import ChromaVectorStore


# Wrap the Chroma collection in LlamaIndex's vector-store adapter.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [6]:
# Storage context that uses the Chroma-backed vector store; all other stores
# are left at the library defaults.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [7]:
# Persist the storage context into persist_dir, i.e. the same directory the
# Chroma client was opened on.
storage_context.persist(persist_dir=persist_dir)

### Create service context

In [5]:
from llama_index.llms import Ollama


# Ollama model tag used for answer generation; the commented-out lines are
# alternative tags that can be swapped in.
# llm_model = "llama2:13b"
llm_model = "mistral"
# llm_model = "orca2:13b"
llm = Ollama(model=llm_model)

In [9]:
from llama_index.embeddings import HuggingFaceEmbedding


# emb_model = "intfloat/multilingual-e5-large"
# emb_model = "intfloat/multilingual-e5-base"
# emb_model = "intfloat/multilingual-e5-small"
# embed_model = HuggingFaceEmbedding(model_name=emb_model)

In [10]:
from llama_index.node_parser.file import SimpleFileNodeParser
# from llama_index.node_parser import SentenceSplitter


# node_parser = SimpleFileNodeParser()
# node_parser = SentenceSplitter(chunk_size=1024, chunk_overlap=20)

In [11]:
from llama_index import PromptHelper


# prompt_helper = PromptHelper(
    # context_window=4096,
    # num_output=256,
    # chunk_overlap_ratio=0.1,
    # chunk_size_limit=None,
# )

In [16]:
# Bundle the LLM, the embedding model and the chunking settings that the
# index and query engines below share.
# NOTE(review): the 'local:' prefix presumably selects a locally run
# HuggingFace model (the cell output shows model files being downloaded) -
# confirm against the LlamaIndex docs.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model='local:BAAI/bge-large-en-v1.5',
    # Alternative components, kept for experiments:
    # embed_model=embed_model,
    # node_parser=node_parser,
    # prompt_helper=prompt_helper,
    chunk_size=1024,
    chunk_overlap=20
)

Downloading config.json:   0%|          | 0.00/779 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

### Create index

In [17]:
# Build the vector index over the loaded documents with the shared
# service_context.
# NOTE(review): storage_context is commented out, so the Chroma-backed
# storage context is not passed to the index here - confirm that this is
# intended.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
    # storage_context=storage_context
)

In [26]:
# index_load = load_index_from_storage(storage_context, service_context=service_context)

### Create query engine (retriever)

In [18]:
# Query engine using the 10 most similar chunks, the "tree_summarize"
# response mode and non-streaming output.
query_engine = index.as_query_engine(similarity_top_k=10, response_mode="tree_summarize", streaming=False)

In [19]:
# Second query engine built with the library's default settings, used for
# comparison with query_engine.
query_engine_2 = index.as_query_engine()

In [None]:
from llama_index import (
    VectorStoreIndex,
    get_response_synthesizer,
)
from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.postprocessor import KeywordNodePostprocessor

# Retriever that pulls the 10 most similar nodes from the index.
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# Keyword filter applied to the retrieved nodes before answer synthesis.
# NOTE(review): required_keywords=["Combinator"] restricts results to nodes
# mentioning that word, and the excluded phrases look like LLM answer
# boilerplate rather than document text - confirm both lists are intended.
node_postprocessors = [
    KeywordNodePostprocessor(
        required_keywords=["Combinator"],
        # The former empty-string entry was removed: an empty keyword cannot
        # express a meaningful filter.
        exclude_keywords=[
            "Based on the information provided",
            "Without any prior knowledge",
            "Based on the information provided from the multiple sources",
        ],
    )
]

# Pass service_context explicitly so this engine answers with the same LLM
# and embedding configuration as the other query engines rather than the
# library defaults.
query_engine_post = RetrieverQueryEngine.from_args(
    retriever,
    service_context=service_context,
    node_postprocessors=node_postprocessors,
)

## Retrieve

In [20]:
# Smoke test of query_engine; prints only the answer text of the response.
response = query_engine.query("How to create a student club?")
print(response.response)


To create a student club at Innopolis University, you can follow these steps:

1. Visit the campuslife.innopolis.ru/clubs website and click on the "Create my Club" button.
2. Fill out the necessary information for your club, such as the name, description, and purpose. You will also need to select a category for your club, such as sports, technology, or music.
3. Once you have created your club, you can start organizing activities and events for members. You can use resources provided by the university, such as audience and basic inventory/equipment, to help with your club's activities.
4. Your club can also participate in the Club League, where students earn points for various criteria, such as participation, activity, and success of events. The top 20 clubs at the end of each semester receive a budget, which they can use on relevant purchases or services. Additionally, leaders of the top 20 clubs can claim points for a Higher Scholarship every semester and innopoints in the end of ac

In [21]:
# Same engine (query_engine), different question; prints the response object.
response = query_engine.query("What are Innopoints?")
print(response)

Innopolis University is an educational institution located in Innopolis, a city in the south of Russia. Innopolis is known for its advanced technological infrastructure and is often referred to as "Russia's Silicon Valley."

Innopolis University offers undergraduate and graduate programs in various fields such as computer science, engineering, business, and social sciences. The university also has a strong focus on research and innovation, with many collaborations and partnerships with international organizations and companies.

In Innopolis University, students are evaluated based on their academic performance, which is measured by Innopoints. Innopoints are awarded to students for their achievements in various areas such as assignments, exams, tests, and homeworks. The Innopoint system is designed to provide a fair and objective evaluation of students' performance, and it is used to determine the overall grade for each course.

Innopolis University also has a strict policy against ac

In [22]:
# Ask the default-settings engine (query_engine_2) about innopoints, to
# compare with the answer from query_engine.
response = query_engine_2.query("What are innopoints?")
print(response)


Innopoints are a motivation and reward system for students who actively participate in extracurricular activities at IU. They are awarded for various contributions such as volunteering and managing student clubs. Innopoints can be exchanged for branded merchandise, canteen services, and accommodation in IU dorms. The Innopoints system is designed to reward students for their outstanding contribution to extracurricular life at IU.


In [23]:
# Ask the default-settings engine (query_engine_2) the student-club question,
# to compare with the answer from query_engine.
response = query_engine_2.query("How to create a student club?")
print(response)

To create a student club at Innopolis University, you can follow these steps:

1. Visit the campuslife.innopolis.ru/clubs page and click on the "Create my Club" button.
2. Provide the necessary information about your club, such as its name, purpose, goals, and activities. You will also need to specify which faculty or department the club belongs to, and whether you want it to be open to all students or only those from specific faculties.
3. Once you have provided all the necessary information, submit your application. If your club is approved, you can start recruiting members and organizing activities.
4. For any questions or concerns, you can contact Evgeniia Dyuzhakova at @janedyuzha_dyuzha. She will be happy to help you with any aspect of creating a student club at Innopolis University.


# Evaluation

In [24]:
import os
import json

def generate_answer(question, qe):
    """Run *question* through the query engine *qe* and return the answer text.

    Args:
        question: Question string forwarded to ``qe.query``.
        qe: Object exposing ``query(question)`` whose result carries a
            ``response`` attribute.

    Returns:
        The ``response`` attribute of the query result.
    """
    return qe.query(question).response


def add_generated_field(json_data, qe):
    """Attach a generated answer to every record in *json_data*.

    Each record's ``'question'`` value is answered via ``generate_answer`` and
    the result is stored under ``'generated_answer'``. The records are
    modified in place.

    Args:
        json_data: Iterable of dict-like records with a ``'question'`` key.
        qe: Query engine passed through to ``generate_answer``.

    Returns:
        The same ``json_data`` object, now carrying the generated answers.
    """
    for record in json_data:
        record['generated_answer'] = generate_answer(record['question'], qe)
    return json_data

def add_generated_field_to_directory(input_directory, output_directory, qe):
    """Answer the questions of every JSON file in a folder and save the results.

    For each ``*.json`` file in ``input_directory`` the records are loaded,
    enriched through ``add_generated_field`` and written, under the same file
    name, to ``output_directory``. Files that already exist in the output
    folder are skipped, so an interrupted run can be resumed.

    Args:
        input_directory: Folder containing the input JSON files.
        output_directory: Folder for the enriched files; created if missing.
        qe: Query engine passed through to ``add_generated_field``.
    """
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so that the processing order does not depend on the file system.
    for filename in sorted(os.listdir(input_directory)):
        if not filename.endswith(".json"):
            continue

        input_filepath = os.path.join(input_directory, filename)
        output_filepath = os.path.join(output_directory, filename)

        # Already produced by an earlier run - do not query the model again.
        if os.path.isfile(output_filepath):
            continue

        # JSON is read as UTF-8 explicitly instead of relying on the
        # platform's default encoding.
        with open(input_filepath, 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        json_data_with_generated = add_generated_field(json_data, qe)

        # Write to a temporary name and rename afterwards, so a failure while
        # writing cannot leave a partial file that the skip check above would
        # later treat as a finished result.
        temp_filepath = output_filepath + '.tmp'
        with open(temp_filepath, 'w', encoding='utf-8') as output_file:
            json.dump(json_data_with_generated, output_file, indent=2)
        os.replace(temp_filepath, output_filepath)

In [25]:
# Folder with the JSON question files used by the evaluation runs below.
input_directory = '../data/test'

In [18]:
# Evaluation run with query_engine (tree_summarize); files already present in
# the output folder are skipped by add_generated_field_to_directory.
# NOTE(review): the folder name mentions llama2-13b / emb-base while the file
# currently sets llm_model = "mistral" and a bge-large embedding - presumably
# left from an earlier configuration; confirm before re-running.
output_directory = '../data/results/simple-llama2-13b-emb-base-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine)

In [19]:
# Evaluation run with query_engine_2 (default settings); files already
# present in the output folder are skipped.
# NOTE(review): the folder name mentions llama2-13b / emb-base while the file
# currently sets llm_model = "mistral" and a bge-large embedding - presumably
# left from an earlier configuration; confirm before re-running.
output_directory = '../data/results/simple-llama2-13b-emb-base-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_2)

In [26]:
# Evaluation run with query_engine (tree_summarize) for the mistral LLM and
# the large embedding model; files already present in the output folder are
# skipped.
output_directory = '../data/results/mistral-instruct-7b-emb-large-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine)

In [27]:
# Evaluation run with query_engine_2 (default settings) for the mistral LLM
# and the large embedding model; files already present in the output folder
# are skipped.
output_directory = '../data/results/mistral-instruct-7b-emb-large-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_2)

In [28]:
# Evaluation run with query_engine (tree_summarize); files already present in
# the output folder are skipped.
# NOTE(review): the folder name mentions orca2-13b / emb-base while the file
# currently sets llm_model = "mistral" and a bge-large embedding - switch the
# configuration (or the folder name) before re-running; confirm.
output_directory = '../data/results/simple-orca2-13b-emb-base-summarize'
add_generated_field_to_directory(input_directory, output_directory, query_engine)

In [27]:
# Evaluation run with query_engine_2 (default settings); files already
# present in the output folder are skipped.
# NOTE(review): the folder name mentions orca2-13b / emb-base while the file
# currently sets llm_model = "mistral" and a bge-large embedding - switch the
# configuration (or the folder name) before re-running; confirm.
output_directory = '../data/results/simple-orca2-13b-emb-base-default'
add_generated_field_to_directory(input_directory, output_directory, query_engine_2)