# Build the Vector DB

In [None]:
# Replace locale.getpreferredencoding with a stub that always reports "UTF-8".
# NOTE(review): presumably a workaround for `!` shell commands failing in
# Colab when the detected locale is not UTF-8 — confirm it is still required.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
!pip install accelerate langchain langchain_community unstructured sentence-transformers chromadb gradio

In [None]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil
import torch

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Directory where the Chroma vector store is persisted, and the directory
# that holds the raw text files to index.
CHROMA_PATH = "./chroma"
DATA_PATH = "./datasets"

# Sentence-embedding model and the options forwarded to it: run on `device`
# and do not ask the encoder for an extra normalization pass.
embeding_model_name = "sentence-transformers/LaBSE"
embeding_model_kwargs = {'device': device}
embeding_encode_kwargs = {'normalize_embeddings': False}

# Embedding callable shared by the indexing functions below.
embedding_function = HuggingFaceEmbeddings(model_name=embeding_model_name,model_kwargs=embeding_model_kwargs,encode_kwargs=embeding_encode_kwargs)

In [None]:
def generate_data_store():
    """Run the whole indexing pipeline: load, split, then store in Chroma."""
    save_to_chroma(split_text(load_documents()))


def load_documents():
    """Load the files in ``DATA_PATH`` that match the ``*.txt`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for those files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.txt").load()


def split_text(documents: list[Document]):
    """Cut ``documents`` into overlapping chunks and print the counts.

    Chunks are at most 300 characters (measured with ``len``), overlap by
    100 characters, and record their start index in the metadata.

    Returns:
        The list of chunk documents produced by the splitter.
    """
    splitter_options = {
        "chunk_size": 300,
        "chunk_overlap": 100,
        "length_function": len,
        "add_start_index": True,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(pieces)} chunks.")
    return pieces


def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store at ``CHROMA_PATH`` from ``chunks``.

    Any existing store directory is deleted first, so the result contains
    only the chunks passed in. The new store is persisted to disk and a
    summary line is printed.
    """
    # Start from a clean directory so old entries never mix with new ones.
    stale_store_present = os.path.exists(CHROMA_PATH)
    if stale_store_present:
        shutil.rmtree(CHROMA_PATH)

    vector_store = Chroma.from_documents(
        chunks, embedding_function, persist_directory=CHROMA_PATH
    )
    vector_store.persist()

    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")

In [None]:
generate_data_store()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# TODO: back up the SQLite database

# Load the Vector DB

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# vector database for business law
!cp -r /content/drive/MyDrive/IRI_LAW/chroma ./chroma

In [None]:
# Directory holding the persisted Chroma store (copied from Drive above).
CHROMA_PATH = "./chroma"

# Same embedding model and options as in the "Build the Vector DB" section.
# NOTE(review): `device`, `HuggingFaceEmbeddings` and `Chroma` are defined in
# the build section's cells; those cells must have run before this one.
embeding_model_name = "sentence-transformers/LaBSE"
embeding_model_kwargs = {'device': device}
embeding_encode_kwargs = {'normalize_embeddings': False}

embedding_function = HuggingFaceEmbeddings(model_name=embeding_model_name,model_kwargs=embeding_model_kwargs,encode_kwargs=embeding_encode_kwargs)
# Open the existing store; `db` is the handle queried by find_relevant_results.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [None]:
def find_relevant_results(query, k=3):
    """Return up to ``k`` scored matches for ``query`` from the vector store.

    Delegates to ``db.similarity_search_with_relevance_scores``; callers in
    this notebook unpack each result as a ``(document, score)`` pair.
    """
    return db.similarity_search_with_relevance_scores(query, k=k)

# Run the LLM Model

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name_or_id = "MaralGPT/Maral-7B-alpha-1"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name_or_id)

tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
# Load the causal language model with bfloat16 weights; device_map="auto"
# leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_name_or_id, torch_dtype=torch.bfloat16, device_map="auto")

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



In [None]:
# Decoding settings passed to model.generate() in process_query.
# NOTE(review): top_k=1 keeps only the single most likely token at each step,
# so do_sample=True behaves like greedy decoding here and temperature has
# little practical effect — confirm this combination is intended.
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.5,
    max_new_tokens=300,  # upper bound on the length of each answer
    pad_token_id=tokenizer.eos_token_id  # reuse the EOS id for padding
)

In [None]:
# Persian prompt with two placeholders, {context} and {question}.
# English gloss of the instructions:
#   "Answer the question based only on the text below:" ... {context} ...
#   "Given the text above, answer the question:" ... {question}
PROMPT_TEMPLATE = """
فقط بر اساس متن زیر به سوال پاسخ دهید:

{context}

---

با توجه به متن بالا به سوال پاسخ دهید:
{question}

Answer:
"""

In [None]:
from langchain.prompts import ChatPromptTemplate
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)

In [None]:
def process_query(query):
    """Answer ``query`` from the knowledge base using retrieval + generation.

    The question is matched against the vector store. If nothing is found,
    or the first (best) match scores below 0.25, a fixed Persian
    "no relevant information found" message is returned without calling the
    language model. Otherwise the retrieved passages are placed into the
    prompt template and the model generates an answer.

    Args:
        query: The user's question as plain text.

    Returns:
        The generated answer text only (the prompt is not echoed back), or
        the fixed fallback message when retrieval is empty or too weak.
    """
    results = find_relevant_results(query)
    # Each result is a (document, relevance score) pair; refuse to answer
    # when there is no match or the first match is below the threshold.
    if not results or results[0][1] < 0.25:
        return "اطلاعاتی که مرتبط با سوال شما باشد را در پایگاه دانش خود پیدا نکردم!"

    context = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    prompt = prompt_template.format(context=context, question=query)

    # Move the inputs to wherever the model actually lives rather than a
    # hard-coded "cuda", so this also works on CPU-only runtimes.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, generation_config=generation_config)

    # For a causal LM the returned sequence starts with the prompt tokens;
    # drop them so only the newly generated answer is decoded.
    prompt_length = inputs["input_ids"].shape[-1]
    answer_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(answer_tokens, skip_special_tokens=True)

# Serve in Gradio

In [None]:
import gradio as gr

def run_query(query):
    """Gradio click handler: pass the question text to ``process_query``."""
    answer = process_query(query)
    return answer

# Minimal UI: a question box, an answer box, and a button that runs the query.
# Both text boxes are right-to-left (rtl=True).
with gr.Blocks() as demo:
    query = gr.Textbox(label="Question", rtl=True, lines=5)
    output = gr.Textbox(label="Answer", rtl=True, lines=10)
    greet_btn = gr.Button("Query")
    # Clicking the button sends the question text to run_query and writes the
    # returned string into the answer box.
    greet_btn.click(fn=run_query, inputs=query, outputs=output, api_name="run_query")

# Start the app.
demo.launch();

# Evaluation

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!cp -r /content/drive/MyDrive/IRI_LAW/evaluation ./evaluation

In [None]:
# TODO: evaluation on business law