In [21]:
import os
import textwrap
import time
from llm import GeminiLLM
from typing import List
from dotenv import load_dotenv
import re

class ArabicTextCorrector:
    """
    A class that provides functionality to correct Arabic text in chunks using an LLM model. 
    It handles reading, processing, splitting, and merging text files while allowing for corrections
    with retry logic in case of errors during the correction process.
    """

    def __init__(self, model: GeminiLLM):
        """
        Initializes the ArabicTextCorrector instance with a given model.

        Args:
            model (GeminiLLM): An instance of the GeminiLLM model used for text corrections.
        """
        self.model = model

    def configure(self):
        """Configures the model for use. (Stub for potential future use)"""
        pass

    def read_text_from_file(self, file_path: str) -> str:
        """
        Reads text from a file and returns it.

        Args:
            file_path (str): The path to the file to read from.

        Returns:
            str: The content of the file.
        """
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def save_text_to_file(self, text: str, file_path: str):
        """
        Saves a string of text to a file, ensuring the directory exists.

        Args:
            text (str): The text to save.
            file_path (str): The path to the file to save to.
        """
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(text)

    def split_text(self, text: str, chunk_size: int) -> List[str]:
        """
        Splits the text into chunks of a specified size.

        Args:
            text (str): The text to be split.
            chunk_size (int): The size of each chunk.

        Returns:
            List[str]: A list of text chunks.
        """
        return textwrap.wrap(text, chunk_size)

    def generate_correction_prompt(self, text: str, custom_prompt: str = None) -> str:
        """
        Generates a correction prompt for the model to process the text.

        Args:
            text (str): The text to be corrected.
            custom_prompt (str): An optional custom prompt to be used for correction.

        Returns:
            str: A prompt that will be passed to the model for text correction.
        """
        if custom_prompt:
            return f"{custom_prompt}\n{text}"
        return f"يرجى إعادة كتابة النص التالي بشكل صحيح دون أي ملاحظات أو توضيحات:\n{text}"

    def correct_text(self, text: str, custom_prompt: str = None, max_retries: int = 3) -> str:
        """
        Corrects the provided text using the model, with retry logic in case of failure.

        Args:
            text (str): The text to be corrected.
            custom_prompt (str): An optional custom prompt for the correction.
            max_retries (int): The maximum number of retry attempts in case of failure.

        Returns:
            str: The corrected text or an error message after the maximum retries.
        """
        prompt = self.generate_correction_prompt(text, custom_prompt)
        if not self.model:
            raise ValueError("Model is not configured. Please call 'configure' first.")
        attempts = 0
        while attempts < max_retries:
            try:
                response = self.model.generate_content(prompt)
                return response
            except Exception as e:
                attempts += 1
                print(f"Error occurred while generating content (attempt {attempts}/{max_retries}): {e}")
                if attempts == max_retries:
                    return f"\n\n[تنبيه: لم يتم معالجة هذا الجزء بسبب خطأ بعد {max_retries} محاولات]\n\n{text}"

    def process_file(self, input_file: str, output_file: str, output_folder: str, chunk_size: int = 1000, custom_prompt: str = None):
        """
        Processes the input file, splits it into chunks, corrects each chunk, and saves the output to a folder.

        Args:
            input_file (str): The path to the input text file.
            output_file (str): The path to the final output file.
            output_folder (str): The folder to save the corrected chunks.
            chunk_size (int): The size of each chunk in characters.
            custom_prompt (str): An optional custom prompt for text correction.
        """
        start_time = time.time()
        os.makedirs(output_folder, exist_ok=True)
        text = self.read_text_from_file(input_file)
        chunks = self.split_text(text, chunk_size)

        corrected_text = ""
        for i, chunk in enumerate(chunks):
            corrected_chunk = self.correct_text(chunk, custom_prompt)
            corrected_text += corrected_chunk
            output_path = os.path.join(output_folder, f'corrected_chunk_{i}.txt')
            self.save_text_to_file(corrected_chunk, output_path)
            print(f"Processed chunk {i + 1} of {len(chunks)}")

        end_time = time.time()
        runtime = end_time - start_time
        log_message = f"{os.path.basename(__file__)} --> time taken: {runtime:.2f} seconds --> start of the run: {time.ctime(start_time)}\n"
        self.log_runtime(log_message)

    def save_chunks_without_processing(self, input_file: str, chunk_numbers: List[int], output_folder: str, chunk_size: int = 1000):
        """
        Saves specified chunks of the input file without processing.

        Args:
            input_file (str): The path to the input text file.
            chunk_numbers (List[int]): A list of chunk numbers to save.
            output_folder (str): The folder to save the chunks.
            chunk_size (int): The size of each chunk in characters.
        """
        os.makedirs(output_folder, exist_ok=True)
        text = self.read_text_from_file(input_file)
        chunks = self.split_text(text, chunk_size)

        for index in chunk_numbers:
            if index < 1 or index > len(chunks):
                print(f"Chunk number {index} is out of range.")
                continue
            chunk = chunks[index - 1]
            output_path = os.path.join(output_folder, f'corrected_chunk_{index}.txt')
            self.save_text_to_file(chunk, output_path)
            print(f"Saved chunk {index} of {len(chunks)}")

    def log_runtime(self, log_message: str):
        """
        Logs the runtime information into a file.

        Args:
            log_message (str): The message to be logged.
        """
        with open("runtime.log", 'a', encoding='utf-8') as log_file:
            log_file.write(log_message)

    def merge_chunks(self, input_folder: str, output_file: str):
        """
        Merges all chunks in the input folder into a single file, sorted by chunk number.

        Args:
            input_folder (str): The folder containing chunk files.
            output_file (str): The file to save the merged output.
        """
        files = os.listdir(input_folder)
        sorted_files = sorted(files, key=lambda x: int(re.search(r'corrected_chunk_(\d+)\.txt', x).group(1)))

        with open(output_file, 'w', encoding='utf-8') as output:
            for file in sorted_files:
                with open(os.path.join(input_folder, file), 'r', encoding='utf-8') as input_file:
                    print(f"Merging {file}")
                    output.write(input_file.read())
                    output.write("\n\n")


# Arabic Text Corrector

This project contains a Python class, `ArabicTextCorrector`, that leverages a Language Model (LLM) to correct Arabic text in chunks. It processes input text files, splits them into chunks, corrects the text, and saves the results into multiple files. Additionally, it can merge the chunks into a single file in the correct order.

## Features

- Read text from files.
- Split text into chunks of a specified size.
- Correct Arabic text using a Language Model with retry logic in case of failures.
- Save corrected text chunks to separate files.
- Merge chunk files into a single file in the correct order.
- Log runtime information for the text correction process.
- Save specific chunks without processing.

## Requirements

- Python 3.7 or higher
- The following Python packages:
  - `llm` (for the GeminiLLM model)
  - `python-dotenv` (for managing environment variables)

### Installing Requirements

#### Install all dependencies at once:

To install all required packages at once, use the following command:

```bash
pip install -r requirements.txt


In [22]:
load_dotenv()  # Load environment variables
gemini_api_key = os.getenv("GEMINI_API_KEY")
# Instantiate and configure GeminiLLM
model = GeminiLLM(model_name='gemini-1.0-pro-latest')
model.configure(api_key=gemini_api_key)
corrector = ArabicTextCorrector(model)
corrector.configure()

In [None]:
# Processing the file with custom prompt and logging runtime
corrector.process_file(
    input_file="taw_hist.txt", 
    output_file="corrected_hist.txt", 
    output_folder="correct_hist_retry",
    chunk_size=1000
    )


In [None]:
# Saving specific chunks without processing

chunk_numbers = [10,94,95]

corrector.save_chunks_without_processing(input_file="taw_hist.txt", chunk_numbers=chunk_numbers, output_folder="correct_hist", chunk_size=1000)

In [23]:
#TODO: MERGE THE FILE AFTER MANUALLY EDITING UN-USABLE CHUNKS for both bio and hist

##manual fix is done so now we can merge the files

corrector.merge_chunks(input_folder="correct_bio", output_file="corrected_bio.txt")

Merging corrected_chunk_0.txt
Merging corrected_chunk_1.txt
Merging corrected_chunk_2.txt
Merging corrected_chunk_3.txt
Merging corrected_chunk_4.txt
Merging corrected_chunk_5.txt
Merging corrected_chunk_6.txt
Merging corrected_chunk_7.txt
Merging corrected_chunk_8.txt
Merging corrected_chunk_9.txt
Merging corrected_chunk_10.txt
Merging corrected_chunk_11.txt
Merging corrected_chunk_12.txt
Merging corrected_chunk_13.txt
Merging corrected_chunk_14.txt
Merging corrected_chunk_15.txt
Merging corrected_chunk_16.txt
Merging corrected_chunk_17.txt
Merging corrected_chunk_18.txt
Merging corrected_chunk_19.txt
Merging corrected_chunk_20.txt
Merging corrected_chunk_21.txt
Merging corrected_chunk_22.txt
Merging corrected_chunk_23.txt
Merging corrected_chunk_24.txt
Merging corrected_chunk_25.txt
Merging corrected_chunk_26.txt
Merging corrected_chunk_27.txt
Merging corrected_chunk_28.txt
Merging corrected_chunk_29.txt
Merging corrected_chunk_30.txt
Merging corrected_chunk_31.txt
Merging corrected_

In [28]:
import os
from typing import List, Dict, Optional
import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter, NLTKTextSplitter

# Define a base TextSplitter class
class TextSplitter:
    """
    Base class for splitting text into chunks.
    Subclasses should implement the split_text method.
    """
    def split_text(self, text: str) -> List[str]:
        raise NotImplementedError("Subclasses should implement this method.")

# Implement specific text splitters
class RecursiveCharacterTextSplitterAdapter(TextSplitter):
    """
    Adapter for splitting text using the RecursiveCharacterTextSplitter from Langchain.
    """
    def __init__(self, chunk_size: int = 100, chunk_overlap: int = 20):
        """
        Initializes the RecursiveCharacterTextSplitterAdapter with the specified chunk size and overlap.
        
        Args:
            chunk_size (int): Maximum size of each chunk.
            chunk_overlap (int): Number of characters to overlap between chunks.
        """
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=True
        )

    def split_text(self, text: str) -> List[str]:
        """
        Splits the given text into chunks using the RecursiveCharacterTextSplitter.

        Args:
            text (str): Text to split.

        Returns:
            List[str]: List of text chunks.
        """
        return self.splitter.split_text(text)

class NLTKTextSplitterAdapter(TextSplitter):
    """
    Adapter for splitting text using the NLTKTextSplitter from Langchain.
    """
    def __init__(self):
        """
        Initializes the NLTKTextSplitterAdapter.
        """
        self.splitter = NLTKTextSplitter()

    def split_text(self, text: str) -> List[str]:
        """
        Splits the given text into chunks using the NLTKTextSplitter.

        Args:
            text (str): Text to split.

        Returns:
            List[str]: List of text chunks.
        """
        return self.splitter.split_text(text)

# Define a custom embedding function
class CustomSentenceTransformerEmbedding(embedding_functions.EmbeddingFunction):
    """
    Custom embedding function that generates embeddings using the SentenceTransformer model.
    """
    def __init__(self, model_name: str = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"):
        """
        Initializes the CustomSentenceTransformerEmbedding with the specified model.

        Args:
            model_name (str): Name of the SentenceTransformer model to use.
        """
        self.model = SentenceTransformer(model_name)

    def __call__(self, texts: List[str]) -> List[List[float]]:
        """
        Generates embeddings for a list of texts.

        Args:
            texts (List[str]): List of input texts.

        Returns:
            List[List[float]]: List of embeddings for each input text.
        """
        embeddings = self.model.encode(texts)
        return embeddings.tolist()

# Define the ChromaInterface class
class ChromaInterface:
    """
    Interface for interacting with ChromaDB to store and query document embeddings.
    """
    def __init__(self, collection_name: str, persist_directory: str, text_splitter: TextSplitter):
        """
        Initializes the ChromaInterface with a persistent ChromaDB collection and a text splitter.

        Args:
            collection_name (str): Name of the collection in ChromaDB.
            persist_directory (str): Directory where the ChromaDB data will be stored.
            text_splitter (TextSplitter): Text splitter used to divide documents into chunks.
        """
        self.client = chromadb.PersistentClient(path=persist_directory)
        self.embedding_function = CustomSentenceTransformerEmbedding()
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            embedding_function=self.embedding_function
        )
        self.text_splitter = text_splitter

    def add_documents_from_files(self, file_paths: List[str], metadatas: Optional[List[Dict[str, str]]] = None):
        """
        Adds documents from the specified files into the ChromaDB collection after splitting them into chunks.
        
        Args:
            file_paths (List[str]): List of file paths containing the documents to add.
            metadatas (Optional[List[Dict[str, str]]]): List of metadata dictionaries corresponding to each file.
        """
        documents = []
        ids = []
        id_counter = 0  # Initialize a counter for unique IDs

        # Process each file
        for file_path in file_paths:
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                # Split the content into chunks
                split_texts = self.text_splitter.split_text(content)
                documents.extend(split_texts)
                
                # Generate unique IDs for each chunk
                for _ in split_texts:
                    ids.append(f"{os.path.basename(file_path)}_{id_counter}")
                    id_counter += 1  # Increment the counter for each chunk

        if metadatas is None:
            # If no metadata is provided, use the file path as the source metadata
            metadatas = [{"source": file_path} for file_path in file_paths]

        # Adjust metadatas to match the number of document chunks
        extended_metadatas = []
        for i, file_path in enumerate(file_paths):
            metadata = metadatas[i] if metadatas and i < len(metadatas) else {"source": file_path}
            # Duplicate the metadata for all chunks of the same document
            extended_metadatas.extend([metadata] * len(self.text_splitter.split_text(open(file_path, 'r', encoding='utf-8').read())))

        # Add documents and their metadata into the collection
        self.collection.add(
            documents=documents,
            metadatas=extended_metadatas,
            ids=ids
        )

    def query(self, query_text: str, n_results: int = 30) -> List[str]:
        """
        Queries the ChromaDB collection for the most relevant documents based on the query text.
        
        Args:
            query_text (str): The text query for searching relevant documents.
            n_results (int): The number of results to return (default is 30).

        Returns:
            List[str]: List of relevant document chunks.
        """
        results = self.collection.query(
            query_texts=[query_text],
            n_results=n_results
        )['documents']
        return results


In [26]:
# Initialize the interface
text_splitter = RecursiveCharacterTextSplitterAdapter(chunk_size=200, chunk_overlap=20)

chroma_interface = ChromaInterface("taw_bio",
                                   "DB/chroma_db",
                                       text_splitter=text_splitter
)


In [27]:
chroma_interface.client.list_collections()

[Collection(id=2d40b393-43fa-4837-a034-711c890415aa, name=taw_hist),
 Collection(id=ebf733d0-abe7-4952-8c42-9ee3538119d8, name=taw_bio)]

In [29]:
#generate the embeddings for the bio chunks

start = time.time()
chroma_interface.add_documents_from_files(
    file_paths=["corrected_bio.txt"],
    metadatas=[{"source": "corrected_bio.txt"}]
)
end = time.time()
print(f"Time taken to add documents: {end - start:.2f} seconds")


Time taken to add documents: 47.45 seconds


In [31]:
#query the bio chunks
query_text = "ما هي الميتوكوندريا؟"
results = chroma_interface.query(query_text, n_results=5)
for i, result in enumerate(results):
    print(f"Result {i + 1}:\n{result}\n")

Result 1:
['ويُعرف هذا النوع بالتخمر.', '**الأمراض الحساسية:**', '**الدورة الاندماجية:**\n\nتُعرف الدورة الاندماجية أيضًا باسم التكاثر المعتدل أو الليسوجيني، وتتميز بما يلي:', '- **مفاصل غضروفية:** متحد نوعًا ما بنسيج غضروفي، على سبيل المثال، الارتفاق العاني.', '* **الأعضاء الليمفاوية:**\n    * نخاع العظم: ينتج الخلايا الليمفاوية الجذعية.\n    * العقد الليمفاوية: تصفي الليمف من مسببات الأمراض وتحتوي على خلايا بلازمية ومساعِدة وقتالة.']

