In [1]:
import json  # Import the JSON module for working with JSON data
import numpy as np  # Import NumPy, a library for working with arrays and matrices
import uuid  # Import the UUID module for generating UUIDs (Universally Unique Identifiers)
import pinecone  # Import the Pinecone library for working with vector similarity search
from typing import List, Tuple  # Import List and Tuple types for type hinting
import src.config as config  # Import a module called "config" from a subdirectory called "src"
import logging  # Import the logging module for logging messages
import time  # Import the time module for measuring time
import openai  # Import the OpenAI library for natural language processing
from lingua import Language, LanguageDetectorBuilder  # Import the Lingua library for language detection and identification

import os  # Import the os library for setting environment variables
from langchain.llms import OpenAI  # Import the OpenAI module from the langchain package for interacting with the OpenAI GPT-3 model
from langchain.prompts import PromptTemplate  # Import the PromptTemplate module from the langchain package for generating prompts for the OpenAI GPT-3 model



# Publish the OpenAI key held in the project config through the process environment.
# NOTE(review): presumably the OpenAI clients used further down read OPENAI_API_KEY from here — confirm.
os.environ.update({"OPENAI_API_KEY": config.OPENAI_KEY})

  from tqdm.autonotebook import tqdm


In [2]:
class DataLoader:
    """
    A class for loading and processing verse data from a JSON file.

    Usage is two-step: `load_data` parses the file into `quran_data`, then
    `process_data` derives the parallel `embeddings` and `metadata` lists.
    """

    # Keys copied from each verse record into its metadata, in output order.
    _METADATA_FIELDS = ("surah", "ayah", "text_ar", "text_en")

    def __init__(self, data_file: str):
        """
        Remember the data file path and start with empty results.

        Args:
        - data_file (str): The path to the JSON file to load.
        """
        self.data_file = data_file
        self.quran_data = None  # Parsed JSON content; stays None until load_data() runs
        self.embeddings = []  # One entry per verse: the value stored under "embedding"
        self.metadata = []  # One entry per verse: the verse's metadata as a JSON string

    def load_data(self):
        """
        Load Quran data from the JSON file and store it in the `quran_data` attribute.

        The file is decoded as UTF-8 explicitly: it carries Arabic text, and relying
        on the platform's default encoding would make the result machine-dependent.
        """
        with open(self.data_file, "r", encoding="utf-8") as f:
            self.quran_data = json.load(f)

    def process_data(self):
        """
        Extract the embedding and the JSON-encoded metadata of every loaded verse.

        The `embeddings` and `metadata` lists are rebuilt from scratch on each call,
        so calling this method again does not duplicate entries. Keys missing from a
        verse record come out as None.

        Raises:
        - TypeError: If `load_data` has not been called yet (`quran_data` is None).
        """
        embeddings = []
        metadata = []
        for verse in self.quran_data:
            embeddings.append(verse.get("embedding"))
            metadata.append(self.encode_metadata_in_id(
                {field: verse.get(field) for field in self._METADATA_FIELDS}
            ))
        # Assign only after the whole pass succeeded, so a failure keeps the old lists intact
        self.embeddings = embeddings
        self.metadata = metadata

    @staticmethod
    def encode_metadata_in_id(metadata: dict) -> str:
        """
        Encode metadata in JSON format and return it as a string.

        Non-ASCII characters are written as-is (not escaped) and the output is
        indented by four spaces.

        Args:
        - metadata (dict): A dictionary containing metadata for a verse.

        Returns:
        - str: The encoded metadata in JSON format.
        """
        return json.dumps(metadata, ensure_ascii=False, indent=4)

    @staticmethod
    def decode_metadata_from_id(item_id: str) -> dict:
        """
        Decode metadata from a JSON string and return it as a dictionary.

        Args:
        - item_id (str): A string containing metadata in JSON format.

        Returns:
        - dict: The decoded metadata as a dictionary.
        """
        return json.loads(item_id)

In [3]:
class PineconeManager:
    """
    A class for managing a Pinecone index together with a local ID -> metadata map.

    Only the vectors are sent to Pinecone; the metadata belonging to each vector is
    kept in `metadata_storage`, keyed by the short ID under which the vector was
    upserted, and can be saved to / loaded from a JSON file.
    """

    def __init__(self, api_key: str, index_name: str, dimension: int, create_index: bool = False):
        """
        Initialize the Pinecone client and bind to the named index.

        Args:
        - api_key (str): The Pinecone API key to use.
        - index_name (str): The name of the Pinecone index to manage.
        - dimension (int): The dimensionality of the vectors to be indexed.
        - create_index (bool): Whether to create a new index with the specified name and dimensionality. Defaults to False.
        """
        self.api_key = api_key
        self.index_name = index_name
        self.dimension = dimension
        pinecone.init(api_key=self.api_key)  # Initialize the Pinecone API
        if create_index:
            # The index is created with the cosine similarity metric
            pinecone.create_index(name=self.index_name, metric="cosine", dimension=self.dimension)
        self.index = pinecone.Index(index_name=self.index_name)  # Handle used for upserts and queries
        self.metadata_storage = {}  # Maps short ID -> metadata of the vector stored under that ID

    def index_data(self, metadata, embeddings):
        """
        Upload embeddings to the Pinecone index and record their metadata locally.

        Each embedding is upserted under a freshly generated short ID, one request
        per vector. The metadata entry is stored unchanged in `metadata_storage`
        under the same ID. Pairs are formed with `zip`, so surplus items in the
        longer of the two inputs are ignored.

        Args:
        - metadata (list): One metadata entry per vector. Entries must be JSON-serializable for `save_metadata_storage` to work.
        - embeddings (list): One embedding per vector, in the same order as `metadata`.
        """
        for verse_meta, embedding in zip(metadata, embeddings):
            print(f"Uploading : {verse_meta}")  # Progress message for the entry being uploaded
            short_id = self._new_unique_id()
            self.index.upsert([(short_id, embedding)])
            # Record the metadata only once the upsert went through, so a failed
            # upload does not leave an ID behind that has no vector in the index
            self.metadata_storage[short_id] = verse_meta

    def _new_unique_id(self):
        """
        Return a short ID that is not yet a key of `metadata_storage`.

        Short IDs are only 8 hex characters, so collisions are possible; reusing an
        ID would silently overwrite another entry's metadata and vector. Uniqueness
        is checked against the locally known IDs only.
        """
        short_id = self.generate_short_id()
        while short_id in self.metadata_storage:
            short_id = self.generate_short_id()
        return short_id

    def query_index(self, vector, top_k=20):
        """
        Query the Pinecone index with a vector and retrieve the top K most similar vectors.

        Args:
        - vector (list): A list representing a query vector.
        - top_k (int): The number of results to return. Defaults to 20.

        Returns:
        - The response of the index's `query` call, returned unchanged.
        """
        return self.index.query(vector=vector, top_k=top_k)

    def save_metadata_storage(self, file_name: str):
        """
        Save the metadata storage dictionary to a file in JSON format.

        The file is written as UTF-8 with non-ASCII characters left unescaped.

        Args:
        - file_name (str): The name of the file to save the metadata to.
        """
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(self.metadata_storage, f, ensure_ascii=False, indent=4)

    def load_metadata_storage(self, file_name: str) -> dict:
        """
        Load the metadata storage dictionary from a file in JSON format.

        The loaded dictionary replaces the current `metadata_storage`.

        Args:
        - file_name (str): The name of the file to load the metadata from.

        Returns:
        - dict: The metadata storage dictionary.
        """
        with open(file_name, 'r', encoding='utf-8') as f:
            metadata_storage = json.load(f)
        self.metadata_storage = metadata_storage
        return metadata_storage

    @staticmethod
    def generate_short_id():
        """
        Generate a short, random ID.

        Returns:
        - str: The first 8 characters of a random UUID4 string.
        """
        return str(uuid.uuid4())[:8]


In [4]:
class OpenAIManager:
    """
    Helper around OpenAI for embedding text and translating Arabic to English.
    """

    # Character cut-off applied to over-long input when an embedding request is retried.
    # NOTE(review): 8191 looks like a token limit, but the check counts characters — confirm intent.
    MAX_INPUT_CHARS = 8191

    def __init__(self):
        """
        Initializes the OpenAIManager class with pre-defined languages, language detector,
        OpenAI model, and a prompt template for translation.
        """
        # Detection is restricted to the two languages the application distinguishes
        self.languages = [Language.ENGLISH, Language.ARABIC]
        self.lang_detector = LanguageDetectorBuilder.from_languages(*self.languages).build()
        self.openai_llm = OpenAI(temperature=0)
        self.prompt = PromptTemplate(
            input_variables=["query"],
            template="Translate the following arabic text into english : {query}"
        )

    def _request_embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> list:
        """
        Perform a single embedding request and let any error propagate to the caller.
        """
        response = openai.Embedding.create(input=content, engine=engine)
        return response['data'][0]['embedding']

    def gpt3_embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> list:
        """
        Generates an embedding for the input content using OpenAI GPT-3.

        Errors are logged rather than raised.

        Args:
            content (str): The input content for which the embedding is required.
            engine (str, optional): The engine to be used for generating the embedding. Defaults to 'text-embedding-ada-002'.

        Returns:
            list: The generated embedding as a list of floats, or None when the request failed.
        """
        try:
            return self._request_embedding(content, engine)
        except Exception as e:
            logging.error(f'Embedding failed. Error message: {e}')
            return None

    def extract_embedding(self, text: str, max_retries: int = 3, retry_delay: float = 5) -> list:
        """
        Extracts the embedding for the given text using GPT-3, retrying on failure.

        The first attempt sends the text unchanged. Every retry sends at most
        `MAX_INPUT_CHARS` characters of it and is preceded by a pause of
        `retry_delay` seconds.

        Args:
            text (str): The input text for which the embedding is required.
            max_retries (int, optional): Number of additional attempts after the first one fails. Defaults to 3.
            retry_delay (float, optional): Seconds to wait before each retry. Defaults to 5.

        Returns:
            list: The extracted embedding as a list of floats.

        Raises:
            RuntimeError: If every attempt failed; the last underlying error is chained as the cause.
        """
        attempts = max(1, max_retries + 1)
        last_error = None
        for attempt in range(attempts):
            candidate = text
            if attempt > 0 and len(text) > self.MAX_INPUT_CHARS:
                logging.warning('[OPENAI ERROR] Trying to get shorter input < 8191 for text...')
                candidate = text[:self.MAX_INPUT_CHARS]
            try:
                # Use the raising helper: gpt3_embedding() swallows errors, which
                # would make failures invisible here and the retries pointless
                return self._request_embedding(candidate)
            except Exception as e:
                last_error = e
                logging.error(f'Trying to get the embedding for text. Error message: {e}')
                if attempt < attempts - 1:
                    time.sleep(retry_delay)
        raise RuntimeError(f'Could not get an embedding after {attempts} attempt(s)') from last_error

    def translate(self, text: str) -> str:
        """
        Translates the input Arabic text to English using the OpenAI model.

        Args:
            text (str): The input Arabic text to be translated.

        Returns:
            str: The translated English text, exactly as returned by the model.
        """
        return self.openai_llm(self.prompt.format(query=text))


In [5]:
def main(reindex: bool = False, create_index: bool = False):
    """
    Ask for a query, embed it and print the five most similar indexed verses.

    By default the existing Pinecone index is queried and the ID -> metadata map
    is read from disk. The embeddings file is only loaded when `reindex` is set,
    since it is not needed for querying.

    Args:
        reindex (bool, optional): Upload the verses from the embeddings file to the index and
            save the resulting ID -> metadata map before querying. Defaults to False.
        create_index (bool, optional): Create the Pinecone index before using it. Defaults to False.
    """
    # One path for both saving and loading, so a re-index is visible to later runs
    metadata_file = "data/metadata_storage.json"

    # Initialize Pinecone manager with API key, index name, and dimension
    pinecone_api_key = config.PINECONE_KEY
    index_name = "gpt"
    dimension = 1536
    pinecone_manager = PineconeManager(pinecone_api_key, index_name, dimension, create_index=create_index)

    # Start from the metadata already on disk; when re-indexing, a missing file is fine
    if not reindex or os.path.exists(metadata_file):
        pinecone_manager.load_metadata_storage(metadata_file)

    if reindex:
        # Load data from the specified file and process it
        data_loader = DataLoader("./data/quran_GPT_embeddings_.json")
        data_loader.load_data()
        data_loader.process_data()

        # Index the data and persist the updated metadata map
        pinecone_manager.index_data(data_loader.metadata, data_loader.embeddings)
        pinecone_manager.save_metadata_storage(metadata_file)

    # Initialize OpenAI manager
    openai_manager = OpenAIManager()

    # Get user input and detect its language; if it's Arabic, translate it to English
    input_text = input("Enter your query: ").strip()
    if not input_text:
        print("No query entered; nothing to search for.")
        return
    if openai_manager.lang_detector.detect_language_of(input_text) == Language.ARABIC:
        input_text = openai_manager.translate(input_text)

    # Extract the embedding for the input text
    input_query_embedding = openai_manager.extract_embedding(input_text)
    if input_query_embedding is None:
        # Without a vector there is nothing to query the index with
        print("Could not compute an embedding for the query; see the log for details.")
        return

    # Query the Pinecone index using the extracted embedding and retrieve the top 5 results
    search_results = pinecone_manager.query_index(input_query_embedding, top_k=5)
    for item in search_results['matches']:
        item_id = item['id']
        # The index may hold vectors whose metadata is not in the local map; show a placeholder then
        item_metadata = pinecone_manager.metadata_storage.get(item_id, "<no metadata stored for this ID>")
        similarity = item['score']
        print(f"ID: {item_id}, Similarity: {similarity}, Metadata: {item_metadata}")

if __name__ == "__main__":
    main()


ID: 8348fc0d, Similarity: 0.847318172, Metadata: {
    "surah": 7,
    "ayah": 196,
    "text_ar": "إن وليي الله الذي نزل الكتاب وهو يتولى الصالحين",
    "text_en": "Verily my protector is Allah who hath revealed the Book, and He protecteth the righteous."
}
ID: b517106a, Similarity: 0.831350386, Metadata: {
    "surah": 48,
    "ayah": 3,
    "text_ar": "وينصرك الله نصرا عزيزا",
    "text_en": "And that Allah may succour thee with a mighty succour."
}
ID: b4722a9c, Similarity: 0.827713251, Metadata: {
    "surah": 44,
    "ayah": 42,
    "text_ar": "إلا من رحم الله إنه هو العزيز الرحيم",
    "text_en": "Save those on whom Allah will have mercy. Verily He! He is the Mighty, the Merciful."
}
ID: ec4b6d0e, Similarity: 0.827417135, Metadata: {
    "surah": 16,
    "ayah": 81,
    "text_ar": "والله جعل لكم مما خلق ظلالا وجعل لكم من الجبال أكنانا وجعل لكم سرابيل تقيكم الحر وسرابيل تقيكم بأسكم كذلك يتم نعمته عليكم لعلكم تسلمون",
    "text_en": "And Allah hath appointed for you, of that which