In [32]:
import json
import numpy as np
import uuid
import pinecone
from typing import List, Tuple
import src.config as config
import logging
import time
import openai
from lingua import Language, LanguageDetectorBuilder
from lingua import Language, LanguageDetectorBuilder


In [None]:
class DataLoader:
    def __init__(self, data_file: str):
        self.data_file = data_file
        self.quran_data = None
        self.embeddings = []
        self.metadata = []

    def load_data(self):
        with open(self.data_file, "r") as f:
            self.quran_data = json.load(f)

    def process_data(self):
        for verse in self.quran_data:
            self.embeddings.append(verse.get("embedding"))
            self.metadata.append(self.encode_metadata_in_id({
                "surah": verse.get("surah"),
                "ayah": verse.get("ayah"),
                "text_ar": verse.get("text_ar"),
                "text_en": verse.get("text_en")
            }))

    @staticmethod
    def encode_metadata_in_id(metadata: dict) -> str:
        return json.dumps(metadata, ensure_ascii=False, indent=4)

    @staticmethod
    def decode_metadata_from_id(item_id: str) -> dict:
        return json.loads(item_id)

In [2]:
class PineconeManager:
    def __init__(self, api_key: str, index_name: str, dimension: int, create_index: bool = False):
        self.api_key = api_key
        self.index_name = index_name
        self.dimension = dimension
        pinecone.init(api_key=self.api_key)
        if create_index:
            pinecone.create_index(name=self.index_name, metric="cosine", dimension=self.dimension)
        self.index = pinecone.Index(index_name=self.index_name)
        self.metadata_storage = {}

    def index_data(self, metadata, embeddings):
        for verse_meta, embedding in zip(metadata, embeddings):
            print(f"Uploading : {verse_meta}")
            short_id = self.generate_short_id()
            self.metadata_storage[short_id] = verse_meta
            self.index.upsert([(short_id, embedding)])


    def query_index(self, vector, top_k=20):
        search_results = self.index.query(vector=vector, top_k=top_k)
        return search_results

    def save_metadata_storage(self, file_name: str):
        with open(file_name, 'w', encoding='utf-8') as f:
            json.dump(self.metadata_storage, f, ensure_ascii=False, indent=4)

    def load_metadata_storage(self, file_name: str) -> dict:
        with open(file_name, 'r', encoding='utf-8') as f:
            metadata_storage = json.load(f)
        self.metadata_storage = metadata_storage
    
    @staticmethod
    def generate_short_id():
        return str(uuid.uuid4())[:8]

In [38]:
class OpenAIManager:
    def __init__(self):
        openai.api_key = config.OPENAI_KEY
        self.languages = [Language.ENGLISH, Language.ARABIC]
        self.lang_detector = LanguageDetectorBuilder.from_languages(*self.languages).build()

    def gpt3_embedding(self, content: str, engine: str = 'text-embedding-ada-002') -> list:
        try:
            response = openai.Embedding.create(input=content, engine=engine)
            vector = response['data'][0]['embedding']
            return vector
        except Exception as e:
            logging.error(f'Embedding failed. Error message: {e}')

    def extract_embedding(self, text: str) -> list:
        try:
            embedding = self.gpt3_embedding(text)
        except:
            while True:
                try:
                    if len(text) > 8191:
                        logging.warning('[OPENAI ERROR] Trying to get shorter input < 8191 for text...')
                        embedding = self.gpt3_embedding(text[:8191])
                    else:
                        embedding = self.gpt3_embedding(text)
                    break
                except Exception as e:
                    logging.error(f'Trying to get the embedding for text. Error message: {e}')
                    time.sleep(5)
        return embedding
    
    def translate(self, text: str, source_lang: str = "ar", target_lang: str = "en", model_engine: str = "text-davinci-002") -> str:
        prompt = f"Translate the following {source_lang} text to {target_lang}: {text}"
        response = openai.Completion.create(
            engine=model_engine,
            prompt=prompt,
            max_tokens=100,
            n=1,
            stop=None,
            temperature=0,
        )

        translated_text = response.choices[0].text.strip()
        print(f"Translated text: {translated_text}")
        return translated_text

In [40]:
def main():
    # Load data
    data_loader = DataLoader("./data/quran_GPT_embeddings_.json")
    data_loader.load_data()
    data_loader.process_data()

    # Pinecone manager
    pinecone_api_key = config.PINECONE_KEY
    index_name = "gpt"
    dimension = 1536

    # if you want to create index make sure to set create_index=True
    pinecone_manager = PineconeManager(pinecone_api_key, index_name, dimension, create_index=False)
    # if you want to index data uncomment the following line
    # pinecone_manager.index_data(data_loader.metadata, data_loader.embeddings)
    # pinecone_manager.save_metadata_storage("metadata_storage.json")
    pinecone_manager.load_metadata_storage("data/metadata_storage.json")

    # OpenAI manager
    openai_manager = OpenAIManager()
    
    # Query example
    input_text  = input("Enter your query: ")
    if openai_manager.lang_detector.detect_language_of(input_text) == Language.ARABIC:
        print("Translating to English...")
        input_text = openai_manager.translate(input_text)
        
    input_query_embedding = openai_manager.extract_embedding(input_text)

    # Query index
    search_results = pinecone_manager.query_index(input_query_embedding, top_k=20)

    for item in search_results['matches']:
        item_id = item['id']
        item_metadata = pinecone_manager.metadata_storage[item_id]
        similarity = item['score']
        print(f"ID: {item_id}, Similarity: {similarity}, Metadata: {item_metadata}")


if __name__ == "__main__":
    main()


Translating to English...
Translated text: ء

All praise is due to Allah for everything.
ID: c564acb1, Similarity: 0.92330569, Metadata: {
    "surah": 1,
    "ayah": 2,
    "text_ar": "الحمد لله رب العالمين",
    "text_en": "All praise unto Allah, the Lord of all the worlds."
}
ID: ecbd8473, Similarity: 0.91099894, Metadata: {
    "surah": 37,
    "ayah": 182,
    "text_ar": "والحمد لله رب العالمين",
    "text_en": "And all praise Unto Allah the Lord of the worlds."
}
ID: 67d461cd, Similarity: 0.888787389, Metadata: {
    "surah": 34,
    "ayah": 1,
    "text_ar": "الحمد لله الذي له ما في السماوات وما في الأرض وله الحمد في الآخرة وهو الحكيم الخبير",
    "text_en": "All praise Unto Allah whose is whatsoever is in the heavens and whatsoever is in the earth; and His is the praise in the Hereafter And He is the Wise, the Aware."
}
ID: 5b226c4a, Similarity: 0.887375116, Metadata: {
    "surah": 45,
    "ayah": 36,
    "text_ar": "فلله الحمد رب السماوات ورب الأرض رب العالمين",
    "text_en"

In [22]:
import os
import openai

openai.api_key = config.OPENAI_KEY



In [26]:
translate(" هل الحجاب فرض", "ar", "en")

'في الإسلام؟\n\nIs the hijab mandatory in Islam?'