In [None]:
import json
import logging
import math
import os
import re
import time

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from lingua import Language, LanguageDetectorBuilder
from nltk.corpus import stopwords
from pyarabic.araby import strip_diacritics
from tqdm import tqdm

In [None]:
import openai

class OpenAIManager:
    """Wrapper around OpenAI embeddings and an Arabic-to-English LLM prompt."""

    # Engine used when the caller does not pick one.
    DEFAULT_EMBEDDING_ENGINE = 'text-embedding-ada-002'

    # Cap applied to the input on embedding retries.
    # NOTE(review): 8191 looks like the model's *token* limit but is applied
    # to characters here, exactly as the original code did -- confirm.
    MAX_EMBEDDING_CHARS = 8191

    def __init__(self):
        """
        Initializes the OpenAIManager class with pre-defined languages, language detector,
        OpenAI model, and a prompt template for translation.
        """
        self.languages = [Language.ENGLISH, Language.ARABIC]
        self.lang_detector = LanguageDetectorBuilder.from_languages(*self.languages).build()
        # temperature=0 requests the least random completions.
        self.openai_llm = OpenAI(temperature=0)
        self.prompt = PromptTemplate(
            input_variables=["query"],
            template="Translate the following arabic text into english : {query}"
        )

    def _request_embedding(self, content: str, engine: str) -> list:
        """
        Performs a single embedding request and lets any error propagate.

        Args:
            content (str): The text to embed.
            engine (str): The embedding engine name.

        Returns:
            list: The embedding taken from the first item of the response.
        """
        response = openai.Embedding.create(input=content, engine=engine)
        return response['data'][0]['embedding']

    def gpt3_embedding(self, content: str, engine: str = DEFAULT_EMBEDDING_ENGINE) -> "list | None":
        """
        Generates an embedding for the input content (best effort, never raises).

        Args:
            content (str): The input content for which the embedding is required.
            engine (str, optional): The engine to be used for generating the embedding.
                Defaults to 'text-embedding-ada-002'.

        Returns:
            list | None: The generated embedding, or None when the request
            failed (the error is logged).
        """
        try:
            return self._request_embedding(content, engine)
        except Exception as e:
            logging.error(f'Embedding failed. Error message: {e}')
            return None

    def extract_embedding(self, text: str, max_retries: int = 5, retry_delay: float = 5.0) -> list:
        """
        Extracts the embedding for the given text, retrying on failure.

        The first attempt sends the text unchanged. Every retry sends at most
        MAX_EMBEDDING_CHARS characters. The first retry is immediate; later
        retries wait `retry_delay` seconds first.

        Args:
            text (str): The input text for which the embedding is required.
            max_retries (int, optional): Retries allowed after the first
                attempt. Defaults to 5.
            retry_delay (float, optional): Seconds to wait before the second
                and later retries. Defaults to 5.0.

        Returns:
            list: The extracted embedding.

        Raises:
            RuntimeError: If every attempt failed; the last underlying error
                is attached as the cause.
        """
        total_attempts = max(1, max_retries + 1)
        candidate = text
        last_error = None

        for attempt in range(total_attempts):
            if attempt >= 1 and len(text) > self.MAX_EMBEDDING_CHARS:
                logging.warning('[OPENAI ERROR] Trying to get shorter input < 8191 for text...')
                candidate = text[:self.MAX_EMBEDDING_CHARS]
            if attempt >= 2:
                time.sleep(retry_delay)
            try:
                # Call the raising helper directly: gpt3_embedding() swallows
                # errors, which would make this retry loop unreachable.
                return self._request_embedding(candidate, self.DEFAULT_EMBEDDING_ENGINE)
            except Exception as e:
                last_error = e
                logging.error(f'Trying to get the embedding for text. Error message: {e}')

        raise RuntimeError(
            f'Embedding failed after {total_attempts} attempts'
        ) from last_error

    def translate(self, text: str) -> str:
        """
        Translates the input Arabic text to English using the OpenAI model.

        Args:
            text (str): The input Arabic text to be translated.

        Returns:
            str: The model output for the formatted translation prompt.
        """
        return self.openai_llm(self.prompt.format(query=text))

In [None]:
# Placeholder credential: replace the value with a real OpenAI key before
# running. It is set before the manager is constructed on the next line.
os.environ.update(OPENAI_API_KEY="Add your openai key")
# Shared module-level manager instance used by clean_json_data().
openai_manager = OpenAIManager()

In [None]:
def preprocess_english_text(text):
    """
    Normalize English text for downstream processing.

    The text is lowercased, then every character that is neither a word
    character (``\\w``) nor whitespace is removed.

    Args:
        text (str): The input text to preprocess.

    Returns:
        str: The normalized text. Non-string inputs are returned unchanged.
    """
    # Guard clause: anything that is not a string passes straight through.
    if not isinstance(text, str):
        return text
    return re.sub(r"[^\w\s]", "", text.lower())

In [None]:
def clean_json_data(json_file, output_path="quran_GPT_embeddings.json",
                    text_key="english_translation"):
    """
    Add a GPT embedding to every record of a JSON file and write the result.

    Each record's ``text_key`` value is run through preprocess_english_text()
    and embedded with openai_manager.extract_embedding(); the embedding is
    stored under the "embeddings" key of a copy of the record.

    Args:
        json_file (str): Path of the input JSON file; it must contain an
            iterable of dict records.
        output_path (str, optional): Path of the JSON file to write.
            Defaults to "quran_GPT_embeddings.json".
        text_key (str, optional): Record key holding the text to embed.
            Defaults to "english_translation".
            NOTE(review): the original embedded the literal string
            "english_translation" for every record; this assumes that string
            was meant as the record key -- confirm against the data file.

    Raises:
        KeyError: If a record has no ``text_key`` entry.
    """
    with open(json_file, "r", encoding="utf-8") as file:
        data = json.load(file)

    cleaned_data = []

    for record in tqdm(data):
        # Embed the record's own text, not the key name itself.
        text = preprocess_english_text(record[text_key])
        embedding = openai_manager.extract_embedding(text)
        cleaned_data.append({**record, "embeddings": embedding})

    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be fixed explicitly instead of using the locale default.
    with open(output_path, "w", encoding="utf-8") as output_file:
        json.dump(cleaned_data, output_file, ensure_ascii=False, indent=4)

In [None]:
def main():
    """Run clean_json_data() on the "quran/quran.json" input file."""
    clean_json_data("quran/quran.json")

In [None]:
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()