In [25]:
import csv
from pprint import pprint
from concurrent.futures import ThreadPoolExecutor
from openai_request import OpenAIRequestBase  # Assuming this class is correctly defined elsewhere
import re

class JapaneseSentenceProcessor(OpenAIRequestBase):
    """Turn one CSV row of Japanese text into multilingual, phonetically annotated data.

    Pipeline (see ``process_csv``): read one row, ask the model for a combined
    story plus translations, wrap kanji/katakana runs of the Japanese text in
    ``<text>[]`` placeholders, then request readings for every language in
    parallel.

    All model requests go through ``self.send_request_with_retry``, which is
    not defined in this class; it is expected to come from ``OpenAIRequestBase``.
    """

    # Name of the phonetic system requested from the model, per language code.
    _PHONETIC_SYSTEMS = {
        "ja": "furigana",
        "en": "phonetics",
        "ar": "phonetics",
        "zh": "pinyin",
        "yue": "jyutping",
    }

    # One run of kanji or one run of katakana. The kanji class covers the whole
    # CJK Unified Ideographs block (U+4E00-U+9FFF) plus the iteration mark
    # U+3005, so that words such as 人々 are annotated as a single unit.
    _KANJI_OR_KATAKANA = re.compile(r'[\u3005\u4e00-\u9fff]+|[\u30a0-\u30ff]+')

    # <text>[reading]; the reading may be empty when a placeholder was left unfilled.
    _ANNOTATION = re.compile(r"<([^>]+)>\[([^]]*)\]")

    def __init__(self, csv_file_path, *args, **kwargs):
        """Store the CSV path; all other arguments are forwarded to the base class."""
        super().__init__(*args, **kwargs)
        self.csv_file_path = csv_file_path

    def process_csv(self, row_index=100):
        """Process a single data row of the CSV file.

        Args:
            row_index: Zero-based index of the data row (header excluded) to
                process. Defaults to 100, the value that used to be hard-coded.

        Returns:
            The result of ``fetch_phonetic_pairs`` for that row, or ``None``
            when the file has no row at ``row_index``.
        """
        selected_row = None
        with open(self.csv_file_path, newline='', encoding='utf-8') as csvfile:
            for idx, row in enumerate(csv.DictReader(csvfile)):
                if idx == row_index:
                    selected_row = row
                    break

        # The file is closed before the (slow) model requests start.
        if selected_row is None:
            return None

        translations = self.process_row(selected_row)
        return self.fetch_phonetic_pairs(translations)

    def process_row(self, row):
        """Request a combined Japanese story and its translations for one row.

        Args:
            row: Mapping for one CSV row; its ``'Content'`` value is embedded
                in the prompt.

        Returns:
            Whatever ``send_request_with_retry`` returns for the translation
            prompt. The prompt asks for a JSON object keyed by ``ja``, ``en``,
            ``ar``, ``zh`` and ``yue``.
        """
        content = row['Content']
        prompt_translation = (
            f"Based on this Japanese text: {content}\n\n"
            "Combine it a full Japanese story. Translate it to English, Arabic, Chinese, and Cantonese. "
            "Format the translations in a JSON structure as shown below:\n\n"
            "```json\n"
            "{\n"
            "  \"ja\": \"Japanese text\",\n"
            "  \"en\": \"English translation\",\n"
            "  \"ar\": \"Arabic translation\",\n"
            "  \"zh\": \"Chinese translation\",\n"
            "  \"yue\": \"Cantonese translation\"\n"
            "}\n"
            "```"
        )
        translations = self.send_request_with_retry(prompt_translation, "translation")

        pprint(translations)

        return translations

    @staticmethod
    def annotate_kanji_katakana(translations):
        """Wrap every kanji run and katakana run of the Japanese text in ``<run>[]``.

        The empty brackets are placeholders for the model to fill with furigana.

        Args:
            translations: Mapping of language code to sentence; must contain ``'ja'``.

        Returns:
            A shallow copy of ``translations`` whose ``'ja'`` entry is annotated.
            The mapping passed in is left unmodified.
        """
        annotated = dict(translations)
        annotated['ja'] = JapaneseSentenceProcessor._KANJI_OR_KATAKANA.sub(
            lambda match: f'<{match.group(0)}>[]',
            annotated['ja'],
        )
        return annotated

    @staticmethod
    def convert_japanese_brackets_to_pairs(annotated_sentence):
        """Split a ``<text>[reading]`` annotated sentence into part/phonetic pairs.

        Each annotation becomes one pair. Every character outside an annotation
        (hiragana, punctuation, ...) becomes its own pair whose phonetic is the
        character itself. An annotation with an empty reading falls back to the
        annotated text as its phonetic, so unfilled placeholders do not leak
        markup characters into the pairs.

        Args:
            annotated_sentence: Sentence containing ``<text>[reading]`` markup.

        Returns:
            ``{"full": annotated_sentence, "pairs": [{"part": ..., "phonetic": ...}, ...]}``.
        """
        # NOTE(review): "full" keeps the annotation markup, as it did before.
        # The previous code also built a markup-free sentence but never used
        # it; confirm which form downstream consumers expect.
        pairs = []
        cursor = 0  # end offset of the previous annotation

        for match in JapaneseSentenceProcessor._ANNOTATION.finditer(annotated_sentence):
            start, end = match.span()
            pairs.extend(
                {"part": char, "phonetic": char}
                for char in annotated_sentence[cursor:start]
            )
            part, reading = match.groups()
            pairs.append({"part": part, "phonetic": reading or part})
            cursor = end

        # Trailing text after the last annotation.
        pairs.extend(
            {"part": char, "phonetic": char}
            for char in annotated_sentence[cursor:]
        )

        return {
            "full": annotated_sentence,
            "pairs": pairs
        }

    @staticmethod
    def _build_phonetic_prompt(language, sentence):
        """Build the phonetic-breakdown prompt for one language's sentence."""
        if language == "ja":
            # NOTE(review): the "{sentence}" in the JSON example sits in a plain
            # (non-f) string, so it is sent literally, and the example ends with
            # a trailing comma. Both are kept byte-for-byte; confirm the intent.
            return (
                f"Given the {language} sentence: \"{sentence}\", "
                "fill the furigana inside the brackets [] for  <kanji or katakana> and provide the updated result in 'full' key like <kanji>[furigana]. "
                "Format the output in JSON structure as shown below:\n\n"
                "```json\n"
                "{\n"
                "  \"full\": \"{sentence}\",\n"
                "}\n"
                "```"
            )

        # Unknown language codes fall back to a generic request instead of raising.
        phonetic_system = JapaneseSentenceProcessor._PHONETIC_SYSTEMS.get(language, "phonetics")
        return (
            f"Given the {language} sentence: \"{sentence}\", "
            f"provide a {phonetic_system} phonetics breakdown for each word or character. "
            "Treat space and punctuation as a part. Keep duplicated part also in the list. "
            "Assure the join of part is exactly the full sentence. "
            "Format the output in JSON structure as shown below:\n\n"
            "```json\n"
            "{\n"
            "  \"pairs\": [\n"
            "    {\n"
            "      \"part\": \"Example word or character\",\n"
            "      \"phonetic\": \"Example phonetic representation\"\n"
            "    }\n"
            "  ]\n"
            "}\n"
            "```"
        )

    def fetch_phonetic_pairs(self, translations):
        """Fetch a phonetic breakdown for every language, one request per thread.

        Args:
            translations: Mapping of language code to sentence; must contain ``'ja'``.

        Returns:
            A new mapping of language code to ``{"full": ..., "pairs": [...]}``.
            For ``ja``, ``full`` is the furigana-annotated sentence returned by
            the model and the pairs are derived from it locally. For the other
            languages, ``full`` is the input sentence and the pairs are taken
            from the ``'pairs'`` key of the model response.
        """
        results = self.annotate_kanji_katakana(translations)

        def fetch_pairs_for_language(language, sentence):
            prompt_phonetic = self._build_phonetic_prompt(language, sentence)
            return self.send_request_with_retry(prompt_phonetic, language)

        with ThreadPoolExecutor() as executor:
            # Submit everything first so the requests overlap, then collect.
            futures = {
                lang: executor.submit(fetch_pairs_for_language, lang, sentence)
                for lang, sentence in results.items()
            }

            for lang, future in futures.items():
                phonetic_data = future.result()

                pprint(phonetic_data)

                if lang == "ja":
                    results[lang] = self.convert_japanese_brackets_to_pairs(phonetic_data["full"])
                else:
                    results[lang] = {
                        "full": results[lang],
                        "pairs": phonetic_data['pairs']
                    }

        return results


import json
if __name__ == "__main__":
    # Script entry point: process the configured CSV row and dump the result as JSON.
    csv_file_path = 'japanese_language_data.csv'  # Update this path to your CSV file
    processor = JapaneseSentenceProcessor(csv_file_path)
    response = processor.process_csv()
    pprint(response)
    # ensure_ascii=False writes the Japanese/Arabic/Chinese characters verbatim,
    # so the file is opened as UTF-8 explicitly; the platform default encoding
    # may be unable to encode them.
    with open("translations.json", "w", encoding="utf-8") as fd:
        json.dump(response, fd, indent=2, ensure_ascii=False)

{'ar': 'في يوم ما، كان لدي قميص أزرق، وقميص أحمر، وتيشرت أخضر، وتيشرت أصفر. ثم '
       'سُئلت عن عدد الأكواب التي كانت لدي. كان لدي أربعة أكواب. بعد ذلك، '
       'سُئلت عن عدد السندويشات التي كانت لدي. كان لدي خمسة سندويشات.',
 'en': 'One day, I had a blue shirt, a red shirt, a green T-shirt, and a '
       'yellow T-shirt. Then, I was asked how many cups I had. I had four '
       'cups. Next, I was asked how many sandwiches I had. I had five '
       'sandwiches.',
 'ja': 'ある日、私は青いシャツ、赤いシャツ、緑のTシャツ、黄色のTシャツを持っていました。そして、私はカップを何個持っていますかと尋ねられました。カップは四個持っています。次に、サンドイッチを何個持っていますかと尋ねられました。サンドイッチは五個持っています。',
 'yue': '有一日，我有件藍色嘅襯衫、件紅色嘅襯衫、件綠色嘅T恤同件黃色嘅T恤。跟住，有人問我有幾多個杯。我有四個杯。之後，有人問我有幾多個三文治。我有五個三文治。',
 'zh': '有一天，我有一件蓝色的衬衫、一件红色的衬衫、一件绿色的T恤和一件黄色的T恤。然后，有人问我有多少个杯子。我有四个杯子。接下来，有人问我有多少个三明治。我有五个三明治。'}
{'full': 'ある<日>[ひ]、<私>[わたし]は<青>[あお]い<シャツ>[しゃつ]、<赤>[あか]い<シャツ>[しゃつ]、<緑>[みどり]のT<シャツ>[しゃつ]、<黄色>[きいろ]のT<シャツ>[しゃつ]を<持>[も]っていました。そして、<私>[わたし]は<カップ>[かっぷ]を<何個持>[なんこも]っていますかと<尋>[たず]ねられました。<カップ>[かっぷ]は<四個持>[よんこも]っています。<