In [None]:
import csv
import glob
import hashlib
import json
import os
import re
import traceback
from datetime import datetime

from openai import OpenAI

class JSONParsingError(Exception):
    """Error carrying the context of a failed attempt to read JSON from text.

    Attributes:
        message: Description of what went wrong (also the exception's sole arg).
        json_string: The string that was being treated as JSON.
        text: The full text the JSON string was taken from.
    """

    def __init__(self, message, json_string, text):
        # Only the message is forwarded to Exception, so str(err) == message.
        Exception.__init__(self, message)
        self.text = text
        self.json_string = json_string
        self.message = message

class OpenAIRequestBase:
    """Send chat prompts to OpenAI, parse the JSON in the reply, and cache results on disk.

    Each cached result lives in ``cache_dir`` as one JSON file whose name is
    derived from the prompt text.

    Args:
        use_cache: When true, a previously cached response for the same prompt
            is returned without calling the API.
        max_retries: Maximum number of API attempts per request.
        cache_dir: Directory holding the cache files; created if missing.
    """

    def __init__(self, use_cache=True, max_retries=3, cache_dir='cache'):
        self.client = OpenAI()  # Assume correct initialization with API key
        self.max_retries = max_retries
        self.use_cache = use_cache
        self.cache_dir = cache_dir
        self.ensure_dir_exists(self.cache_dir)

    def ensure_dir_exists(self, path):
        """Create ``path`` (and any missing parents) if it does not exist yet."""
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(path, exist_ok=True)

    def get_cache_file_path(self, prompt):
        """Return the cache file path for ``prompt``.

        The file name is the SHA-256 hex digest of the prompt. The built-in
        ``hash()`` is salted per interpreter process for strings, so a name
        derived from it would not be found again after a kernel restart.
        """
        digest = hashlib.sha256(str(prompt).encode('utf-8')).hexdigest()
        return os.path.join(self.cache_dir, f"{digest}.json")

    def save_to_cache(self, prompt, response):
        """Write ``prompt`` and its parsed ``response`` to the prompt's cache file."""
        file_path = self.get_cache_file_path(prompt)
        with open(file_path, 'w', encoding='utf-8') as file:
            json.dump({"prompt": prompt, "response": response}, file, ensure_ascii=False, indent=4)

    def load_from_cache(self, prompt):
        """Return the cached response for ``prompt``, or ``None`` when unavailable.

        A missing file, a file that is not valid JSON, or one without a
        ``"response"`` entry all count as a cache miss.
        """
        file_path = self.get_cache_file_path(prompt)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                return json.load(file)["response"]
        except FileNotFoundError:
            return None
        except (json.JSONDecodeError, KeyError, TypeError):
            # A truncated or foreign file must not block a fresh request.
            return None

    def send_request_with_retry(self, prompt, system_content="You are an AI."):
        """Return the parsed JSON reply for ``prompt``, retrying on failure.

        Args:
            prompt: User message sent to the model; also the cache key.
            system_content: System message placed before the prompt.

        Returns:
            The value produced by ``parse_response`` (a dict or a list).

        Raises:
            Exception: After ``max_retries`` failed attempts; the last
                underlying error is attached as ``__cause__``.
        """
        if self.use_cache:
            cached_response = self.load_from_cache(prompt)
            # Compare against None so an empty dict/list still counts as a hit.
            if cached_response is not None:
                return cached_response

        messages = [
            {"role": "system", "content": system_content},
            {"role": "user", "content": prompt}
        ]

        last_error = None
        for _ in range(self.max_retries):
            # Reset per attempt: if the API call itself raises there is no
            # reply to feed back, and the handler must not touch an unbound name.
            ai_response = None
            try:
                response = self.client.chat.completions.create(
                    model=os.environ.get("OPENAI_MODEL", "gpt-4-0125-preview"),
                    messages=messages
                )
                ai_response = response.choices[0].message.content.strip()
                parsed_response = self.parse_response(ai_response)
                self.save_to_cache(prompt, parsed_response)
                return parsed_response
            except Exception as e:
                traceback.print_exc()
                last_error = e
                # Feed the failed reply (when there was one) and the error back
                # so the next attempt can correct itself.
                if ai_response is not None:
                    messages.append({"role": "system", "content": ai_response})
                messages.append({"role": "system", "content": str(e)})

        raise Exception("Maximum retries reached without success.") from last_error

    def parse_response(self, response):
        """Extract and decode the first JSON object or array found in ``response``.

        Whichever of ``{`` or ``[`` appears first decides whether an object or
        an array is extracted. The candidate runs from that opener to the last
        matching closer in the text; the first complete JSON value inside it
        is returned, so trailing commentary after the JSON is tolerated.

        Raises:
            JSONParsingError: If no opener exists, no closer follows it, or the
                candidate text is not valid JSON.
        """
        first_dict_index = response.find('{')
        first_list_index = response.find('[')
        if first_dict_index == -1 and first_list_index == -1:
            raise JSONParsingError("No JSON structure found.", response, response)

        # -1 means "absent", so it must not win a plain "<" comparison.
        dict_comes_first = first_list_index == -1 or (
            first_dict_index != -1 and first_dict_index < first_list_index
        )
        parse_pattern = r'\{.*\}' if dict_comes_first else r'\[.*\]'

        match = re.search(parse_pattern, response, re.DOTALL)
        if match is None:
            raise JSONParsingError("No matching JSON structure found.", response, response)

        json_string = match.group(0)
        try:
            # raw_decode stops after the first complete value instead of
            # failing on whatever text follows it.
            parsed, _ = json.JSONDecoder().raw_decode(json_string)
            return parsed
        except json.JSONDecodeError as e:
            raise JSONParsingError("Failed to decode JSON.", json_string, response) from e

class WordEtymologyAnalyzer(OpenAIRequestBase):
    """Ask the model for a recursive etymology breakdown of a word.

    Every analysed word is appended to a CSV log (``processed_words.csv`` in
    the working directory) together with the path of its cache file.
    """

    def __init__(self, *args, **kwargs):
        # The base initializer already creates the cache directory.
        super().__init__(*args, **kwargs)
        self.processed_log = 'processed_words.csv'
        self.ensure_processed_log_exists()

    def ensure_processed_log_exists(self):
        """Create the processed-words log with its header row if it is missing."""
        if not os.path.exists(self.processed_log):
            # Explicit UTF-8 so non-ASCII words do not depend on the platform default.
            with open(self.processed_log, 'w', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow(['word', 'file_path'])

    def record_processed_word(self, word, response, prompt=None):
        """Append ``word`` and the path of its cache file to the processed log.

        Args:
            word: The analysed word.
            response: The analysis result; accepted for call compatibility and
                not written to the log.
            prompt: The prompt that was sent for ``word``. The cache is keyed
                by prompt, so this is what locates the real cache file. When
                omitted, ``word`` is used as the key, as before.
        """
        cache_key = word if prompt is None else prompt
        file_path = self.get_cache_file_path(cache_key)
        with open(self.processed_log, 'a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow([word, file_path])

    def _build_prompt(self, word):
        """Return the full instruction text, including the JSON template, for ``word``."""
        return (
            f"Conduct a comprehensive analysis of the word '{word}', starting with its meaning, example_words, language, "
            "synonyms in Japanese, Arabic, French and Chinese, and provide an overview of its etymology. Dissect the word "
            "to identify any prefixes, roots, and suffixes it may contain. For the etymology of each identified part, provide "
            "meaning, example_words, the language of origin, a history and tracing and further etymology.\n\n"
            "Delve deeper by RECURSIVELY (to the extreme, try your best, at least 3) tracing the origins of these components "
            "to the extreme, ensuring to document the lineage of each part back to its absolute roots. Should any intermediate "
            "root have a multifaceted history, illuminate the various branches of its evolution. Present this information in a "
            "structured JSON format, adhering to the following template:\n\n"
            "Use ``` to indicates start and end, if not finished yet don't use it.Output only JSON:\n"
            "```json\n"
            "{\n"
            f"  \"word\": \"{word}\",\n"
            "  \"meaning\": \"\",\n"
            "  \"synonyms_in_other_languages\": {\"japanese\": \"\", \"arabic\": \"\", \"french\": \"\", \"chinese\": \"\"},\n"
            "  \"language\": \"\",\n"
            "  \"tracing\": [],\n"
            "  \"history\": \"<Insert historical overview here>\",\n"
            "  \"parts\": [\n"
            "    {\n"
            "      \"part\": \"<Name of part>\",\n"
            "      \"type\": \"<Type: prefix/root/suffix>\"\n"
            "    }\n"
            "  ],\n"
            "  \"etymology\": [\n"
            "    {\n"
            "      \"part\": \"<Name of part>\",\n"
            "      \"meaning\": \"<Meaning of the part>\",\n"
            "      \"example_words\": [],\n"
            "      \"language\": \"<Language of origin>\",\n"
            "      \"history\": \"<Historical background of the part>\",\n"
            "      \"tracing\": [\"<Detailed tracing of etymology: xxx <-- yyy <-- zzz <-- www ...>\", \"xxx <-- bbb <-- ccc <-- ddd...\"]"
            ",\n"
            "      \"etymology\": [\n"
            "        {\n"
            "          \"part\": \"<Sub-part name>\",\n"
            "          \"meaning\": \"<Meaning>\",\n"
            "          \"language\": \"<Language>\",\n"
            "          \"history\": \"<Historical background>\",\n"
            "          \"example_words\": [],\n"
            "          \"tracing\": [\"<Further tracing: yyy <-- zzz <-- www ...>\", \"yyy <-- ggg <-- hhh ...>\"]"
            ",\n"
            "          \"etymology\": [\n"
            "            {\n"
            "              \"part\": \"<Sub-sub-part name>\"\n"
            "              ...\n"
            "              \"etymology\": [\n"
            "                  ...\n"
            "              ]\n"
            "            }\n"
            "            ...\n"
            "          ]\n"
            "        }\n"
            "      ]\n"
            "    }\n"
            "    ...\n"
            "  ]\n"
            "}\n"
            "```"
        )

    def analyze_word_etymology(self, word, system_content="You are a linguistic expert analyzing words."):
        """Request the etymology analysis of ``word`` and log it as processed.

        Args:
            word: The word to analyse.
            system_content: System message sent ahead of the prompt.

        Returns:
            The parsed JSON reply from ``send_request_with_retry``.
        """
        prompt = self._build_prompt(word)
        response = self.send_request_with_retry(prompt, system_content)
        # Pass the prompt so the logged path matches the file the cache wrote.
        self.record_processed_word(word, response, prompt=prompt)
        return response

In [5]:
from pprint import pprint
if __name__ == "__main__":
    # Demo run: analyse one word with on-disk caching enabled and pretty-print
    # the parsed result. Any failure is reported as a single line.
    analyzer = WordEtymologyAnalyzer(use_cache=True)
    word = "etymology"
    try:
        analysis_result = analyzer.analyze_word_etymology(
            word,
            system_content="You are an expert in etymology.",
        )
        pprint(analysis_result)
    except Exception as err:
        print(f"Error: {err}")

{'etymology': [{'etymology': [{'etymology': [],
                               'example_words': [],
                               'history': 'Ancient concept referring to the '
                                          'true meaning of a word, emphasizing '
                                          'the idea of uncovering the genuine '
                                          'essence behind the form of a word.',
                               'language': 'Ancient Greek',
                               'meaning': 'true sense, original meaning',
                               'part': 'etymon',
                               'tracing': []}],
                'example_words': ['etymon'],
                'history': "Derived from the Greek ‘etymon’, meaning 'true "
                           "sense' or 'original meaning', it underscores the "
                           'aspect of seeking the original or true sense '
                           'behind words.',
                'language': 'A