# Libraries

In [42]:
import pandas as pd
import requests
import json
from typing import List, Optional
from pydantic import BaseModel, Field, ValidationError
from glob import glob

from retry import retry
import streamlit as st
import IPython
from tqdm import tqdm
import base64

from openai import OpenAI
# Shared OpenAI client; the API key is read from Streamlit's secrets store.
client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])
# Model name used for the chat-completion request in openai_process.
OPENAI_MODEL = "gpt-4o-mini"

In [2]:
# Read the processed keyword table, then remove every whitespace character
# (leading, trailing and embedded) so each word is one contiguous string.
df = pd.read_csv("../data/processed/chinese_keywords.csv")
df["word"] = df["word"].map(lambda raw_word: "".join(raw_word.split()))
word_list = df["word"].to_list()

# Steps

For each word:
- get the OpenAI inputs
- get the Google Translate inputs
- get the TTS inputs


```json
[
  {
    "word": "辛苦",
    "pinyin": "xīnkǔ",
    "translation": "Hard",
    "pronounciation_audio_bytes": "RIFF~vx",
    "definitions": [
      "辛苦 is a Chinese word that you would use to express 'hard work', 'tiredness', or 'exertion'.",
      "辛苦 is also used as a way to say 'take care', 'be well', or 'have a good rest' when someone has been working hard."
    ],
    "example_usages": [
      {
        "chinese": "他每天工作很辛苦。",
        "pinyin": "Tā měitiān gōngzuò hěn xīnkǔ.",
        "english": "He works very hard every day.",
        "audio_bytes": "RIFF~vx"
      },
      {
        "chinese": "她辛苦地照顾三个孩子。",
        "pinyin": "Tā xīnkǔ de zhàogù sān gè háizi.",
        "english": "She worked hard to take care of her three children.",
        "audio_bytes": "RIFF~vx"
      },
      {
        "chinese": "辛苦了，谢谢你的帮助。",
        "pinyin": "Xīnkǔle, xièxiè nǐ de bāngzhù.",
        "english": "Thank you for your help.",
        "audio_bytes": "RIFF~vx"
      }
    ]
  }
]
```

# Helper Functions & Validators

In [3]:
class OpenAIValidator(BaseModel):
    """Schema for the JSON object returned by the OpenAI chat completion.

    Validation fails unless the payload holds 1 to 3 definitions and exactly
    3 example usages.
    """

    # ``min_length``/``max_length`` are the Pydantic v2 names for list-size
    # constraints; ``min_items``/``max_items`` are deprecated v1 spellings.
    definitions: List[str] = Field(min_length=1, max_length=3)
    example_usages: List[str] = Field(min_length=3, max_length=3)

In [4]:
@retry(ValidationError, tries=5, delay=3)
def openai_process(chinese_word):
    """Ask the OpenAI chat model for definitions and example sentences of a word.

    Args:
        chinese_word: The traditional Chinese word to interpret.

    Returns:
        A dict with the keys ``definitions`` and ``example_usages``, as
        validated and dumped by ``OpenAIValidator``.

    Raises:
        ValidationError: If the model's reply does not match
            ``OpenAIValidator``. The ``retry`` decorator re-invokes the
            function (up to 5 attempts, 3 seconds apart) before letting the
            error propagate.
    """
    # The example output is a single JSON *object* without trailing commas:
    # that is the shape OpenAIValidator parses and the only top-level shape
    # the "json_object" response format can produce.
    user_prompt = f"""
    # Instuction
    Generate a response in a structured JSON format that interprets the traditional Chinese word {chinese_word}. Include a list of definitions and a list of sentences as example usages. Ensure that:

    # JSON Output Requirements
    - The "definitions" key contains 1 to 3 entries that explain the word in English.
    - The "example_usages" key contains exactly 3 sentences demonstrating the use of the word in traditional Chinese sentences.

    # Example output
    - Below is an example output for the traditional Chinese word "辛苦"

    ```json
    {{
        "definitions": [
        "辛苦 is a Chinese word that you would use to express 'hard work', 'tiredness', or 'exertion'.",
        "辛苦 is also used as a way to say 'take care', 'be well', or 'have a good rest' when someone has been working hard."
        ],
        "example_usages": [
        "他每天工作很辛苦。",
        "她辛苦地照顾三个孩子。",
        "辛苦了，谢谢你的帮助。"
        ]
    }}
    ```
    """
    response = client.chat.completions.create(
        model=OPENAI_MODEL,
        response_format={"type": "json_object"},
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
            {"role": "user", "content": user_prompt},
        ],
    )

    response_string = response.choices[0].message.content
    try:
        validated = OpenAIValidator.model_validate_json(response_string)
    except ValidationError as e:
        # Show why this attempt was rejected, then re-raise unchanged so the
        # retry decorator can trigger another attempt.
        print(str(e))
        raise
    return validated.model_dump()


In [5]:
@retry(
    (
        requests.exceptions.HTTPError,
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    ),
    tries=5,
    delay=10,
)
def translate(text, from_lang, to_lang):
    """Translate ``text`` through the remote dictionary translation API.

    Args:
        text: The string to translate.
        from_lang: Source language code sent as ``from_lang``.
        to_lang: Target language code sent as ``to_lang``.

    Returns:
        The decoded JSON body of the response, returned as-is.

    Raises:
        requests.exceptions.HTTPError: If the API answers with any status
            other than 200. This, connection errors and timeouts are retried
            by the decorator (up to 5 attempts, 10 seconds apart) before
            propagating.
    """
    # Base URL of the translation API
    url = "https://api.datpmt.com/api/v1/dictionary/translate"

    # Parameters for the GET request
    params = {
        'string': text,
        'from_lang': from_lang,
        'to_lang': to_lang,
    }

    # A timeout keeps a stalled connection from hanging the whole batch run.
    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        print("Error Translation: " + response.text)
        # Attach the status and response so the failure can be diagnosed.
        raise requests.exceptions.HTTPError(
            f"Translation API returned HTTP {response.status_code}",
            response=response,
        )

    return response.json()

In [6]:
@retry(
    (
        requests.exceptions.HTTPError,
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
    ),
    tries=5,
    delay=10,
)
def transliterate(keyword, from_lang):
    """Transliterate ``keyword`` through the remote dictionary API.

    Args:
        keyword: The text to transliterate.
        from_lang: Source language code sent as ``from_lang``.

    Returns:
        The decoded JSON body of the response, returned as-is.

    Raises:
        requests.exceptions.HTTPError: If the API answers with any status
            other than 200. This, connection errors and timeouts are retried
            by the decorator (up to 5 attempts, 10 seconds apart) before
            propagating.
    """
    # Base URL of the transliteration API
    url = "https://api.datpmt.com/api/v1/dictionary/transliteration"

    # Parameters for the GET request
    params = {
        'keyword': keyword,
        'from_lang': from_lang,
    }

    # A timeout keeps a stalled connection from hanging the whole batch run.
    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        print("Error Transliteration: " + response.text)
        # Attach the status and response so the failure can be diagnosed.
        raise requests.exceptions.HTTPError(
            f"Transliteration API returned HTTP {response.status_code}",
            response=response,
        )

    return response.json()

In [7]:
def text_to_speech(chinese_sentence):
    """Synthesize speech for a Chinese text via the local ``/synthesize`` endpoint.

    Args:
        chinese_sentence: The text to be spoken.

    Returns:
        The raw response body as ``bytes``.

    Raises:
        requests.exceptions.HTTPError: If the service answers with a 4xx/5xx
            status, so that an error page is never stored as audio data.
    """
    # URL of the synthesis endpoint on the local machine
    url = "http://localhost:80/synthesize"

    payload = {
        "language": "ZH",
        "speaker": "ZH",
        "text": chinese_sentence,
        "speed": 0.7,  # must be a positive number
    }

    # ``json=`` serializes the payload and sets the JSON Content-Type header.
    response = requests.post(url, json=payload)
    # Fail loudly instead of returning an error body as if it were audio.
    response.raise_for_status()
    return response.content

# Workflow

In [None]:
for chinese_word in tqdm(word_list):
    print(f"processing chinese word: {chinese_word}")

    # Start from the validated OpenAI payload and add the word-level fields.
    response_dict = openai_process(chinese_word)
    response_dict.update(
        {
            "word": chinese_word,
            "pinyin": transliterate(chinese_word, from_lang="zh-TW"),
            "translation": translate(chinese_word, from_lang="zh-TW", to_lang="en"),
            "pronounciation_audio_bytes": base64.b64encode(
                text_to_speech(chinese_word)
            ).decode('utf-8'),
        }
    )

    # Expand every example sentence into a record carrying its pinyin,
    # English translation and base64-encoded audio.
    usage_list = [
        {
            "chinese": sentence,
            "pinyin": transliterate(sentence, from_lang="zh-TW"),
            "english": translate(sentence, from_lang="zh-TW", to_lang="en"),
            "audio_bytes": base64.b64encode(text_to_speech(sentence)).decode('utf-8'),
        }
        for sentence in response_dict['example_usages']
    ]
    response_dict["example_usages"] = usage_list

    # Serialize the finished record to a JSON file named after the word.
    output_path = f"../data/output/keywords/{chinese_word}.json"
    with open(output_path, "w") as outfile:
        outfile.write(json.dumps(response_dict))
    print(f"successfully output .json for {chinese_word}")

In [9]:
response_dict

{'definitions': ["強基金 refers to a 'strong financial fund' or 'robust investment fund' that is designed to provide good returns and withstand market fluctuations.",
  'In the context of investing, 強基金 indicates a fund with solid management and a reliable track record.',
  '強基金 can also refer to funds that are specifically focused on high growth potential assets.'],
 'example_usages': [{'chinese': '這個強基金的表現超過了市場平均水平。',
   'pinyin': 'Zhège qiáng jījīn de biǎoxiàn chāoguòle shìchǎng píngjūn shuǐpíng.',
   'english': 'This strong fund has outperformed the market average.',
   'audio_bytes': 'UklGRsBwBgBXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YZxwBgD//wAAAAAAAAAA//////////8AAAAAAAAAAP//AAAAAAAA//8AAP////////////8AAP//////////////////////////////////////////////////////////////////////////////////AAD///////8AAP////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [109]:
# In the app, turn a stored base64 string back into raw audio bytes like this.
# NOTE(review): `json_response_dict` is not assigned in any visible cell — confirm where it is defined.
IPython.display.Audio(base64.b64decode(json_response_dict["example_usages"][0]["audio_bytes"].encode('utf-8')))

# Pydantic Consumption for App

In [90]:
# Load the compiled keyword entries from disk.
keywords_path = "../data/output_compiled/keywords/keywords.json"
with open(keywords_path) as keywords_file:
    loaded_keywords = json.loads(keywords_file.read())

In [91]:
class ExampleUsage(BaseModel):
    # One example sentence of a keyword, mirroring the per-sentence records
    # written by the workflow loop.
    chinese: str  # the example sentence itself
    pinyin: str  # transliteration of the sentence
    english: str  # English translation of the sentence
    audio_bytes: str  # base64-encoded audio of the sentence

class WordEntry(BaseModel):
    # One keyword record, mirroring the per-word JSON written by the workflow loop.
    word: str  # the keyword itself
    pinyin:str  # transliteration of the keyword
    translation: str  # English translation of the keyword
    pronounciation_audio_bytes: str  # base64-encoded audio of the keyword
    definitions: List[str]  # English explanations of the keyword
    example_usages: List[ExampleUsage]  # example sentences with their metadata

# Container model holding the full list of word entries.
class DictionaryEntries(BaseModel):
    entries: List[WordEntry]  # every keyword record, in file order

In [92]:
# Validate each raw dict as a WordEntry, then wrap the results in the
# DictionaryEntries container model.
chinese_keywords = DictionaryEntries(
    entries=[WordEntry.model_validate(raw_entry) for raw_entry in loaded_keywords]
)

In [101]:
chinese_keywords.entries[0].example_usages[0]

ExampleUsage(chinese='我一會兒就回來，不要等我。', pinyin="Wǒ yīhuǐ'er jiù huílái, bùyào děng wǒ.", english="I'll be back in a moment, don't wait for me.", audio_bytes='UklGRsB4BABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YZx4BAD//wAAAAAAAAAA//////////8AAAAAAAAAAP//AAAAAAAA//////////////////8AAP//////////////////////////////////////////////////////////////////////////////////AAD/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

In [105]:
# Decode the base64 audio of entry 100's first example sentence and render a player for it.
IPython.display.Audio(base64.b64decode(chinese_keywords.entries[100].example_usages[0].audio_bytes.encode('utf-8')))

# Consolidating json

In [40]:
# Gather every per-word JSON file into one in-memory list.
compiled_keywords = []
keyword_paths = glob("../data/output/keywords/**.json")
for keyword_path in tqdm(keyword_paths):
    with open(keyword_path) as keyword_file:
        word_entry = json.load(keyword_file)
    compiled_keywords.append(word_entry)


100%|██████████| 209/209 [00:03<00:00, 67.41it/s]


In [89]:
# Serialize the in-memory keyword entries to the compiled JSON file.
# NOTE(review): this dumps `loaded_keywords`, not the `compiled_keywords` list
# built in the consolidation cell — confirm that is intended.
compiled_path = "../data/output_compiled/keywords/keywords.json"
with open(compiled_path, "w") as outfile:
    outfile.write(json.dumps(loaded_keywords))

In [47]:
# Read the compiled keyword file back into memory.
compiled_source = "../data/output_compiled/keywords/keywords.json"
with open(compiled_source) as compiled_file:
    loaded_keywords = json.loads(compiled_file.read())

In [58]:
# Replace the stored pinyin of every entry with a fresh transliteration of its word.
for keyword_entry in tqdm(loaded_keywords):
    refreshed_pinyin = transliterate(keyword_entry["word"], from_lang="zh-TW")
    keyword_entry["pinyin"] = refreshed_pinyin


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
100%|██████████| 209/209 [00:43<00:00,  4.76it/s]


In [59]:
# Save the entries with refreshed pinyin to their own JSON file.
refreshed_path = "../data/output_compiled/keywords/loaded_keywords.json"
with open(refreshed_path, "w") as outfile:
    outfile.write(json.dumps(loaded_keywords))

In [62]:
loaded_keywords[-7]["pinyin"]

'Xūqiú'