# Libraries

In [1]:
import pandas as pd
import requests
import json
from typing import List
from pydantic import BaseModel, Field, ValidationError

import streamlit as st

from openai import OpenAI

In [2]:
# Load the keyword list and remove every whitespace character inside each word.
# The vectorised .str accessor leaves missing values as NaN, whereas a per-row
# lambda calling x.split() raises AttributeError on the first NaN it meets.
df = pd.read_csv("../data/processed/chinese_keywords.csv")
df['word'] = df['word'].str.replace(r"\s+", "", regex=True)

# Translate

In [7]:

def translate(text, from_lang, to_lang, timeout=10):
    """Translate ``text`` through the datpmt dictionary API.

    Args:
        text: The string to translate.
        from_lang: Source language code, e.g. ``"zh-TW"``.
        to_lang: Target language code, e.g. ``"en"``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection blocks the
            notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        otherwise the string ``"Error: "`` followed by the raw response text.
    """
    url = "https://api.datpmt.com/api/v1/dictionary/translate"

    # Query-string parameters expected by the endpoint
    params = {
        'string': text,
        'from_lang': from_lang,
        'to_lang': to_lang
    }

    response = requests.get(url, params=params, timeout=timeout)

    # Any non-200 status is reported as an "Error: ..." string, not raised
    if response.status_code != 200:
        return "Error: " + response.text

    return response.json()

# Smoke test: translate one sentence from Traditional Chinese to English.
from_language = "zh-TW"
to_language = "en"
text_to_translate = "他每天工作很辛苦。"
translated_text = translate(
    text=text_to_translate,
    from_lang=from_language,
    to_lang=to_language,
)

print(f"Translated Text: {translated_text}")

Translated Text: He works very hard every day.


# Transliterate

In [6]:
def transliterate(keyword, from_lang, timeout=10):
    """Transliterate ``keyword`` through the datpmt dictionary API.

    Args:
        keyword: The text to transliterate.
        from_lang: Language code of ``keyword``, e.g. ``"zh-TW"``.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection blocks the
            notebook indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200;
        otherwise the string ``"Error: "`` followed by the raw response text.
    """
    url = "https://api.datpmt.com/api/v1/dictionary/transliteration"

    # Query-string parameters expected by the endpoint
    params = {
        'keyword': keyword,
        'from_lang': from_lang
    }

    response = requests.get(url, params=params, timeout=timeout)

    # Any non-200 status is reported as an "Error: ..." string, not raised
    if response.status_code != 200:
        return "Error: " + response.text

    return response.json()

# Smoke test: request the transliteration of one Traditional Chinese sentence.
from_language = "zh-TW"
chinese_text = "他每天工作很辛苦。"
pinyin_text = transliterate(keyword=chinese_text, from_lang=from_language)

print(f"Transliterated Text (Pinyin): {pinyin_text}")

Transliterated Text (Pinyin): Error: {"error":"Our systems have detected unusual traffic from your computer network. Please try your request again later."}


In [48]:
# Plain Python list of the values in the "word" column.
word_list = df["word"].to_list()

In [52]:
# Scratch cell: a hand-written sample record with "word", "pinyin" and "translate" keys.
[{"word": "強基金", "pinyin":"Qiáng jījīn", "translate":"Strong fund"}]

[{'word': '強基金', 'pinyin': 'Qiáng jījīn', 'translate': 'Strong fund'}]

"辛苦" is a Chinese word that you would use to express "hard work", "tiredness", or "exertion". It can also be used as a way to say "take care", "be well", or "have a good rest" when someone has been working hard.

**Pinyin:** Xīnkǔ

**Example usage:**
1. 你干了一天的活，辛苦了。 (Nǐ gàn le yī tiān de huó, xīnkǔ le.) - You've worked all day, you must be tired.
2. 感谢你的辛苦工作。 (Gǎnxiè nǐ de xīnkǔ gōngzuò.) - Thank you for your hard work.

# LLM to generate the following

- meanings
- 5 simple example usages

## `openai` response
```json
[
  {
    "definitions": [
      "辛苦 is a Chinese word that you would use to express 'hard work', 'tiredness', or 'exertion'.",
      "辛苦 is also used as a way to say 'take care', 'be well', or 'have a good rest' when someone has been working hard."
    ],
    "example_usages": [
      "他每天工作很辛苦。",
      "她辛苦地照顾三个孩子。",
      "辛苦了，谢谢你的帮助。",
      "这项工作真是太辛苦了。",
      "尽管辛苦，他从不抱怨。"
    ]
  }
]
```

## Final `json`

```json
[
  {
    "word": "辛苦",
    "translation": "Hard",
    "pronounciation_audio_bytes": "RIFF~vx",
    "definitions": [
      "辛苦 is a Chinese word that you would use to express 'hard work', 'tiredness', or 'exertion'.",
      "辛苦 is also used as a way to say 'take care', 'be well', or 'have a good rest' when someone has been working hard."
    ],
    "example_usages": [
      {
        "chinese": "他每天工作很辛苦。",
        "pinyin": "Tā měitiān gōngzuò hěn xīnkǔ.",
        "english": "He works very hard every day.",
        "audio_bytes": "RIFF~vx"
      },
      {
        "chinese": "她辛苦地照顾三个孩子。",
        "pinyin": "Tā xīnkǔ de zhàogù sān gè háizi.",
        "english": "She worked hard to take care of her three children.",
        "audio_bytes": "RIFF~vx"
      },
      {
        "chinese": "辛苦了，谢谢你的帮助。",
        "pinyin": "Xīnkǔle, xièxiè nǐ de bāngzhù.",
        "english": "Thank you for your help.",
        "audio_bytes": "RIFF~vx"
      }
    ]
  }
]
```

In [6]:
# Build the OpenAI client. The API key is looked up under "OPENAI_API_KEY" in
# Streamlit's secrets store instead of being hard-coded in the notebook.
client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])


In [22]:
# Word to look up; it is interpolated into the prompt below.
chinese_word = "接機"

# Prompt sent as the user message. Literal braces in the JSON example are
# doubled because this is an f-string. The example is a single JSON object
# (not an array) so it agrees with the "json_object" response format and with
# the two-key schema this notebook validates, and it is strictly valid JSON
# (no trailing comma). The example sentences use traditional characters to
# match what the prompt asks for.
user_prompt = f"""
# Instruction
Generate a response in a structured JSON format that interprets the traditional Chinese word "{chinese_word}". Include a list of definitions and a list of sentences as example usages.

# JSON Output Requirements
- The response is a single JSON object with exactly two keys: "definitions" and "example_usages".
- The "definitions" key contains 1 to 3 entries that explain the word in English.
- The "example_usages" key contains exactly 3 sentences demonstrating the use of the word in traditional Chinese sentences.

# Example output
- Below is an example output for the traditional Chinese word "辛苦"

```json
{{
  "definitions": [
    "辛苦 is a Chinese word that you would use to express 'hard work', 'tiredness', or 'exertion'.",
    "辛苦 is also used as a way to say 'take care', 'be well', or 'have a good rest' when someone has been working hard."
  ],
  "example_usages": [
    "他每天工作很辛苦。",
    "她辛苦地照顧三個孩子。",
    "辛苦了，謝謝你的幫助。"
  ]
}}
```
"""

In [23]:

# Ask the chat model for a JSON-object reply to the prompt, then show the raw text.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {
            "role": "system",
            "content": "You are a helpful assistant designed to output JSON.",
        },
        {
            "role": "user",
            "content": user_prompt,
        },
    ],
    response_format={"type": "json_object"},
)
print(response.choices[0].message.content)

{
  "definitions": [
    "接機 refers to the act of picking someone up at the airport.",
    "接機 can also imply providing transportation for someone arriving on a flight.",
    "The term is often used in the context of greeting friends or family who arrive by plane."
  ],
  "example_usages": [
    "我今天要去接機，我的朋友從國外回來。",
    "她提前到了，準備在機場接機。",
    "他常常幫忙接機他的同事，讓他們感到受歡迎。"
  ]
}


In [26]:
# Keep the text content of the first choice; it is the JSON string that gets validated.
response_string = response.choices[0].message.content

# Pydantic

Validate JSON output

In [43]:
class OpenAIValidator(BaseModel):
    """Schema for the JSON object the chat model is asked to return.

    Attributes:
        definitions: One to three strings.
        example_usages: Exactly three strings.
    """

    # Pydantic v2 (already required by the `model_validate_json` call used in
    # this notebook) names list-size constraints `min_length`/`max_length`;
    # `min_items`/`max_items` are the deprecated v1 spellings.
    definitions: List[str] = Field(min_length=1, max_length=3)
    example_usages: List[str] = Field(min_length=3, max_length=3)

In [45]:
# Hand-written payload with the same two keys as the expected model reply.
# The validation below does not use it; to check this sample instead of the
# live response, build the model with OpenAIValidator(**mydata).
mydata = {
    "definitions": [
        "辛苦 is a Chinese word that you would use to express 'hard work', 'tiredness', or 'exertion'.",
        "辛苦 is also used as a way to say 'take care', 'be well', or 'have a good rest' when someone has been working hard.",
    ],
    "example_usages": [
        "他每天工作很辛苦。",
        "她辛苦地照顾三个孩子。",
        "辛苦了，谢谢你的帮助。",
    ],
}

# Parse and validate the raw JSON text; on failure only the error report is printed.
try:
    openai_json_validate = OpenAIValidator.model_validate_json(response_string)
except ValidationError as validation_error:
    print(validation_error)

['接機 refers to the act of picking someone up at the airport.',
 '接機 can also imply providing transportation for someone arriving on a flight.',
 'The term is often used in the context of greeting friends or family who arrive by plane.']

# Text to Speech

In [9]:
# Define the URL to your FastAPI endpoint
url = "http://localhost:80/synthesize"

# options of language and speaker
# {'EN': ['EN-US', 'EN-BR', 'EN_INDIA', 'EN-AU', 'EN-Default'],
#  'ES': ['ES'],
#  'FR': ['FR'],
#  'ZH': ['ZH'],
#  'JP': ['JP'],
#  'KR': ['KR']}

# The data to be sent in the POST request
data = {
    "language": "ZH",  # Replace with the language of your choice
    "speaker": "ZH",  # Must be one of the speakers listed above for that language
    "text": "This is a test. Hello world!",
    "speed": 1.2, # must be a positive number
}

# Set the appropriate headers
headers = {
    "Content-Type": "application/json"
}

# Make the POST request. `json=` lets requests serialise the payload itself,
# and the timeout keeps the cell from blocking forever if the server accepts
# the connection but never answers.
response = requests.post(url, json=data, headers=headers, timeout=60)