# Libraries

In [2]:
import pandas as pd
import requests
import json
from typing import List, Optional
from pydantic import BaseModel, Field, ValidationError
from glob import glob

from retry import retry
import streamlit as st
import IPython
from tqdm import tqdm
import base64

from openai import OpenAI
# OpenAI client authenticated with the key stored under "OPENAI_API_KEY" in Streamlit secrets.
client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])
# Name of the OpenAI model to use.
# NOTE(review): neither `client` nor `OPENAI_MODEL` is referenced in the cells visible here — confirm they are still needed.
OPENAI_MODEL = "gpt-4o-mini"

In [21]:
# Read the processed sentences, remove every whitespace character from each
# sentence, and expose the rows as a list of {column: value} records.
df = pd.read_csv("../data/processed/chinese_sentences.csv")
df['sentences'] = df['sentences'].map(lambda raw_text: "".join(raw_text.split()))
sentences_dict = df.to_dict("records")

In [22]:
# Preview the first three records to check the loaded structure.
sentences_dict[:3]

[{'chapter': 1, 'sentences': '我姓張，是騰飛公司的業務部經理。'},
 {'chapter': 1, 'sentences': '我的名字叫張宏岳，宏是宏大的宏，岳是岳飛的岳。'},
 {'chapter': 1, 'sentences': '我叫白可凡，可是可以的可，凡是平凡的凡。我的祖籍在福建廈門，但是在廣州出生。'}]

# Steps

For each word:
- get the OpenAI inputs
- get the Google Translate inputs
- get the TTS inputs


```json
[
  {
    "chapter": 1,
    "word": "謝謝大家的支持和配合",
    "pinyin": "Xièxiè dàjiā de zhīchí hé pèihé",
    "translation": "Thank you for your support and cooperation",
    "pronounciation_audio_bytes": "RIFF~vx"
  }
]
```

In [15]:
# Retry on any requests-level failure (HTTPError is a RequestException subclass),
# so transient connection errors and timeouts are retried as well as bad statuses.
@retry(requests.exceptions.RequestException, tries=5, delay=10)
def translate(text, from_lang, to_lang, timeout=30):
    """Translate ``text`` with the datpmt dictionary API.

    Args:
        text: The string to translate.
        from_lang: Source language code (the notebook passes ``"zh-TW"``).
        to_lang: Target language code (the notebook passes ``"en"``).
        timeout: Seconds to wait for the server on a single attempt, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response, returned unchanged.

    Raises:
        requests.exceptions.HTTPError: If the API still answers with a
            non-200 status after all retries.
        requests.exceptions.RequestException: If the request keeps failing
            for another reason (connection error, timeout) after all retries.
    """
    # Base URL of the translation API
    url = "https://api.datpmt.com/api/v1/dictionary/translate"

    # Query-string parameters for the GET request
    params = {
        'string': text,
        'from_lang': from_lang,
        'to_lang': to_lang
    }

    response = requests.get(url, params=params, timeout=timeout)

    # Anything other than 200 counts as a failure and triggers a retry.
    if response.status_code != 200:
        print("Error Translation: " + response.text)
        raise requests.exceptions.HTTPError(
            f"Translation request failed with status {response.status_code}: {response.text}",
            response=response,
        )

    return response.json()

In [16]:
# Retry on any requests-level failure (HTTPError is a RequestException subclass),
# so transient connection errors and timeouts are retried as well as bad statuses.
@retry(requests.exceptions.RequestException, tries=5, delay=10)
def transliterate(keyword, from_lang, timeout=30):
    """Transliterate ``keyword`` with the datpmt dictionary API.

    Args:
        keyword: The text to transliterate.
        from_lang: Language code of ``keyword`` (the notebook passes ``"zh-TW"``).
        timeout: Seconds to wait for the server on a single attempt, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response, returned unchanged.

    Raises:
        requests.exceptions.HTTPError: If the API still answers with a
            non-200 status after all retries.
        requests.exceptions.RequestException: If the request keeps failing
            for another reason (connection error, timeout) after all retries.
    """
    # Base URL of the transliteration API
    url = "https://api.datpmt.com/api/v1/dictionary/transliteration"

    # Query-string parameters for the GET request
    params = {
        'keyword': keyword,
        'from_lang': from_lang
    }

    response = requests.get(url, params=params, timeout=timeout)

    # Anything other than 200 counts as a failure and triggers a retry.
    if response.status_code != 200:
        print("Error Transliteration: " + response.text)
        raise requests.exceptions.HTTPError(
            f"Transliteration request failed with status {response.status_code}: {response.text}",
            response=response,
        )

    return response.json()

In [17]:
def text_to_speech(chinese_sentence, timeout=60):
    """Synthesize speech for a sentence via the local ``/synthesize`` endpoint.

    Args:
        chinese_sentence: The text to be spoken.
        timeout: Seconds to wait for the TTS server before giving up.

    Returns:
        The raw bytes of the response body (the synthesized audio).

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status. Without this check an error body would be returned and
            later stored as if it were audio.
        requests.exceptions.RequestException: On connection errors or timeout.
    """
    # URL of the locally running synthesis endpoint
    url = "http://localhost:80/synthesize"

    # Request payload
    payload = {
        "language": "ZH",  # language to synthesize
        "speaker": "ZH",  # speaker/voice to use
        "text": chinese_sentence,
        "speed": 0.7,  # must be a positive number
    }

    # `json=` serializes the payload and sets the JSON Content-Type header.
    response = requests.post(url, json=payload, timeout=timeout)
    # Fail loudly instead of returning an error page as audio bytes.
    response.raise_for_status()
    return response.content

# Workflow

In [18]:
# Show the first record before it is enriched by the loop below.
sentences_dict[0]

{'chapter': 1, 'sentences': '我姓張，是騰飛公司的業務部經理。'}

In [23]:
# Enrich each sentence record in place with its transliteration, its English
# translation, and base64-encoded speech audio (text, so it can be stored as JSON).
for record in tqdm(sentences_dict):
    print(f"processing sentence {record}")
    chinese_text = record["sentences"]
    record["pinyin"] = transliterate(chinese_text, from_lang="zh-TW")
    record["translation"] = translate(chinese_text, from_lang="zh-TW", to_lang="en")
    audio_bytes = text_to_speech(chinese_text)
    record["pronounciation_audio_bytes"] = base64.b64encode(audio_bytes).decode('utf-8')

  0%|          | 0/33 [00:00<?, ?it/s]

processing sentence {'chapter': 1, 'sentences': '我姓張，是騰飛公司的業務部經理。'}


  3%|▎         | 1/33 [00:00<00:22,  1.39it/s]

processing sentence {'chapter': 1, 'sentences': '我的名字叫張宏岳，宏是宏大的宏，岳是岳飛的岳。'}


  6%|▌         | 2/33 [00:01<00:24,  1.24it/s]

processing sentence {'chapter': 1, 'sentences': '我叫白可凡，可是可以的可，凡是平凡的凡。我的祖籍在福建廈門，但是在廣州出生。'}


  9%|▉         | 3/33 [00:02<00:28,  1.04it/s]

processing sentence {'chapter': 1, 'sentences': '我姓李，叫傑克，泰國人。現在在澳洲銀行工作。'}


 12%|█▏        | 4/33 [00:03<00:26,  1.11it/s]

processing sentence {'chapter': 1, 'sentences': '我今年28歲，祖籍山西大同，畢業於南京大學金融系投資與理財專業，獲得了碩士學位。'}


 15%|█▌        | 5/33 [00:04<00:27,  1.00it/s]

processing sentence {'chapter': 1, 'sentences': '很高興以後能和大家一起工作！希望大家多多指教。'}


 18%|█▊        | 6/33 [00:05<00:25,  1.06it/s]

processing sentence {'chapter': 2, 'sentences': '很高興這次能來北京總部參加新產品的研發會。'}


 21%|██        | 7/33 [00:06<00:21,  1.19it/s]

processing sentence {'chapter': 2, 'sentences': '我來為您分別介紹一下北京總部產品設計部的同事。'}


 24%|██▍       | 8/33 [00:07<00:21,  1.16it/s]

processing sentence {'chapter': 2, 'sentences': '我早就聽說過你了，你參與設計的產品都很暢銷啊！'}


 27%|██▋       | 9/33 [00:07<00:19,  1.21it/s]

processing sentence {'chapter': 2, 'sentences': '此次會議的議題是商議我們公司今年新開發的玩具在內地及海外市場的推廣計劃。'}


 30%|███       | 10/33 [00:08<00:19,  1.19it/s]

processing sentence {'chapter': 2, 'sentences': '那我們兩地的銷售團隊怎樣分工進行推廣？'}


 33%|███▎      | 11/33 [00:09<00:17,  1.26it/s]

processing sentence {'chapter': 3, 'sentences': '我們銷售部也已經把培訓任務佈置下去了。'}


 36%|███▋      | 12/33 [00:09<00:15,  1.37it/s]

processing sentence {'chapter': 3, 'sentences': '謝謝大家的支持和配合。'}


 39%|███▉      | 13/33 [00:10<00:13,  1.46it/s]

processing sentence {'chapter': 3, 'sentences': '說回正經的。'}


 42%|████▏     | 14/33 [00:11<00:12,  1.51it/s]

processing sentence {'chapter': 3, 'sentences': '那我們今天的會議就到這裡，有事我們及時溝通。'}


 45%|████▌     | 15/33 [00:11<00:12,  1.41it/s]

processing sentence {'chapter': 3, 'sentences': '首先要表揚一下市場開發部。'}


 48%|████▊     | 16/33 [00:12<00:11,  1.47it/s]

processing sentence {'chapter': 3, 'sentences': '你們的銷售工作跟進得不夠積極，你們有什麼解釋嗎？'}


 52%|█████▏    | 17/33 [00:13<00:11,  1.37it/s]

processing sentence {'chapter': 3, 'sentences': '我們也被搞得很被動。'}


 55%|█████▍    | 18/33 [00:14<00:10,  1.45it/s]

processing sentence {'chapter': 3, 'sentences': '我們也面臨同樣的問題。'}


 58%|█████▊    | 19/33 [00:14<00:09,  1.54it/s]

processing sentence {'chapter': 3, 'sentences': '你們的困難我都明白，但今天坐在這裡就是為了一起想辦法。'}


 61%|██████    | 20/33 [00:15<00:08,  1.54it/s]

processing sentence {'chapter': 3, 'sentences': '那就這麼決定了。'}


 64%|██████▎   | 21/33 [00:15<00:07,  1.61it/s]

processing sentence {'chapter': 3, 'sentences': '進一步擴大我們的業務，爭取把太原、青島和鄭州拿下來。'}


 67%|██████▋   | 22/33 [00:16<00:07,  1.49it/s]

processing sentence {'chapter': 3, 'sentences': '有一些具體事情我還想向您請教一下。'}


 70%|██████▉   | 23/33 [00:17<00:06,  1.51it/s]

processing sentence {'chapter': 3, 'sentences': '你們的籌備工作進行的怎麼樣了？'}


 73%|███████▎  | 24/33 [00:17<00:05,  1.55it/s]

processing sentence {'chapter': 3, 'sentences': '你說這話就見外了。'}


 76%|███████▌  | 25/33 [00:18<00:05,  1.55it/s]

processing sentence {'chapter': 3, 'sentences': '我們還要多向你們學習呢！'}


 79%|███████▉  | 26/33 [00:20<00:06,  1.08it/s]

processing sentence {'chapter': 4, 'sentences': '我們部門正面臨著巨大的挑戰與壓力。'}


 82%|████████▏ | 27/33 [00:20<00:05,  1.17it/s]

processing sentence {'chapter': 4, 'sentences': '我們的市場份額已經被他們搶去了不少。'}


 85%|████████▍ | 28/33 [00:21<00:03,  1.26it/s]

processing sentence {'chapter': 4, 'sentences': '在接下來的工作中，還要依靠在坐各位的智慧及努力。'}


 88%|████████▊ | 29/33 [00:22<00:02,  1.35it/s]

processing sentence {'chapter': 4, 'sentences': '今天的會就到此結束，散會吧。'}


 91%|█████████ | 30/33 [00:22<00:02,  1.40it/s]

processing sentence {'chapter': 4, 'sentences': '除了張經理以外，所有人員都到了。'}


 94%|█████████▍| 31/33 [00:23<00:01,  1.44it/s]

processing sentence {'chapter': 4, 'sentences': '今天會議的議題是討論收購滬星公司的可行性。'}


 97%|█████████▋| 32/33 [00:23<00:00,  1.50it/s]

processing sentence {'chapter': 4, 'sentences': '我們下週再開會討論吧。'}


100%|██████████| 33/33 [00:24<00:00,  1.35it/s]


In [24]:
# Serialize the enriched sentence records and persist them as one JSON file.
with open("../data/output/sentences/sentences.json", mode="w") as outfile:
    outfile.write(json.dumps(sentences_dict))

In [25]:
# Inspect the second record to check the fields added by the enrichment loop.
sentences_dict[1]

{'chapter': 1,
 'sentences': '我的名字叫張宏岳，宏是宏大的宏，岳是岳飛的岳。',
 'pinyin': 'Wǒ de míngzì jiào zhānghóngyuè, hóng shì hóngdà de hóng, yuè shì yuèfēi de yuè.',
 'translation': 'My name is Zhang Hongyue, Hong means Hongda Hong, and Yue means Yue Fei.',
 'pronounciation_audio_bytes': 'UklGRlydCABXQVZFZm10IBAAAAABAAEARKwAAIhYAQACABAAZGF0YTidCAD//wAAAAAAAAAA//////////8AAAAAAAAAAP//AAAAAAAA//////////////////8AAP//////////////////////////////////////////////////////////////////////////////////AAD///////8AAP////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# Pydantic Consumption for App

In [3]:
# Reload the saved sentence records from disk.
with open("../data/output/sentences/sentences.json") as saved_file:
    sentences_dict = json.loads(saved_file.read())

In [4]:
# Number of sentence records loaded from the JSON file.
len(sentences_dict)

33

In [5]:
class SentencesEntry(BaseModel):
    """A single sentence record as stored in ``sentences.json``."""

    # Chapter number the sentence belongs to.
    chapter: int
    # The Chinese sentence text.
    sentences: str
    # Transliteration of the sentence.
    pinyin: str
    # English translation of the sentence.
    translation: str
    # Base64-encoded audio of the sentence; decode with base64 before playback.
    pronounciation_audio_bytes: str

# Container model: validates a whole list of sentence records at once.
class SentDictionaryEntries(BaseModel):
    """Wrapper holding every ``SentencesEntry`` under the ``entries`` field."""

    entries: List[SentencesEntry]

In [10]:
# Validate each raw record as a SentencesEntry, wrap them in the container
# model, and keep only the resulting list of entries.
validated_entries = [SentencesEntry(**raw_entry) for raw_entry in sentences_dict]
chinese_sentences = SentDictionaryEntries(entries=validated_entries).entries

In [None]:
# Show the first validated entry.
chinese_sentences[0]

In [12]:
chapter_to_query = 1
# Keep only the entries of the requested chapter. The filter now uses
# `chapter_to_query`; it previously hard-coded 1, so changing the variable
# above had no effect.
chapter_1 = [sentence for sentence in chinese_sentences if sentence.chapter == chapter_to_query]


In [32]:
# `chinese_sentences` is already the plain list of entries (the `.entries`
# attribute was unwrapped when the list was built), so index it directly.
# b64decode accepts the ASCII base64 string as-is.
IPython.display.Audio(base64.b64decode(chinese_sentences[0].pronounciation_audio_bytes))

# Consolidating json

In [40]:
# Gather every per-keyword JSON file into a single list.
# glob returns paths in arbitrary order, so they are sorted to make the
# compiled output reproducible between runs.
# NOTE: without recursive=True, "**" matches like "*", so only this directory is searched.
compiled_keywords = []
for json_word in tqdm(sorted(glob("../data/output/keywords/**.json"))):
    # Load one keyword file and append its content
    with open(json_word) as json_file:
        compiled_keywords.append(json.load(json_file))


100%|██████████| 209/209 [00:03<00:00, 67.41it/s]


In [89]:
# Write the consolidated keyword list to a single JSON file.
# NOTE(review): this previously dumped `loaded_keywords`, which is not defined
# in the visible cells; `compiled_keywords` is the list built by the
# consolidation loop. Confirm no intermediate step was intended.
with open("../data/output_compiled/keywords/keywords.json", "w") as outfile:
    json.dump(compiled_keywords, outfile)