In [1]:
from tqdm import tqdm

import pandas as pd
pd.set_option('mode.chained_assignment',  None)

import pymysql, os

from openai import OpenAI

from sklearn.model_selection import train_test_split

def live_db_conn():
    conn = pymysql.connect(host='host', user='user', password='password',autocommit=True,cursorclass=pymysql.cursors.DictCursor, db = "db")
    return conn

In [30]:
r_food_d = pd.read_csv('./input/EN_Final_Aug_df.csv',index_col=0)

r_food_d.head()

Unnamed: 0,SPEAKER,SENTENCE,개체명,지식베이스,intent_cat
0,고객,"It's SongbaekdwaejiGukbap, right? I am plannin...","SongbaekdwaejiGukbap, 열 명, 예약, 언제, 전화","SongbaekdwaejiGukbap|상호, ten people|인원",예약_문의
1,고객,"It's NiwaBakery, right? Around March 25th, the...","NiwaBakery, 삼월 이십오일, 모임, 안쪽, 예약, 언제","NiwaBakery|상호, March 25th|예약일, inside|위치",예약_문의
2,고객,"It's CheonhaDonkkaseu, right? Do we need to ma...","CheonhaDonkkaseu, 다섯 명, 예약","CheonhaDonkkaseu|상호, five people|인원",예약_문의
3,고객,"It's Eunjiyakguk, right? If about ten people a...","Eunjiyakguk, 열 명, 회식, 미리, 연락","Eunjiyakguk|상호, company dinner|행사, ten people|인원",예약_문의
4,고객,"It's Hanoimaekjubamgeoriguwoljeom, right? Can ...","Hanoimaekjubamgeoriguwoljeom, 당일 낮","Hanoimaekjubamgeoriguwoljeom|상호, on the same d...",예약_문의


In [1]:
# RT API GPT4
%env OPENAI_API_KEY=OPENAI_API_KEY

open_api_key = os.getenv("OPENAI_API_KEY")

client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get('OPENAI_API_KEY'),
)

In [21]:
cn_df = pd.DataFrame(columns=r_food_d.columns)
error_idx_list = []
idx = 0

for rows in tqdm(r_food_d.values):

    try:

        sentence = rows[1]
        entity_name = rows[2]
        knowledge_base = rows[3]
        knowledge_base_list = knowledge_base.split(', ')
        real_name_list = [kb.split('|')[0] for kb in knowledge_base_list]
        
        intent_cat = rows[4]

        chat_completion = client.chat.completions.create(
            messages=[
                {"role": "user",
                "content": """
                            1단계 - 이중 따옴표로 구분된 제공된 Sentence에 해당하는 영어 문장을 중국어(간체)로 전체 번역하라.
                            "Sentence": ""{}"" 

                            2단계 - 이중 따옴표로 구분된 제공된 Knowledge 영어 단어들을 Sentence 번역에 사용된 동일한 중국어(간체)를 사용해서 번역하라.
                            Knowledge는 따움표 안에 있는 것은 하나의 단어이므로 하나의 세트로 번역하라.
                            Knowledge: ""{}"".
                        
                            번역 결과는 "Sentence: , 
                            Knowledge: 'Knowledge1(영어)->Knowledge1(중국어)', 'Knowledge2(영어)->Knowledge2(중국어)'" 형태로 내보내라.

                            Input 예시는 아래와 같아.
                            Sentence: ""It's SongbaekdwaejiGukbap, right? I am planning to make a reservation for about ten people. When should I call in advance to let you know?"",
                            Knowledge: ""'SongbaekdwaejiGukbap', 'ten people'""
                            
                            Output의 형식은 Example을 참고하라.
                            Example: ""
                            Sentence: '是松柏汤饭吧？我打算预订十人左右。我什么时候应该提前打电话通知您？',
                            Knowledge: 'SongbaekdwaejiGukbap->松柏汤饭', 'ten people->十人' 
                            
                            최종적으로 Output 내용만 답변에 넣어라
                            """.format(sentence, real_name_list)}
            ],
            model="gpt-3.5-turbo"
            )
        
        answer = chat_completion.choices[0].message.content
        answer = answer.replace('(영어)','').replace('(일본어)','')

        if 'Input' in answer:

            answer = answer.split('Output:\n')
      
        translation = answer.split('Sentence: ')[1].split(',\n')[0].strip("'")
        translation = translation.split('\nKnowledge')[0].strip('"')

        translated_kb = answer.split('Knowledge: ')[1].split(', ')
        translated_kb = answer.split('Knowledge: ')[1].replace(']','').replace('[','').strip("'").split(', ')

        kb_dic = {}

        for _ in translated_kb:

            kv = _.split('->')
            key = kv[0].strip("'")
            key = key.rstrip()
            value = kv[1].strip("'")
            value = value.replace('\n','')
            value = value.replace('"','')
            value = value.replace("'",'')
            value = value.strip()

            kb_dic[key] = value

        for _ in knowledge_base_list:

            real_name = _.split('|')[0]

            if real_name in ['#이름#','#주소#']:

                continue

            if real_name in kb_dic.keys():
                
                knowledge_base = knowledge_base.replace(real_name, kb_dic[real_name])
        
        # knowledge_base = knowledge_base.replace('\'\n                            ""', '')
        # knowledge_base = knowledge_base.replace('\'\n                            ""', '')

        new_row = pd.DataFrame([['고객', translation, entity_name, knowledge_base, intent_cat]], columns=r_food_d.columns)
        cn_df = pd.concat([cn_df,new_row], axis=0)
        idx += 1

    except:

        error_idx_list.append(idx)
        idx += 1

100%|██████████| 4136/4136 [00:10<00:00, 379.92it/s]


In [15]:
cn_df['지식베이스'] =  cn_df['지식베이스'].apply(lambda x:x.replace('\n',''))
cn_df['지식베이스'] =  cn_df['지식베이스'].apply(lambda x:x.replace('"',''))
cn_df['지식베이스'] =  cn_df['지식베이스'].apply(lambda x:x.replace("'",''))

In [16]:
cn_df['SENTENCE'] = cn_df['SENTENCE'].apply(lambda x:x.replace('\n',''))
cn_df['SENTENCE'] = cn_df['SENTENCE'].apply(lambda x:x.replace('"',''))
cn_df['SENTENCE'] = cn_df['SENTENCE'].apply(lambda x:x.replace("'",''))
cn_df['SENTENCE'] = cn_df['SENTENCE'].apply(lambda x:x.split('Knowledge')[0].rstrip())

In [17]:
cn_df.head()

Unnamed: 0,SPEAKER,SENTENCE,개체명,지식베이스,intent_cat
0,고객,是豁里吗？我需要提前预订才能去玩滑雪梦社长吗？,"Hoeori, 동태탕, 예약","豁里|상호, 滑雪梦社长|메뉴",예약_문의
0,고객,是李比奥酒店韩食堂吗？您可以接受炸鸭扣的团体订单吗？,"RiberoHotelHansikDang, 김밥, 단체, 주문","李比奥酒店韩食堂|상호, 炸鸭扣|메뉴",예약_문의
0,고객,是Inoheeosenseu吗？我前一天需要订购大约三十卷的FrenchStrawberri...,"Inoheeosenseu, 김밥, 이삼십 줄, 정도, 주문, 전날 주문","Inoheeosenseu|상호, FrenchStrawberriesMacaron|메뉴...",예약_문의
0,고객,是公内供应达特凯欧姆，对吧？我想要一个木萨巴尔Set2in。我何时应该为山上的野餐下订单？,"Gungnakgongyeondatkeom, 일반 김밥, 산, 주문","公内供应达特凯欧姆|상호, 木萨巴尔Set2in|메뉴",예약_문의
0,고객,"是RuKissAteu吗？在这家中餐馆里无限供应的肉是4,000韩元吗？","RuKissAteu, 짜장면, 홀, 사천 원","RuKissAteu|상호, 无限供应的肉|메뉴, 中餐馆|장소, 4,000韩元|금액",매장_배달


In [45]:
cn_df.to_csv('CN_FInal_Aug_df.csv')