In [1]:
import json

In [2]:
# Load the tag set. The encoding is given explicitly because the file holds
# non-ASCII tag names and the platform default encoding is not always UTF-8.
with open('workflow_asset/tag_set_example.json', 'r', encoding='utf-8') as f:
    c = json.load(f)

In [3]:
# Keep only the keys of `c` whose value is greater than 1
# (presumably an occurrence count per tag -- TODO confirm), then show how many remain.
tags = [tag_name for tag_name, tag_value in c.items() if tag_value > 1]
len(tags)

156

In [4]:
# Display the full list of selected tags.
tags

['薄荷口味',
 '套餐',
 '茶',
 '無糖',
 '500ml',
 '辣椒',
 '醬油',
 '台灣味',
 '草莓',
 '零食',
 '牛肉',
 '小包裝',
 '巧克力',
 '300g',
 '家福',
 '盒裝',
 '品牌',
 '起司',
 '調味料',
 '牛奶',
 '信用卡',
 '卡片',
 '金融',
 '可愛',
 '咖啡',
 '咖啡因',
 '薄荷',
 '綠茶',
 '口感',
 '兒童',
 '黑色',
 '美味',
 '果汁',
 '水果味',
 '手機殼',
 '義大利',
 '甜品',
 '包裝',
 '美食',
 '黑巧克力',
 '獅王',
 '紅茶',
 '按摩',
 '支付',
 '2023-05-10',
 '玉米',
 '小說',
 '韓系',
 '簡約',
 '冰淇淋',
 '益生菌',
 '汽車配件',
 '電子書',
 '250ml',
 '日本',
 '光泉',
 '原味',
 '食品',
 '韓式',
 '免運',
 '奶油',
 '蘋果',
 '蔬菜',
 '健康飲食',
 '葡萄',
 '有機',
 '飲品',
 '可可',
 '口罩',
 '活性碳',
 '可可粉',
 '蕃茄醬',
 '女款',
 '甜點',
 '隱形眼鏡',
 '花生',
 '薄荷味',
 '蘋果口味',
 '500毫升',
 '蛋白質',
 '糖果',
 '蠟筆小新',
 '醋',
 '不鏽鋼',
 '修護',
 '拿鐵',
 '糕點',
 '韓國',
 '海鮮',
 '肉泥',
 'CIAO',
 '辣味',
 '辣椒醬',
 '保濕',
 '玻尿酸',
 '蜂蜜',
 '大包裝',
 '護膚品',
 '池上',
 '白蘭地',
 '瓶裝',
 '炸雞',
 '書籍',
 '內褲',
 '寬松',
 '焦糖',
 '醬料',
 '夾心',
 '蛋糕',
 '麻辣',
 '麥片',
 '3M',
 '醫用',
 '優惠',
 '便當',
 '五香',
 '超薄',
 '柔濕巾',
 '口味',
 '收納',
 '襪子',
 '果醬',
 '控油',
 '濃縮',
 '香蕉',
 '外送員小費',
 '烏龍茶',
 '美容',
 '香辣',
 '川味麻辣火鍋底料',
 '川味火鍋料理',
 '川味麻辣滷肉',
 

In [5]:
import openai

In [6]:
# Read the API key and drop every whitespace character (newlines, carriage
# returns, spaces, tabs) so a CRLF-terminated or padded key file still works.
with open('api.key', 'r') as f:
    api_key = ''.join(f.read().split())

In [7]:
import re
import json
import time


class GPTGenerater:
    """Thin wrapper around ``openai.ChatCompletion`` for single-prompt requests.

    Creating an instance also assigns ``openai.api_key`` at module level.
    """

    def __init__(self, api_key, model="gpt-3.5-turbo", temperature=0.0):
        """Store the request settings and register the API key with ``openai``.

        Args:
            api_key: Key assigned to ``openai.api_key``.
            model: Model name forwarded with every request.
            temperature: Sampling temperature forwarded with every request.
        """
        self.api_key = api_key
        openai.api_key = self.api_key
        self.model = model
        self.temperature = temperature

    def predict(self, prompt):
        """Send ``prompt`` as one chat message and return the raw response.

        Args:
            prompt: Instruction text to send to the model.

        Returns:
            Whatever ``openai.ChatCompletion.create`` returns, unmodified.
        """
        response = openai.ChatCompletion.create(
            model=self.model,
            # The prompt is an instruction addressed to the model, so it goes
            # out with the "user" role; "assistant" denotes the model's own
            # previous replies.
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
        )
        return response
    
    

# Greedy match from the first "{" to the last "}" on a single line
# ("." does not match newlines, so callers flatten the text first).
JSON_RE = re.compile(r'(\{.*\})')


class JsonExtract:
    """Helpers for pulling a JSON object, and a list stored in it, out of free text."""

    def reply2result(self, reply: str) -> dict:
        """Return the first JSON object embedded in ``reply``.

        Newlines are replaced by spaces before parsing. Parsing is attempted at
        each ``{`` in turn and stops at the end of the first complete object, so
        braces appearing later in the text do not break extraction.

        Args:
            reply: Free text expected to contain a JSON object.

        Returns:
            The decoded object.

        Raises:
            IndexError: ``reply`` contains no ``{...}`` span at all.
            json.JSONDecodeError: A span exists but no ``{`` starts a valid object.
        """
        flattened = reply.replace('\n', ' ')
        if JSON_RE.search(flattened) is None:
            raise IndexError('no JSON object found in reply')

        decoder = json.JSONDecoder()
        last_error = None
        start = flattened.find('{')
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(flattened, start)
            except json.JSONDecodeError as err:
                last_error = err
            else:
                return parsed
            start = flattened.find('{', start + 1)
        # The regex matched, so at least one "{" was tried and failed.
        raise last_error

    def result2products(self, result: dict, target_key='tags') -> list[str]:
        """Return the list stored under ``target_key`` in ``result``.

        Raises:
            AssertionError: The key is missing or its value is not a list.
        """
        assert target_key in result, f'key {target_key!r} missing from result'
        assert isinstance(result[target_key], list), f'value under {target_key!r} is not a list'

        return result[target_key]

    def reply2products(self, reply:str, target_key='tags') -> list[str]:
        """Extract the JSON object from ``reply`` and return its ``target_key`` list.

        Args:
            reply: Free text expected to contain a JSON object.
            target_key: Key whose list value is returned; defaults to ``'tags'``.
        """
        result = self.reply2result(reply)
        return self.result2products(result, target_key=target_key)

In [8]:
# Build the chat wrapper with the loaded key and the class defaults for model and temperature.
sample_generator = GPTGenerater(api_key)

In [9]:
# Prompt template; the {product}, {valid_tags} and {max_tags} placeholders are
# filled in with str.format before the text is sent to the model.
# NOTE(review): rules 2 and 4 both forbid duplicates -- confirm whether both are intended.
instruction = '''
Select tags to the product as "tags", out from valid set.

product: {product}
valid set: {valid_tags}

1. Return the results in JSON format with the following key: "tags".
2. Do not repeat answers.
3. Reply in Taiwan Chinese.
4. Please avoid choosing duplicate tags.
5. Your number of tags shall be no more than {max_tags}.
6. Do not apply when tag is not fit. Tags can also be empty.
7. The tags returned must strongly fit to {product}

The json result is: 
'''


class SampleAugumentation(JsonExtract):
    """Ask a generator to choose tags for a product from a caller-supplied valid set."""

    def __init__(self, sample_generator, instruction):
        """
        Args:
            sample_generator: Object whose ``predict(prompt)`` returns a response
                exposing ``to_dict()['choices'][0]['message']['content']``.
            instruction: Prompt template with ``{product}``, ``{valid_tags}`` and
                ``{max_tags}`` placeholders.
        """
        self.sample_generator = sample_generator
        self.instruction = instruction

    def _sample_generate(self, product:str, valid_tags:list, max_tags:int=10, debug=False):
        """Run one request and return the parsed result with cleaned ``'tags'``.

        The returned tags are restricted to ``valid_tags`` (not to any
        module-level tag collection), de-duplicated in order of appearance, and
        truncated to ``max_tags`` entries.

        Args:
            product: Product name inserted into the prompt.
            valid_tags: Tags the model may choose from.
            max_tags: Upper bound on the number of returned tags.
            debug: Accepted for compatibility; currently unused.

        Returns:
            The dict parsed from the reply, with ``'tags'`` replaced by the cleaned list.

        Raises:
            KeyError: The parsed reply has no ``'tags'`` key.
            Exception: Anything raised by the generator or by ``reply2result``.
        """
        prompt = self.instruction.format(product=product, valid_tags=valid_tags, max_tags=max_tags)
        response = self.sample_generator.predict(prompt)

        content = response.to_dict()['choices'][0]['message']['content']

        ret = self.reply2result(content)

        allowed = set(valid_tags)
        kept = []
        for tag in ret['tags']:
            # Drop tags outside the valid set as well as repeats.
            if tag in allowed and tag not in kept:
                kept.append(tag)
        ret['tags'] = kept[:max_tags]

        return ret

    def sample_generate(self, product:str, valid_tags:list, max_tags:int=10,
                        max_attempts:int=5, retry_delay:float=1.0):
        """Call ``_sample_generate``, retrying after failures.

        Each failure is printed, then the call is retried after ``retry_delay``
        seconds. Once ``max_attempts`` calls have failed the last exception is
        re-raised, so a persistent error cannot loop forever.

        Args:
            product: Product name inserted into the prompt.
            valid_tags: Tags the model may choose from.
            max_tags: Upper bound on the number of returned tags.
            max_attempts: Total number of calls allowed; ``None`` retries without limit.
            retry_delay: Seconds to sleep between attempts.

        Returns:
            The dict returned by ``_sample_generate``.
        """
        debug = False
        attempts = 0
        while True:
            try:
                return self._sample_generate(product=product, valid_tags=valid_tags, max_tags=max_tags, debug=debug)
            except Exception as e:
                attempts += 1
                debug = True
                print(e)
                if max_attempts is not None and attempts >= max_attempts:
                    raise
                time.sleep(retry_delay)

In [10]:
# Tagger that combines the chat wrapper with the prompt template.
sa = SampleAugumentation(sample_generator, instruction)

In [11]:
# Single-product check; calls the non-retrying helper directly, so any error propagates.
product = '統一精品手沖咖啡'
sa._sample_generate(product=product, valid_tags=tags, max_tags=4)

{'tags': ['咖啡']}

In [12]:
import pandas as pd

df = pd.read_csv('data/samples.csv', index_col=0)

# Drop rows whose item name is missing or contains '外送費' / '平台費'.
# The vectorised string test treats missing names as non-matching instead of
# raising TypeError the way `'x' not in value` does on a non-string.
item_names = df['item_name']
contains_fee = (
    item_names.str.contains('外送費', regex=False, na=False)
    | item_names.str.contains('平台費', regex=False, na=False)
)
df = df[item_names.notna() & ~contains_fee]

In [13]:
# Tag a random sample of item names. The sample size is capped at the number of
# available rows (Series.sample raises when asked for more without replacement),
# and the seed is fixed so a re-run tags the same items.
sampled_names = df['item_name'].sample(n=min(100, len(df)), random_state=42)
for i in sampled_names:
    print(i)
    print(sa._sample_generate(product=i, valid_tags=tags, max_tags=4))

【113F 03006007】車用液化石油氣 
{'tags': ['汽車配件']}
捷康人氣紅燒牛肉麵/包(680G/包)【愛買冷凍】 
{'tags': ['牛肉', '冷凍']}
炎記韓式腐乳杏鮑菇(葷食) 
{'tags': ['韓式']}
送好禮 怡寶角落小夥伴超輕量護脊書包 IMSG601PK 
{'tags': []}
4573449105941【bande】日本貼紙型和紙膠帶  
{'tags': ['日本']}
料都亭九州醬油洋芋片 
{'tags': ['零食', '洋芋片', '醬油']}
💥【 LED 銅線燈 】台灣現貨 燈串 艾妮EasyPar 
{'tags': ['台灣現貨']}
大容量旅行包 50克DOOOG 杜格無榖貓糧 田園什錦 杜革 
{'tags': []}
2023052174097375210 
{'tags': ['茶', '零食', '包裝', '食品']}
【mamaway 媽媽餵】竹炭全彈力按摩束腹帶 
{'tags': ['按摩']}
PowerRider L5 USB語音聲控變色燈 白色 
{'tags': ['白色']}
花生牛肉堡單人豪華餐  2023-05-07 e4pf-5v 
{'tags': ['套餐', '牛肉', '美食', '包裝']}
【CS22】3D立體可拆洗車用記憶棉頸靠枕(汽車靠枕) 
{'tags': ['汽車配件', '立體']}
【TP-Link】Deco X95 AX7800 三頻 AI 
{'tags': []}
榛果風味那堤 2023-05-13 y9i0-6kyx 
{'tags': ['咖啡', '榛果']}
【喜番屋】真皮頭層油蠟牛皮3隔層口金包零錢包【LH941】 
{'tags': []}
【歐奇斯】95%有機狗飼料-幼犬-4LB/1.81KG 
{'tags': ['有機']}
🔥正品現貨出清🔥 Real Techniques 美妝蛋 
{'tags': ['現貨']}
氧視加 去蛋白雙氧藥水 適合各種軟、硬式隱形眼鏡 無防腐劑  
{'tags': ['隱形眼鏡']}
雙桔紅茶 
{'tags': ['茶', '無糖', '果汁', '水果味']}
X-FREE 700*18/25C【A/V】美嘴 48mm  
{'tags': []}
60秒速效精華水面膜 