In [1]:
import openai

In [2]:
# Load the OpenAI API key from a local file so the secret never appears in the notebook.
with open('api.key', 'r', encoding='utf-8') as f:
    # Remove every whitespace character, not just '\n': a file saved with
    # Windows line endings or trailing spaces would otherwise leave '\r' or ' '
    # inside the key.
    api_key = ''.join(f.read().split())

In [3]:
import re
import json
import time


class GPTGenerater:
    """Thin wrapper around ``openai.ChatCompletion.create`` for single-prompt calls.

    Args:
        api_key: OpenAI API key. It is also written to the module-level
            ``openai.api_key``, so it is shared by every user of the ``openai``
            module in this process.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
    """

    def __init__(self, api_key, model="gpt-3.5-turbo", temperature=0.1):
        self.api_key = api_key
        openai.api_key = self.api_key
        self.model = model
        self.temperature = temperature

    def predict(self, prompt):
        """Send ``prompt`` as a single chat message and return the raw API response.

        Args:
            prompt: Full text of the instruction to send.

        Returns:
            Whatever ``openai.ChatCompletion.create`` returns, unmodified.
        """
        response = openai.ChatCompletion.create(
            model=self.model,
            # The instruction comes from the caller, so it is sent with the
            # "user" role; "assistant" would present it as the model's own
            # earlier output.
            messages=[{"role": "user", "content": prompt}],
            temperature=self.temperature,
        )
        return response
    
    

# Greedy match from the first '{' to the last '}' of a (newline-flattened) reply.
JSON_RE = re.compile(r'(\{.*\})')


class JsonExtract:
    """Helpers for pulling a JSON object, and a list stored in it, out of a text reply."""

    def reply2result(self, reply: str) -> dict:
        """Extract and parse the first JSON object embedded in ``reply``.

        Args:
            reply: Free-form text expected to contain a JSON object, possibly
                surrounded by other text.

        Returns:
            The parsed JSON object as a dict.

        Raises:
            json.JSONDecodeError: (a ``ValueError``) if ``reply`` contains no
                parseable JSON object.
        """
        # Newlines are flattened to spaces because '.' in JSON_RE does not
        # match '\n'; this lets the pattern span a pretty-printed object.
        flattened = reply.replace('\n', ' ')
        match = JSON_RE.search(flattened)
        if match is None:
            raise json.JSONDecodeError('no JSON object found in reply', reply, 0)

        candidate = match.group(1)
        decoder = json.JSONDecoder()
        # The greedy span may include trailing or leading text that merely
        # contains braces. raw_decode parses one complete value starting at a
        # given offset and ignores what follows, so try each '{' in turn.
        start = 0  # candidate always begins with '{'
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(candidate, start)
            except json.JSONDecodeError:
                start = candidate.find('{', start + 1)
            else:
                return parsed

        raise json.JSONDecodeError('no valid JSON object found in reply', candidate, 0)

    def result2products(self, result: dict, target_key='tags') -> list[str]:
        """Return the list stored under ``target_key`` in ``result``.

        Raises:
            AssertionError: if the key is missing or its value is not a list.
        """
        assert target_key in result, f'missing key {target_key!r} in result'
        assert isinstance(result[target_key], list), f'value of {target_key!r} is not a list'

        return result[target_key]

    def reply2products(self, reply: str, target_key='tags') -> list[str]:
        """Parse ``reply`` and return the list stored under ``target_key``.

        ``target_key`` defaults to ``'tags'``, matching ``result2products``.
        """
        result = self.reply2result(reply)
        return self.result2products(result, target_key=target_key)

In [4]:
# Chat-completion client; model and temperature stay at the class defaults.
sample_generator = GPTGenerater(api_key=api_key)

In [1]:
# Prompt template for tag generation. It is filled with str.format and has two
# placeholders: {product} (the product name) and {max_tags} (upper bound on the
# number of tags). The prompt asks for a JSON object with a "tags" key, which is
# what the reply parser looks for.
instruction = '''
Assign appropriate labels/tags to the product as "tags", in flavor, brand, key ingredient, and package size 
(if applicable), etc.  

product: {product}

1. Return the results in JSON format with the following key: "tags".
2. Replied answer should be as diverse as possible.
3. Do not repeat answers.
4. Reply in Taiwan Chinese.
5. Please avoid choosing duplicate tags.
6. Your tags shall be no more than {max_tags}.

The json result is: 
'''


class SampleAugumentation(JsonExtract):
    """Generate tags for a product by prompting a chat model and parsing its JSON reply.

    Args:
        sample_generator: Object exposing ``predict(prompt)``; its return value
            must support ``.to_dict()['choices'][0]['message']['content']``.
        instruction: Prompt template formatted with ``product`` and ``max_tags``.
    """

    def __init__(self, sample_generator, instruction):
        self.sample_generator = sample_generator
        self.instruction = instruction

    def _sample_generate(self, product: str, max_tags: int = 10, debug=False):
        """Run one prompt/parse round trip without any retry.

        Args:
            product: Product name inserted into the prompt.
            max_tags: Tag limit inserted into the prompt.
            debug: Accepted for compatibility with ``sample_generate``; not
                used by this method.

        Returns:
            The dict parsed from the model's reply.

        Raises:
            Exception: whatever the generator call or the reply parsing raises.
        """
        prompt = self.instruction.format(product=product, max_tags=max_tags)
        response = self.sample_generator.predict(prompt)
        content = response.to_dict()['choices'][0]['message']['content']
        return self.reply2result(content)

    def sample_generate(self, product: str, max_tags: int = 10,
                        max_retries: int = 5, retry_delay: float = 1.0):
        """Generate tags for ``product``, retrying on failure a bounded number of times.

        Args:
            product: Product name inserted into the prompt.
            max_tags: Tag limit inserted into the prompt.
            max_retries: Maximum number of attempts (at least one is made).
            retry_delay: Seconds to wait between attempts.

        Returns:
            The dict parsed from the model's reply.

        Raises:
            Exception: the error from the last attempt, once all attempts fail.
        """
        attempts = max(1, max_retries)
        debug = False
        for attempt in range(1, attempts + 1):
            try:
                return self._sample_generate(product=product, max_tags=max_tags, debug=debug)
            except Exception as e:
                debug = True
                print(e)
                # Stop instead of looping forever: permanent failures (bad key,
                # missing dependency, malformed template) never succeed on retry.
                if attempt == attempts:
                    raise
                time.sleep(retry_delay)

NameError: name 'JsonExtract' is not defined

In [None]:
# Bind the prompt template to the chat client; products are tagged through `sa`.
sa = SampleAugumentation(sample_generator=sample_generator, instruction=instruction)

In [7]:
# Try the tagger on a single hand-picked product name.
product = '統一精品手沖咖啡'
sa.sample_generate(product, max_tags=10)

{'tags': ['手沖咖啡', '統一', '精品', '咖啡', '飲品', '咖啡豆', '咖啡因', '香氣', '口感', '包裝']}

In [8]:
import pandas as pd

In [9]:
# Load the product samples and drop rows whose item name is a fee line
# (delivery fee '外送費' or platform fee '平台費') rather than a product.
df_samples = pd.read_csv('data/samples.csv', index_col=0)
# One vectorized pass instead of two .apply calls; na=False treats missing item
# names as "not a fee row" instead of raising TypeError on NaN.
is_fee_row = df_samples['item_name'].str.contains('外送費|平台費', regex=True, na=False)
df_samples = df_samples[~is_fee_row]

In [15]:
# Draw one random row (unseeded, so the pick differs between runs) and display it.
item = df_samples.sample(n=1).iloc[0]
item

item_name    "三得利微醉""白色沙瓦""350ml[350ML/罐] "
item_type                               菸酒類
category                                 啤酒
Name: 212255, dtype: object

In [16]:
# Tag the sampled item with at most five tags.
sa.sample_generate(item['item_name'], max_tags=5)

{'tags': ['三得利', '微醉', '白色沙瓦', '350ml', '罐']}

In [18]:
# Same item with a tighter limit of three tags, for comparison.
sa.sample_generate(item['item_name'], max_tags=3)

{'tags': ['三得利', '微醉', '白色沙瓦']}

In [13]:
from collections import Counter
# Running count of how often each generated tag appears across the sampled products.
c = Counter()

In [None]:

# Tag up to 1000 randomly sampled products and count how often each tag occurs.
n_products = min(1000, len(df_samples))  # sample() raises if asked for more rows than exist
for i, (_, x) in enumerate(df_samples.sample(n_products).iterrows()):
    print(i, end='\r')
    try:
        result = sa._sample_generate(product=x['item_name'], max_tags=3)
        tags = sa.result2products(result)
    except Exception as err:
        # Skip this product. Falling through would either hit an undefined
        # `tags` (first iteration) or count the previous product's tags again.
        print(f'skipped {x["item_name"]!r}: {err!r}')
        continue
    c.update(tags)

In [86]:
# Persist the tag counts as a JSON object mapping tag -> count.
with open('workflow_asset/tag_set_example.json', 'w') as out_file:
    json.dump(c, out_file)