# Imports

In [1]:
import os

# Move the working directory one level up from where the notebook was launched
# (presumably so project-level imports such as `run` resolve — TODO confirm).
# The launch directory is remembered on first execution so that re-running this
# cell does not climb one directory higher each time it is executed.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.dirname(NOTEBOOK_DIR))

In [2]:
import json
from tqdm import tqdm
from collections import defaultdict

import boto3
import numpy as np
from sklearn.metrics import confusion_matrix


from jovis_model.config import Config
from run import ModelRunner

# Bedrock

In [7]:
# Options forwarded under the nested "params" key: the Bedrock model id and
# the max_new_tokens setting.
bedrock_options = dict(
    m_type_id="anthropic.claude-3-haiku-20240307-v1:0",
    max_new_tokens=400,
)

# Top-level settings unpacked into Config: package "llm", task "bedrock",
# with use_hf_model turned off.
params = dict(
    pkg="llm",
    task="bedrock",
    use_hf_model=False,
    params=bedrock_options,
)

config = Config(**params)

# Build the runner in inference mode from the config above.
runner = ModelRunner(config=config, mode="inference")

04:45:34:PM:INFO: data module loaded
04:45:34:PM:INFO: model module loaded


# Keyword Extractor

### Test set generation

In [88]:
# Prompt asking the model to generate ONE Korean test utterance for the keyword
# extractor, together with the keywords it used, as JSON wrapped in <output> tags.
# The JSON skeleton below is kept syntactically valid (commas between sibling
# objects, no trailing commas) so the model is not nudged toward output that
# json.loads would reject.
# NOTE(review): "har_accessories" in the category list looks like a typo for
# "hair_accessories" — left unchanged in case it mirrors the real catalog key; confirm.
gen_prompt = """
You are an expert to generate utterances which can be used as an input to a e-commerce chatbot system.
Generally, these utterances are related to fashion items worn by celebrities or actors in tv shows.
Your task is to generate utterances using some keywords.

Here is the list of keywords:
<keyword>
1. target-item: product categories the user is looking for. Available high-level categories are accessories, bags, belts, glasses, gloves, har_accessories, hats, jewelries, key_ring, outwears, pants, scarf/muffler, shoes, skirts, socks, swimwears, tie, tops, watches, wholebodies.
2. target-attribute: product attributes of the target item. Available high-level attributes are color, material, print, neckline, length, season, place, occasion, style.
3. program-name: name of the TV show
4. actor-name: name of the actor or actress
5. brand: brand of the product. (e.g., Gucci, Nike, etc.)
6. price: price or price range
</keyword>

<scratchpad>
Before generating utterance, take a moment to select keywords in high-level and those values.
</scratchpad>

Here is the output format you should consider.
<output_format>
Provide your output in the following JSON format only. Enclose your JSON output with `<output>` and `</output>`:
<output>
{
    "utterance": <value>,
    "used_keywords": {
        "target-item": {
            <high-level>: <value>,
            <high-level>: <value>
        },
        "target-attribute": {
            <high-level>: <value>,
            <high-level>: <value>
        },
        "program-name": <value>,
        "actor-name": <value>,
        "brand": <value>,
        "price": <value>
    }
}
</output>
There's some rule:
1. ONLY one utterance
2. NO KEYWORD informations in utterance.
3. There are multiple options for target-item and target-attribute.
4. For unused keywords, <value> MUST be "NONE".
5. All the <value> MUST be in Korean.
</output_format>
"""

In [107]:
# Single-turn conversation: the generation prompt is sent as the only user message.
dialogs = [dict(role="user", content=gen_prompt)]

In [108]:
# Pass the dialog to the runner as `sample_inputs` and keep whatever it returns.
res = runner.run(sample_inputs=dialogs)

In [109]:
# Show the raw response; presumably a string, printed so its line breaks render — confirm.
print(res)

<output>
{
    "utterance": "모니카 벨루치가 'Friends' 에서 입고 있던 그 갈색 가죽 숄더백은 브랜드 'Gucci'의 제품이었어요. 가격은 보통 200만원대 정도로 알고 있습니다.",
    "used_keywords": {
        "target-item": {
            "bags": "숄더백"
        },
        "target-attribute": {
            "color": "갈색",
            "material": "가죽"
        },
        "program-name": "Friends",
        "actor-name": "모니카 벨루치",
        "brand": "Gucci",
        "price": "약 200만원대"
    }
}
</output>


### Evaluation

In [None]:
# Prompt template for the keyword-extraction step being evaluated.
# This is a plain (non-f) string, so {{CHAT_HISTORY}}, {{REFINED_QUERY}} and
# {{TODAY_DATE}} are literal placeholders; the code that fills them is not in
# this cell — presumably substituted before the prompt is sent; TODO confirm.
# The model is asked to answer with one JSON object wrapped in <output> tags,
# using empty strings for keywords that are absent from the query.
test_prompt = """
You are an agent for an e-commerce chatbot system. Your task is to extract relevant keywords from the user's query and chat history to help the specialized agents search for appropriate products.

Here is the chat history:
<chat_history>
{{CHAT_HISTORY}}
</chat_history>

The current user query is:
<current_user_query>
{{REFINED_QUERY}}
</current_user_query>

Today's date is:
<today_date>
{{TODAY_DATE}}
</today_date>

<keyword_extraction>
Next, extract the following relevant keywords from the user's query:
1. target-item: the item the user is looking for (e.g., t-shirt, jeans, shoes). target-item is a category of clothing, shoes, bags, etc., and brands or other nouns are not target items. If you find multiple "target-item" make sure to fill in all the additional information for each one.
2. target-attribute: attributes of the target item (e.g., color, material, print, neckline, length, occasion, mood, style)
3. program-name: name of the TV show
4. actor-name: name of the actor or actress (e.g 오연서, 김종민)
5. looking-for-popular: This value is True or False (boolean). For example, when a user asks about popular items, such as what are trend t-shirts these days?, this value is True.
6. brand: brand of the product. (e.g., Gucci, Nike, etc.)
7. price: price or price range the user asked for. the format should be <min_price>-<max_price> (e.g., below 1000 krw: -1000 / above 10000krw: 10000- / around 100000: 50000-150000)
8. time: time range of tv show. given date of <today_date>, extract asked time period in the format of YYYYMMDD-YYYYMMDD
Each keyword should have an empty string value if there is no corresponding information in the user's query. Do not make things up.
</keyword_extraction>

<output_format>
Provide your output in the following JSON format only. Enclose your JSON output with `<output>` and `</output>`:
<output>{"role": "keyword_extractor", "target-item": [<item1>, <item2>], "target-attribute": {<item1>: {"looking-for-popular": <value>, "program-name": <value>, "actor-name": <value>, "brand": <value>, "price": <value>, "time": <value>, <attribute1>: <value>, <attribute2>:<value>, ...}, <item2> : {"looking-for-popular": <value>, "program-name": <value>, "actor-name": <value>, "brand": <value>, "price": <value>, "time": <value>, <attribute1>: <value>, <attribute2>:<value>, ...}}}</output>
</output_format>
"""