## Product category classification

Read the categories from the data source

In [1]:
import pandas as pd

# Load the category sheet (level-1 / level-2 names) from the requirements workbook.
categories = pd.read_excel(
    '分類與貼標優化需求文件.xlsx',
    sheet_name="大中分類類別清單",
)
categories

Unnamed: 0,大分類,Unnamed: 1,中分類,Unnamed: 3,說明
0,1,咖啡類,1,現做咖啡飲品,
1,1,咖啡類,2,罐裝/瓶裝咖啡,
2,1,咖啡類,3,沖泡式咖啡,
3,1,咖啡類,4,濾掛式咖啡,
4,1,咖啡類,5,即溶咖啡,
...,...,...,...,...,...
171,25,菸酒類,172,紅白酒,
172,25,菸酒類,173,香檳氣泡酒,
173,25,菸酒類,174,藥酒,
174,25,菸酒類,175,高粱,


In [2]:
# Replace the auto-generated "Unnamed" headers of the two name columns
# with the identifiers the rest of the notebook uses.
renamed_columns = {
    "Unnamed: 1": "level_1",
    "Unnamed: 3": "level_2",
}
categories = categories.rename(columns=renamed_columns)
categories

Unnamed: 0,大分類,level_1,中分類,level_2,說明
0,1,咖啡類,1,現做咖啡飲品,
1,1,咖啡類,2,罐裝/瓶裝咖啡,
2,1,咖啡類,3,沖泡式咖啡,
3,1,咖啡類,4,濾掛式咖啡,
4,1,咖啡類,5,即溶咖啡,
...,...,...,...,...,...
171,25,菸酒類,172,紅白酒,
172,25,菸酒類,173,香檳氣泡酒,
173,25,菸酒類,174,藥酒,
174,25,菸酒類,175,高粱,


In [3]:
# Unique level-1 category names as a plain Python list.
level_1_names = categories['level_1'].unique()
level_1 = level_1_names.tolist()
level_1

['咖啡類',
 '飲料類',
 '水類',
 '乳品類',
 '豆米漿/植物奶',
 '冰品類',
 '鮮食類',
 '烘焙/甜點類',
 '蛋品類',
 '生鮮食品類',
 '冷凍食品類',
 '保健食品類',
 '點心/零食類',
 '民生食材類',
 '日用百貨類',
 '洗浴清潔/保養類',
 '寵物用品類',
 '戶外休閒用品類',
 '家電類',
 '書本/文教用品類',
 '3C類',
 '服飾鞋包類',
 '精品類',
 '傢俱/家飾類',
 '菸酒類',
 '禮盒類']

## Zero-shot classification

Zero-shot learning is often used in natural language processing (NLP) tasks such as text classification, where the goal is to classify text into a set of predefined categories. 

Here we use one zero-shot model as an example.

In [20]:
# Hugging Face Hub id of the model that is deployed below for zero-shot classification.
model_name = "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli"

In [21]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker

# Execution role handed to the model definition.
role = sagemaker.get_execution_role()

# Container environment: which Hub model to load and which pipeline task to serve.
# Model catalogue: https://huggingface.co/models
hub = dict(
    HF_MODEL_ID=model_name,
    HF_TASK='zero-shot-classification',
)

# Describe the Hugging Face inference model (framework versions + environment).
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
)

# Deploy to a SageMaker inference endpoint backed by one ml.m5.xlarge instance.
zero_predictor = huggingface_model.deploy(
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)


-----!

In [22]:
# Display the name of the endpoint created by the deploy call.
zero_predictor.endpoint_name

'huggingface-pytorch-inference-2023-05-03-01-50-45-347'

In [25]:
import boto3
import json
# Runtime client that send_request uses to invoke the endpoint.
sagemaker_runtime = boto3.client("sagemaker-runtime")

In [28]:
def send_request(input_data):
    """Send ``input_data`` to the zero-shot endpoint and return the decoded reply.

    The payload is serialised to JSON and posted, through the module-level
    ``sagemaker_runtime`` client, to the endpoint named by
    ``zero_predictor.endpoint_name``. The response body is read, decoded and
    parsed back from JSON.

    Args:
        input_data: JSON-serialisable request object.

    Returns:
        The parsed JSON document returned by the endpoint.
    """
    response = sagemaker_runtime.invoke_endpoint(
        EndpointName=zero_predictor.endpoint_name,
        ContentType="application/json",
        Body=json.dumps(input_data),
    )

    # The body is a stream: read it once, then decode and parse.
    raw_body = response["Body"].read()
    return json.loads(raw_body.decode())

In [29]:
# Smoke test: rank four candidate labels for a single product name.
data = {
    'inputs': "員-麻辣牛肉鍋",
    "parameters": {
        'candidate_labels': [
            '服飾鞋包類',
            '洗浴清潔/保養類',
            '保健食品類',
            'OTHER',
        ],
    },
}

send_request(data)

{'sequence': '員-麻辣牛肉鍋',
 'labels': ['保健食品類', 'OTHER', '洗浴清潔/保養類', '服飾鞋包類'],
 'scores': [0.604810357093811,
  0.28858956694602966,
  0.08328559249639511,
  0.02331451140344143]}

In [None]:
# Items to classify, read from a pipe-separated file.
exam_data = pd.read_csv('item_list_onemonth.csv', sep='|')
exam_data

In [None]:
def _label_request(text, labels):
    """Build the endpoint payload that ranks ``labels`` plus an 'Other' fallback for ``text``."""
    return {
        'inputs': text,
        "parameters": {'candidate_labels': labels + ['Other']},
    }


# One result record per processed row; converted to a DataFrame in the next cell.
data = []

for index, row in exam_data.iterrows():
    item_name, item_type, category = row['item_name'], row['item_type'], row['category']
    print(index, item_name, item_type, category)

    # Stage 1: choose among the level-1 categories.
    rlt = send_request(_label_request(item_name, level_1))
    model_level_1, model_level_1_score = rlt['labels'][0], rlt['scores'][0]

    # Stage 2: choose among the level-2 categories listed under the predicted level-1 category.
    level_2_category = categories.loc[categories['level_1'] == model_level_1, 'level_2'].to_list()
    print(f"predicted item type: {model_level_1, model_level_1_score}")
    rlt_2 = send_request(_label_request(item_name, level_2_category))
    model_category, model_category_score = rlt_2['labels'][0], rlt_2['scores'][0]
    print(f"predicted item category: {model_category, model_category_score}")

    data.append({
        "item_name": item_name,
        "item_type": item_type,
        "category": category,
        "model_item_type": model_level_1,
        "model_item_type_score": model_level_1_score,
        "model_category": model_category,
        "model_category_score": model_category_score,
    })

    # Checked after the row is processed, so the row that trips the limit is still recorded.
    if index > 100:
        break

In [None]:
# Tabulate the per-item prediction records collected in `data`.
df = pd.DataFrame(data)

# Write the DataFrame to a CSV file (index=False leaves out the row index column).
df.to_csv("sample_result_zero.csv", index=False)

In [17]:
# Hand-picked product names that are run through both classifiers for comparison.
compare_list = [
    "伊藤園綠茶530ml",
    "光泉蘋果牛乳(小",
    "推-福樂麥芽牛乳",
    "椰妹子泰式奶茶",
    "三十岬梅舖烏龍梅漬",
    "森之果物嚴選天然甘栗",
    "撕吧水蜜桃味軟糖32‧9g",
    "KINYO-多功能電烤盤",
    "代銷–綠豆沙牛乳",
    "波蜜蔬果汁低卡",
    "統一麵包巧克力派司",
    "香蒜丹麥麵包",
]

In [None]:
# Two-stage zero-shot classification of every product name in compare_list:
# first pick a level-1 category, then a level-2 category listed under it.
for item in compare_list:
    # Stage 1: rank all level-1 categories (plus an 'Other' fallback).
    request_data = {
        'inputs': item,
        "parameters": {'candidate_labels': level_1 + ['Other']},
    }
    rlt = send_request(request_data)
    model_level_1 = rlt['labels'][0]
    level_2_category = categories[categories['level_1'] == model_level_1]['level_2'].to_list()

    # Stage 2: rank the level-2 categories for the SAME product. This must use
    # `item`; the earlier version sent row['item_name'], a variable left over from
    # the batch loop, so every sub-category was predicted for the wrong product.
    request_data = {
        'inputs': item,
        "parameters": {'candidate_labels': level_2_category + ['Other']},
    }
    rlt_2 = send_request(request_data)
    print({
        "product_name": item,
        "category": model_level_1,
        "sub_category": rlt_2['labels'][0]
    })
    

### Prepare your prompt

In [None]:
# Print the level-1 names as a quoted, comma-separated list for use in the prompt.
quoted_list = ['"{}"'.format(name) for name in level_1]
category_list_text = ','.join(quoted_list)
print(f"'category list':[{category_list_text}]")

In [None]:
# Print one `level-1:[level-2, ...]` line per level-1 category.
for item in level_1:
    in_category = categories['level_1'] == item
    level_2_category = categories.loc[in_category, 'level_2'].to_list()
    quoted_list = ['"{}"'.format(name) for name in level_2_category]
    sub_category_text = ','.join(quoted_list)
    print(f"{item}:[{sub_category_text}]")

## Using a foundation model from AI21

In [4]:
# Every region uses the same model package name; only the account id inside
# the ARN differs from region to region.
_J2_GRANDE_INSTRUCT_PACKAGE = "j2-grande-instruct-v1-1-033-92fee9d4f82f3b02a76ae298452f7378"

# Region -> account id. Ids are kept as strings because some start with a zero.
_PACKAGE_ACCOUNT_BY_REGION = {
    "us-east-1": "865070037744",
    "us-east-2": "057799348421",
    "us-west-1": "382657785993",
    "us-west-2": "594846645681",
    "ca-central-1": "470592106596",
    "eu-central-1": "446921602837",
    "eu-west-1": "985815980388",
    "eu-west-2": "856760150666",
    "eu-west-3": "843114510376",
    "eu-north-1": "136758871317",
    "ap-southeast-1": "192199979996",
    "ap-southeast-2": "666831318237",
    "ap-northeast-2": "745090734665",
    "ap-northeast-1": "977537786026",
    "ap-south-1": "077584701553",
    "sa-east-1": "270155090741",
}

# Region -> full model package ARN, in the same region order as the table above.
model_package_map = {
    region_name: f"arn:aws:sagemaker:{region_name}:{account_id}:model-package/{_J2_GRANDE_INSTRUCT_PACKAGE}"
    for region_name, account_id in _PACKAGE_ACCOUNT_BY_REGION.items()
}

In [5]:
import json
from sagemaker import ModelPackage
from sagemaker import get_execution_role
from sagemaker import ModelPackage
import sagemaker as sage
import boto3

In [6]:
# Install or upgrade the AI21 SDK with its "SM" extra, then import it.
# NOTE(review): -U pulls the newest release, while later cells call
# ai21.Completion.execute(sm_endpoint=...) — confirm the installed version
# still provides that API, or pin the version.
! pip install -U "ai21[SM]"
import ai21

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


#### Deploy a foundation model in your region

In [7]:
# Look up the model package ARN listed for the region of the current boto3 session.
region = boto3.Session().region_name
if region not in model_package_map:
    # Raise a real exception: `raise ("...")` with a plain string fails with
    # "TypeError: exceptions must derive from BaseException" and hides the message.
    raise ValueError(f"UNSUPPORTED REGION: {region}")

model_package_arn = model_package_map[region]

In [8]:
# Execution role and SageMaker session passed to ModelPackage below.
role = get_execution_role()
sagemaker_session = sage.Session()

# boto3 client for the SageMaker runtime API.
runtime_sm_client = boto3.client("runtime.sagemaker")

In [13]:
# Name the AI21 endpoint is deployed under.
endpoint_name = "j2-grande-instruct-demo"

# MIME type for JSON request bodies.
content_type = "application/json"

# Instance type for the real-time endpoint.
real_time_inference_instance_type = "ml.g5.12xlarge"

In [14]:
# Wrap the model package ARN chosen for this region so it can be deployed.
model = ModelPackage(
    role=role,
    model_package_arn=model_package_arn,
    sagemaker_session=sagemaker_session,
)

# Deploy the model on a single instance, with explicit timeouts for the
# model data download and the container start-up health check.
predictor = model.deploy(
    initial_instance_count=1,
    instance_type=real_time_inference_instance_type,
    endpoint_name=endpoint_name,
    model_data_download_timeout=3600,
    container_startup_health_check_timeout=600,
)

----------------!

In [23]:
# Prompt template for the AI21 model. `{0}` is filled with the product name
# via str.format, so any literal curly brace added to this text must be doubled.
# NOTE(review): "cateory" in the first sentence looks like a typo for "category";
# left untouched because the text is sent to the model as-is.
# NOTE(review): the category and sub-category lists are hard-coded here and have
# to be kept in sync with the spreadsheet by hand.
instruction ="""
Please classify the cateory and sub category based on the product name. 
Product name: {0}
1. Return the results in JSON format with the following keys: "product_name", "category", "sub_category", and "common_labels".
2. The "product_name" field in the JSON should be an exact copy of the given product name.
3. Your task is to classify a product name into one of the following categories: "咖啡類","飲料類","水類","乳品類","豆米漿/植物奶","冰品類","鮮食類","烘焙/甜點類","蛋品類","生鮮食品類","冷凍食品類","保健食品類","點心/零食類","民生食材類","日用百貨類","洗浴清潔/保養類","寵物用品類","戶外休閒用品類","家電類","書本/文教用品類","3C類","服飾鞋包類","精品類","傢俱/家飾類","菸酒類","禮盒類"。 
Please return the category as the "category" field. Do not make up a new category. 
Please note that you should accurately categorize each product based on its name and description, 
using your best judgment to determine which category it belongs to. 
Your response should be specific and accurate, ensuring that each product is correctly classified according to its type.

4. Once you have determined the category of the product, please select the best matching 
subcategory list below and classify the product accordingly. 
This result shall be returned in json "sub_category" field

"咖啡類":["現做咖啡飲品","罐裝/瓶裝咖啡","沖泡式咖啡","濾掛式咖啡","即溶咖啡","咖啡相關食用品"]
"飲料類":["現做茶飲品","茶飲","碳酸/汽水","果汁/果醋飲","運動飲料","養生飲品","能量飲料","特殊風味飲品","沖泡類飲品","現調果汁","其他現調飲品"]
"水類":["礦泉水","氣泡水","機能水"]
"乳品類":["鮮乳類","保久乳","調味乳","優格","優酪乳","發酵乳"]
"豆米漿/植物奶":["豆漿","米漿","燕麥奶","其他植物奶"]
"冰品類":["現做霜淇淋/聖代","冰棒/雪糕","冰淇淋","甜筒","仙草/愛玉","水果冰品","冰塊","現調冰品","其他冰品"]
"鮮食類":["蒸箱食品","地瓜","熱狗","關東煮","米飯類","麵食類","三明治","漢堡","生鮮蔬果/沙拉","小菜/滷味","湯品","水餃/點心","雞胸肉","茶葉蛋/水煮蛋","肉品","其他鮮食商品"]
"烘焙/甜點類":["麵包","蛋糕","中式糕點","其他甜食"]
"蛋品類":["生雞蛋","加工蛋"]
"生鮮食品類":["生鮮蔬菜","生鮮水果","生鮮肉品","生鮮海鮮","豆腐","冷藏調理","速食調理包","其他生鮮食品"]
"冷凍食品類":["冷凍蔬菜","冷凍水果","冷凍肉品","冷凍海鮮","火鍋用料","冷凍水餃/點心類","冷凍微波料理","其他冷凍商品"]
"保健食品類":["保健/養生","美肌/纖體","醫材/藥品"]
"點心/零食類":["餅乾","堅果","肉乾/肉紙","豆乾","魷魚絲","海苔","布丁/果凍","果乾","蔬片乾","即飲甜品","糖果","巧克力","口香糖","其他零食"]
"民生食材類":["料理調味","果醬/抹醬","有機食品","罐頭","各式麵條/拌麵","泡麵","嬰幼兒食品","南北乾貨/雜糧食材","米","油","奶粉","沖泡即食品","其他民生食材"]
"日用百貨類":["民生紙品","洗衣用具","家庭清潔用品","家庭清潔洗劑","衣物清潔","嬰幼兒用品","餐廚用品","汽機車用品類","衛浴用品","晴雨相關用具","其他日用百貨"]
"洗浴清潔/保養類":["女性衛生用品","髮部清潔/保養","身體清潔/保養","口腔保健","彩妝/卸妝","男性用品/保養","美容保養"]
"寵物用品類":["貓食","狗食","其他寵物食品","寵物用品"]
"戶外休閒用品類":["運動健身","戶外露營","行李箱/配件"]
"家電類":["大型家電","廚房家電","生活家電","美容家電","視聽娛樂家電","電池充電相關"]
"書本/文教用品類":["書籍","數位內容","文教用品","報紙"]
"3C類":["手機/通訊/週邊","桌機/筆電/平板","電腦螢幕/硬碟","相機/攝影/DV","週邊/耗材","虛擬/點數商品"]
"服飾鞋包類":["品牌鞋款","流行鞋襪","男女內著","童裝","女裝","男裝","個人服飾配件","運動機能服飾"]
"精品類":["鑽石","珠寶／玉石","精品／飾品","手錶","黃金"]
"傢俱/家飾類":["家飾","寢具","傢俱","床墊","燈具"]
"菸酒類":["菸品","威士忌","清酒","調酒","啤酒","紅白酒","香檳氣泡酒","藥酒","高粱"]
"禮盒類":["禮盒"]

5. assign appropriate labels/tags to the product as "common_labels", such as flavor, brand, key ingredient, and package size 
(if applicable), etc. Please avoid choosing duplicate labels. Your labels shall be no more than four.

The json result is:
"""

In [30]:
# Ask the deployed AI21 model to classify and tag every product in compare_list,
# printing the raw completion text for each one.
for item in compare_list:
    response = ai21.Completion.execute(
        # Use the name the endpoint was deployed under instead of repeating the
        # literal, so renaming the endpoint cannot leave this cell pointing elsewhere.
        sm_endpoint=endpoint_name,
        prompt=instruction.format(item),
        maxTokens=150,
        temperature=0.2,
        numResults=1,
    )

    print(response['completions'][0]['data']['text'])
    print('-------------------')


{
  "product_name": "伊藤園綠茶530ml",
  "category": "飲料類",
  "sub_category": "茶飲",
  "common_labels": [
    "茶飲",
    "伊藤園",
    "綠茶",
    "530ml"
  ]
}
-------------------
{
  "product_name": "光泉蘋果牛乳(小",
  "category": "乳品類",
  "sub_category": "鮮乳類",
  "common_labels": [
    "牛奶",
    "蘋果",
    "小"
  ]
}
-------------------
{
  "product_name": "推-福樂麥芽牛乳",
  "category": "乳品類",
  "sub_category": "鮮乳類",
  "common_labels": [
    "牛奶",
    "麥芽",
    "香草"
  ]
}
-------------------
{
  "product_name": "<0xE6><0xA4><0xB0>妹子泰式奶茶",
  "category": "飲料類",
  "sub_category": "茶飲",
  "common_labels": [
    "味道",
    "<0xE6><0xA4><0xB0>妹子",
    "泰式",
    "奶茶"
  ]
}
-------------------
{
  "product_name": "三十<0xE5><0xB2><0xAC>梅舖烏龍梅漬",
  "category": "飲料類",
  "sub_category": "茶飲",
  "common_labels": [
    "茶飲",
    "茶葉",
    "烏龍梅"
  ]
}
-------------------
{
  "product_name": "森之果物嚴選天然甘栗",
  "category": "飲料類",
  "sub_category": "茶飲",
  "common_labels": [
    "茶飲",
    "天然甘栗",
    "森之果物"
  ]
}
----------------