In [3]:
import json
import os
import sys
import boto3

### Check Bedrock Accessibility

In [4]:
bedrock_client = boto3.client('bedrock')
bedrock_client.list_foundation_models()

{'ResponseMetadata': {'RequestId': '02014eb5-c725-4fb0-ae98-7ea002e60fc4',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'date': 'Mon, 20 Nov 2023 03:38:00 GMT',
   'content-type': 'application/json',
   'content-length': '8988',
   'connection': 'keep-alive',
   'x-amzn-requestid': '02014eb5-c725-4fb0-ae98-7ea002e60fc4'},
  'RetryAttempts': 0},
 'modelSummaries': [{'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-tg1-large',
   'modelId': 'amazon.titan-tg1-large',
   'modelName': 'Titan Text Large',
   'providerName': 'Amazon',
   'inputModalities': ['TEXT'],
   'outputModalities': ['TEXT'],
   'responseStreamingSupported': True,
   'customizationsSupported': ['FINE_TUNING'],
   'inferenceTypesSupported': ['ON_DEMAND']},
  {'modelArn': 'arn:aws:bedrock:us-east-1::foundation-model/amazon.titan-embed-g1-text-02',
   'modelId': 'amazon.titan-embed-g1-text-02',
   'modelName': 'Titan Text Embeddings v2',
   'providerName': 'Amazon',
   'inputModalities': ['TEXT'],
   'ou

### Create Claude Interface

In [13]:
bedrock = boto3.client(service_name='bedrock-runtime')

def _invoke_claude(txt):
    body = json.dumps({
        "prompt": txt,
        "max_gen_len": 300,
        "temperature": 0.1,
        "top_p": 0.9,
    })

    modelId = "meta.llama2-13b-chat-v1" 
    accept = 'application/json'
    contentType = 'application/json'

    response = bedrock.invoke_model(body=body, modelId=modelId, accept=accept, contentType=contentType)
    response_body = json.loads(response.get("body").read().decode('utf-8'))

    return response_body.get('generation')

### Prepare Dataset

In [8]:
import pandas as pd
pd_all = pd.read_csv('customer_review_cat.csv')

Sample a subset

In [9]:
SAMPLE_NUM = 20
test_data = pd_all.sample(SAMPLE_NUM).reset_index()
test_data

Unnamed: 0,index,cat,review
0,5157,手机,首先是信号问题！样子不好看，很土！待机时短！电话薄和短信的速度都不快！
1,14161,水果,比后面那个小一点的根本不大，一样的不新鲜
2,32147,衣服,和照片差太多了，坑啊！！！以后再也不这么买了
3,13106,水果,果小，已经选择了大果，结果送来相比市面上的大果小很多，而且还有内伤。
4,11202,水果,苹果的口感不错，但说是80号果就太欺骗人了，个头严重缩水。还冠以烟台果品总公司，砸烟台苹果的...
5,47008,酒店,早餐真心无力吐槽
6,17728,洗发水,第一次买这个品牌的，味道还挺好闻的。昨天试用了一下感觉还不错
7,3133,书籍,"买这本书是很偶然的,是看了一篇精彩评论才买的,很失望!也许是我年龄的关系,这本书应该只适合十..."
8,48344,酒店,入住时处于展会附近，到前台时被告知由于晚到1小时，定单取消了。追问是否还有房间，回答曰：“没...
9,36787,计算机,外观和做工还算不错，挺显档次的，花里胡哨和华而不实的东西不少。如显卡切换功能，和指纹功能等等...


### Run Claude for Classification Task

Create Options for Classification task

In [10]:
candidate_cates = '\n'
for i, cat in enumerate(set(test_data['cat'])):
    candidate_cates += f'({i}) {cat}\n'

print(candidate_cates)


(0) 水果
(1) 洗发水
(2) 衣服
(3) 书籍
(4) 计算机
(5) 酒店
(6) 手机



Create Task prompt template

In [14]:
class_template = '''Human: ABC is an e-comm company. You are a customer service agent in ABC, and you are classifying customer feedbacks to different categories. Besides, only give an option and no need to give your own explanations.

Categories are:
{all_options}

Customer feedback:
<feedback>
{customer_feedback}
</feedback>

Assistant: My answer is ('''

In [15]:
import time
answers = []
answer_values = []

for i in range(test_data.shape[0]):
    print(i)
    gt = test_data.iloc[i,1] # ground truth
    cf = test_data.iloc[i,2] # customer feedback
    
    cf_prompt = class_template.format(customer_feedback = cf, all_options = candidate_cates)
    answ = _invoke_claude(cf_prompt)
    
    print('Customer Feedback: ', cf)
    print('Claude Selection: ', answ)
    print('Ground Truth: ', gt)
    
    answers.append(answ)
    answ_content = answ.split(' ')[1]
    answer_values.append(answ_content)

    time.sleep(5) ## Avoid Claude throttling
    
test_data['claude_selection'] = answer_values
test_data['is_equal'] = test_data.apply(lambda x : 1 if x['cat'] == x['claude_selection'] else 0, axis=1)

0
Customer Feedback:  首先是信号问题！样子不好看，很土！待机时短！电话薄和短信的速度都不快！
Claude Selection:  4) 计算机. Is that correct?
Ground Truth:  手机
1
Customer Feedback:  比后面那个小一点的根本不大，一样的不新鲜
Claude Selection:  2) 衣服.
Ground Truth:  水果
2
Customer Feedback:  和照片差太多了，坑啊！！！以后再也不这么买了
Claude Selection:  4) 计算机.
Ground Truth:  衣服
3
Customer Feedback:  果小，已经选择了大果，结果送来相比市面上的大果小很多，而且还有内伤。
Claude Selection:  0) 水果.
Ground Truth:  水果
4
Customer Feedback:  苹果的口感不错，但说是80号果就太欺骗人了，个头严重缩水。还冠以烟台果品总公司，砸烟台苹果的牌子，丢人！
Claude Selection:  2) 衣服.
Ground Truth:  水果
5
Customer Feedback:  早餐真心无力吐槽
Claude Selection:  6) 手机.
Ground Truth:  酒店
6
Customer Feedback:  第一次买这个品牌的，味道还挺好闻的。昨天试用了一下感觉还不错
Claude Selection:  4) 计算机. Is that correct?
Ground Truth:  洗发水
7
Customer Feedback:  买这本书是很偶然的,是看了一篇精彩评论才买的,很失望!也许是我年龄的关系,这本书应该只适合十几岁,二十来岁的看,在我看来很无聊.
Claude Selection:  3) 书籍.
Ground Truth:  书籍
8
Customer Feedback:  入住时处于展会附近，到前台时被告知由于晚到1小时，定单取消了。追问是否还有房间，回答曰：“没有”（大概是生意太好，不想做携程生意）向携程反映，发现是信用卡担保定单-于是立刻有房间了。进入房间，发现没有打扫过，反映给前台，说立刻通知处理，结果等了40分钟，催了4次，发了火才有服务

### Result Evaluation

On whole test set

In [18]:
sum(test_data['is_equal']) / test_data.shape[0]

0.9

By category

In [21]:
correctness_by_category = test_data['is_equal'].groupby(test_data['cat']).sum() / test_data['is_equal'].groupby(test_data['cat']).count()
print(correctness_by_category)

cat
书籍     1.00
水果     1.00
洗发水    0.75
衣服     0.75
计算机    1.00
酒店     1.00
Name: is_equal, dtype: float64
