In [1]:
import openai
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Force the default chat model name for this session.
# NOTE(review): presumably read by crewai/langchain for agents that are not
# given an explicit llm — confirm against the installed crewai version.
os.environ.update(OPENAI_MODEL_NAME='gpt-4o-mini')

In [2]:
from pydantic import BaseModel
from typing import List, Literal

# Structured output of the category-selection task: the list of estimating
# factors. Downstream code reads it back through the "categories" key of the
# decoded JSON.
class Categories(BaseModel):
    categories: List[str]  # estimating-factor names proposed for the product

# Start/end position of a keyword inside the review text, kept as evidence
# for an extracted keyword.
# NOTE(review): the unit (characters vs. tokens) and whether `end` is
# inclusive are not fixed anywhere in this notebook — confirm before using
# these values to slice the text.
class Indexes(BaseModel):
    start: int
    end: int

# One sentiment expression attached to a subject keyword: the keyword text,
# where it was found, and its polarity.
class Sentiment(BaseModel):
    sentimentKeyword: str  # sentiment word/phrase (the analyst prompt asks for it verbatim)
    indexes: Indexes  # position of sentimentKeyword in the review text
    sentiment: Literal["positive", "negative", "neutral"]  # polarity label

# One subject keyword found in a review, together with every sentiment
# expression linked to it and the estimating category it was filed under.
class EntitySentiment(BaseModel):
    subjectKeyword: str  # subject (estimating-factor) keyword from the review
    indexes: Indexes  # position of subjectKeyword in the review text
    sentiments: List[Sentiment]  # sentiment expressions about this subject; may be empty
    category: str  # free-form string; not validated against the selected categories

# Structured output of the review-analysis task: all subject keywords found
# in a single review. Downstream code reads it through the "entities" key of
# the decoded JSON.
class EntitySentimentList(BaseModel):
    entities: List[EntitySentiment]

In [8]:
from crewai import Crew, Agent, Task
from crewai_tools import WebsiteSearchTool
from crewai.process import Process
from langchain_openai import ChatOpenAI

# Agent that proposes the estimating factors (review categories) for a
# product. It gets a website search tool, may not delegate to other agents,
# and is capped at 10 iterations.
category_selector = Agent(
    role="Estimating Factor Selector",
    goal="find 4-6 estimating factors for given product category based on what people commonly give compliments or complain about.",
    backstory="""
        Research reviews of given product category online and 
        find 4-6 estimating factors for given product category based on what people commonly give compliments or complain about.
        Complete the list by adding "Etc." in the estimating factor list in order to involve minor elements. 
        Each estimating factor must be defined in short word & in Korean.(DO NOT add explanation.)
    """,
    tools=[WebsiteSearchTool()],
    max_iter=10,
    allow_delegation=False,
    verbose=True,
)

# Agent that extracts subject keywords, their sentiment keywords and the
# polarity of each from a Korean review. It has no tools and may not delegate.
sentiment_analyst = Agent(
    role="Sentiment Analyst",
    goal="find estimating factor keywords(also can be called subject keywords), their sentiment keywords and sentiment(positive/neutral/negative) for them in the text.",
    backstory="""
        You're a review data sentiment analyst.
        Analyze Korean review text to find out customer's satisfaction.
        For this, find estimating factor keywords(also can be called subject keywords),
        and then find the corresponding sentiment keywords and their sentiments(positive/neutral/negative) for each subject keywords in the text.
        Make sure DO NOT modify the keywords in the context.
        Each subject keyword should be categorized into one of given estimating categories, the most proper one.
        Make sure record the indexes(start, end) for each subject/sentiment keywords which can be evidential.

        You don't need to translate Korean in English.
        DO NOT use duplicated words.
        However, if different form of words have the same meaning, you should include them to extract.
    """,
    allow_delegation=False,
    verbose=True,
)

# Task for the selector agent: produce the estimating factors for `{product}`
# (filled in from the kickoff inputs), emitted as JSON following Categories.
select_categories = Task(
    agent=category_selector,
    description="Extract main estimating factors(4-6) to categorize review topics for {product}",
    expected_output="A Categories Object",
    output_json=Categories,
)

# Task for the sentiment analyst: analyse one review. `{review}` and
# `{categories}` are filled in from the kickoff inputs; the answer is emitted
# as JSON following EntitySentimentList.
# NOTE(review): the context task belongs to a different crew, and the
# categories are already passed through `{categories}` — confirm the context
# entry is still needed.
analyze_review = Task(
    agent=sentiment_analyst,
    description="Analyze sentiment by using given Korean review text: {review} (estimating categories: {categories})",
    expected_output="Your answer must include as many as subject keywords(with indexes) and their estimating categories, sentiment keywords(with indexes), and their position.",
    context=[select_categories],
    output_json=EntitySentimentList,
)

# Single-agent, single-task crew that only runs the category selection.
category_selector_crew = Crew(
    agents=[category_selector],
    tasks=[select_categories],
    verbose=2,
)

# Crew that analyses one review per kickoff. It runs in hierarchical mode with
# a gpt-4o manager model and has memory enabled.
review_analyst_crew = Crew(
    agents=[sentiment_analyst],
    tasks=[analyze_review],
    process=Process.hierarchical,
    manager_llm=ChatOpenAI(model="gpt-4o"),
    memory=True,
    verbose=2,
)



In [9]:
import pandas as pd
import os
import operator
import json
from googlesheets_utils import GooglesheetUtils


# Fetch the review texts from the source spreadsheet (range Sheet1!G2:G).
# Exactly one column is requested, so the result unpacks into a single list.
raw_sheet = GooglesheetUtils(spreadsheet_id="1HZtIM1EzWk3mMClaLCNfm6VxbP8QOU6JBAVb8J4R0Xc")
(raw_texts,) = raw_sheet.get_columns('Sheet1!G2:G')

# Keep only the first 10 reviews and pre-create the empty result columns.
review_df = (
    pd.DataFrame({"raw_texts": raw_texts})
    .iloc[:10]
    .assign(subject_keywords='', sentiment_keywords='', total_sentiment='')
)

# Run the category-selection crew for the product and decode its JSON answer.
response = json.loads(
    category_selector_crew.kickoff(inputs={"product": "diaper"})
)

# The factor list sits under the "categories" key of the decoded answer.
estimating_factors = response['categories']

# estimating_factors = [
#     "흡수력",
#     "편안함",
#     "잘 맞음",
#     "피부 민감도",
#     "가격",
#     "기타"
#   ]

# Add one empty column per estimating factor, named by its lower-cased label
# (the analysis loop lower-cases entity categories the same way).
for category in (factor.lower() for factor in estimating_factors):
    review_df[category] = ''

def _dominant_sentiment(counts):
    """Return the label with the highest count, or ``None`` if every count is 0.

    ``max`` returns the first maximal item, so ties resolve in the dict's
    insertion order (positive, then neutral, then negative for the tallies
    built in this cell). Returning ``None`` for an all-zero tally prevents a
    category or review with no sentiment evidence from being labelled
    'positive' by default.
    """
    if not any(counts.values()):
        return None
    return max(counts.items(), key=operator.itemgetter(1))[0]


def _summarize_entities(entities):
    """Flatten the analyst's entity list into display strings and tallies.

    Args:
        entities: list of dicts shaped like ``EntitySentiment``.

    Returns:
        ``(subject_keywords, sentiment_keywords, category_counts)`` where the
        first two are lists of formatted strings and ``category_counts`` maps a
        lower-cased category name to its positive/neutral/negative counts.

    Raises:
        KeyError: an expected field is missing or a sentiment label is unknown.
        TypeError: the payload does not have the expected container types.
    """
    subject_keywords = []
    sentiment_keywords = []
    category_counts = {}
    for entity in entities:
        span = entity['indexes']
        subject_keywords.append(
            f"{entity['subjectKeyword']}({span['start']}, {span['end']})"
        )
        counts = category_counts.setdefault(
            entity['category'].lower(),
            {'positive': 0, 'neutral': 0, 'negative': 0},
        )
        for sentiment in entity['sentiments']:
            span = sentiment['indexes']
            label = sentiment['sentiment']
            sentiment_keywords.append(
                f"{sentiment['sentimentKeyword']}({span['start']}, {span['end']})-{label}"
            )
            counts[label] += 1
    return subject_keywords, sentiment_keywords, category_counts


# Analyse each review with the analyst crew and record the results on its row.
# Iterating over the index labels (instead of range(len(...))) keeps `.loc`
# correct even if the frame does not carry a default 0..n-1 index.
for i in list(review_df.index):
    text = review_df.loc[i, 'raw_texts']
    try:
        result = json.loads(review_analyst_crew.kickoff(
            inputs=dict(
                review=text,
                categories=estimating_factors,
            )
        ))
    except json.JSONDecodeError as e:
        print("=== Cannot parse response to json object. ===")
        print(f"    row {i}: {e}")
        continue

    # Parse the whole response before writing anything, so a malformed answer
    # skips just this row instead of aborting the run or leaving it half-filled.
    try:
        subject_keywords, sentiment_keywords, category_dict = _summarize_entities(result['entities'])
    except (KeyError, TypeError) as e:
        print(f"=== Unexpected response structure for row {i}: {e!r} ===")
        continue

    review_df.loc[i, 'subject_keywords'] = str(subject_keywords)
    review_df.loc[i, 'sentiment_keywords'] = str(sentiment_keywords)

    total_sentiment = {'positive': 0, 'neutral': 0, 'negative': 0}
    for key, counts in category_dict.items():
        dominant = _dominant_sentiment(counts)
        if dominant is None:
            # Subject keyword(s) without any sentiment: leave the cell empty.
            continue
        # A category the selector did not propose becomes a new column here.
        review_df.loc[i, key] = dominant
        total_sentiment[dominant] += 1

    overall = _dominant_sentiment(total_sentiment)
    if overall is not None:
        review_df.loc[i, 'total_sentiment'] = overall

1 columns retrieved
[1m[95m [DEBUG]: == Working Agent: Estimating Factor Selector[00m
[1m[95m [INFO]: == Starting Task: Extract main estimating factors(4-6) to categorize review topics for diaper[00m


[1m> Entering new CrewAgentExecutor chain...[0m
[32;1m[1;3mAction: Search in a specific website  
Action Input: {"search_query": "diaper reviews", "website": "www.amazon.com"}  [0m[95m 

Relevant Content:

[00m
[32;1m[1;3mThought: I need to extract estimating factors based on the reviews of diapers found on Amazon. 
Action: Search in a specific website  
Action Input: {"search_query": "diaper reviews", "website": "www.amazon.com"}  [0m[95m 

I tried reusing the same input, I must stop using this action input. I'll try something else instead.


[00m
[32;1m[1;3mThought: I need to find relevant reviews about diapers to extract estimating factors based on customer feedback. 
Action: Search in a specific website  
Action Input: {"search_query": "diaper reviews", "website":

In [10]:
output_sheet = GooglesheetUtils(spreadsheet_id='1XxJioismnk9XvR3xZ0bkIz0LpmhiNHqFVIDYDn_RbAg')

# Replace missing values with empty strings before export (e.g. cells of
# category columns that were only filled for some rows).
review_df = review_df.fillna('')

# Build the payload: header row first, then one list per review.
header_row = list(review_df.columns)
output_values = [header_row] + [list(record) for _, record in review_df.iterrows()]

# Last expression of the cell, so the update response is displayed.
output_sheet.update_data("Sheet1!A1", values=output_values)

{'spreadsheetId': '1XxJioismnk9XvR3xZ0bkIz0LpmhiNHqFVIDYDn_RbAg',
 'updatedCells': 132,
 'updatedColumns': 12,
 'updatedData': {'majorDimension': 'ROWS',
                 'range': 'Sheet1!A1:L11',
                 'values': [['raw_texts',
                             'subject_keywords',
                             'sentiment_keywords',
                             'total_sentiment',
                             '흡수력',
                             '편안함',
                             '잘 맞음',
                             '피부 민감도',
                             '가격',
                             '기타',
                             'absorbency',
                             'comfort'],
                            ['아기한테 잘 맞는 국민 기저귀 - 아기한테 잘 맞는 국민 기저귀아기한테 잘 맞는 국민 '
                             '기저귀아기한테 잘 맞는 국민 기저귀아기한테 잘 맞는 국민 기저귀아기한테 잘 맞는 국민 '
                             '기저귀아기한테 잘 맞는 국민 기저귀아기한테 잘 맞는 국민 기저귀'],
                            ['저렴하고 좋아요 - 신생아때부터 쓰던 하기스 프리미어 리뉴얼 돼서 프리미어 플러스 '
    