In [2]:
import pandas as pd
import numpy as np
import os
from groq import Groq 
from dotenv import load_dotenv

load_dotenv()

# Initialize the client
client = Groq(
    api_key=os.environ.get("GROQ_API_KEY"),
)

# Load top_words for topics

In [4]:
df = pd.read_csv('outputs/lsa_1-2gram_by_rating.csv')
df.head()

Unnamed: 0,rating,topic_number,top_words,sample_reviews
0,1.0,0,"['bottle', 'leak', 'product', 'box', 'bleach',...",['very unsafe when i open this box of three cl...
1,1.0,1,"['leak', 'damage', 'damage leak', 'arrive', 'b...",['arrive damage and leak' 'damage and leak'\n ...
2,1.0,2,"['order', 'receive', 'bleach', 'receive order'...",['this not what i order' 'i never receive the ...
3,1.0,3,"['bleach', 'spill', 'box', 'bleach spill', 'bo...",['this be not bleach' 'bleach spill out' 'box ...
4,1.0,4,"['didnt', 'item', 'didnt item', 'spill', 'clea...",['didnt get it' 'didnt get some of my item' 't...


# Cluster while summarizing using LLM

In [47]:
system_message = """Generate coherent phrases that summarize topics of customer reviews based on provided keywords. 

    If there are duplicate keywords, ensure that the phrase does not repeat the same information.
    If there are too many topics to cover in a single phrase, output as many phrases as needed to cover all topics.
    Compare the phrases with each other and combine similar phrases into a single phrase.
    For example, if the phrases are 'item did not arrive in proper condition' and 'order was damaged during shipping',
    combine them into 'item was damaged during shipping'.

    Do not return additional commentary beyond the requested phrases. Directly return the phrases without saying "Here are the summarized phrases for customer reviews based on the provided keywords:".

    """

def process_input(df):
    user_message = f'Input: "{df}'
    response = client.chat.completions.create(
        messages=[
            {
                "role": "system",
                "content": system_message
            },
            {
                "role": "user",
                "content": user_message
            }
        ],
        model="llama3-8b-8192",
    )
    generated_phrase = response.choices[0].message.content
    return generated_phrase

In [16]:
sample_df = df.iloc[:5, 2]
# create a list of all the words in the dataframe
all_words = []
for i in range(len(sample_df)):
    all_words.extend(sample_df[i].split())
all_words = list(set(all_words))
all_words

["['didnt',",
 "'damage',",
 "['leak',",
 "'like',",
 "'return',",
 "'spray',",
 "item',",
 "'arrive',",
 "['order',",
 "'miss',",
 "clean',",
 "bleach',",
 "'watery']",
 "'box",
 "'open',",
 "'lysol',",
 "'clean',",
 "'product',",
 "'arrive",
 "'smell',",
 "'bleach',",
 "order',",
 "damage',",
 "'clorox',",
 "'damage",
 "'receive",
 "'cover']",
 "'miss",
 "'item',",
 "box']",
 "'didnt",
 "['bleach',",
 "'bottle",
 "'receive',",
 "'leak",
 "leak',",
 "'didnt',",
 "'ship']",
 "spill',",
 "['bottle',",
 "'spill',",
 "'place',",
 "'work',",
 "'lysol",
 "'work']",
 "'bleach",
 "'box',",
 "'leak',"]

In [24]:
# clean out brackets and single quotes
all_words = [word.replace('[','').replace(']','').replace("'",'').replace(',', '') for word in all_words]
len(all_words)

48

In [49]:
results = process_input(all_words)
results = results.split(',')
results = [result.replace('"','').replace("[", '').replace(']', '').replace("'", '') for result in results]
results

['titem did not arrive',
 ' item was damaged during shipping',
 ' product did not work properly',
 ' item was received',
 ' product did not clean well',
 ' item was received in proper condition']