In [1]:
# import necessary packages
import pandas as pd
import os
from openai import AzureOpenAI
from dotenv import load_dotenv
import json

In [2]:
# Load the cleaned review export that the rest of the notebook works with.
# The first CSV column becomes the row index (shown as `unique_review_id`
# in the printed frame). pd.read_csv already returns a DataFrame, so the
# result is used directly instead of being re-wrapped in pd.DataFrame(...).
# NOTE(review): the saved output shows a warning pointing at this line --
# presumably a mixed-dtype DtypeWarning; confirm and pin `dtype=` if so.
reviews = pd.read_csv('cleaned_makeup_reviews.csv', index_col=0)

  reviews = pd.read_csv('cleaned_makeup_reviews.csv', index_col=0)


In [3]:
# Sanity check of the loaded frame: the column labels first, then the
# first five rows, each printed on its own line(s).
print(reviews.columns, reviews.head(), sep="\n")


Index(['product_link_id', 'review_id', 'type', 'id', 'ugc_id', 'legacy_id',
       'internal_review_id', 'headline', 'nickname', 'created_date',
       'updated_date', 'rating', 'helpful_votes', 'not_helpful_votes', 'uri',
       'comments', 'locale', 'location', 'bottom_line', 'product_page_id',
       'upc', 'gtin', 'is_staff_reviewer', 'is_verified_buyer',
       'is_verified_reviewer', 'helpful_score'],
      dtype='object')
                  product_link_id  review_id   type           id  ugc_id  \
unique_review_id                                                           
1                               8  521867580  image  521867563.0     NaN   
2                               8  485614895  image  485614799.0     NaN   
3                               8  485250956  image  485250927.0     NaN   
4                               8  485086522  video  485086531.0     NaN   
5                               8  482905142  image  482905180.0     NaN   

                  legacy_id  inter

In [4]:
# Narrow the wide review table down to the two identifier columns and the
# free-text review body; only these are carried forward in `reviewsFocused`.
columns = [
    'product_link_id',
    'review_id',
    'comments',
]
reviewsFocused = reviews.loc[:, columns]

In [5]:
# Report how many rows have no review text in `comments`.
print(reviewsFocused['comments'].isna().sum())

261865


In [6]:
# Remove reviews without comments.
# The check is restricted to the `comments` column so the filter does exactly
# what it says: a bare dropna() would also discard any row that happens to be
# missing product_link_id or review_id.
reviewsFocused = reviewsFocused.dropna(subset=['comments'])
reviewsFocused

Unnamed: 0_level_0,product_link_id,review_id,comments
unique_review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
261850,8,525372766,it was very oily. Would like very much to b...
261851,8,524734115,This is a wonderful foundation for mature skin...
261852,8,524636927,I loved this. Gave me a hint of color with a g...
261853,8,524362071,"This liquid foundation is like essence liquid,..."
261854,8,523262050,Finally! A lightweight face product with just ...
...,...,...,...
314025,1135,525249842,I have to keep this out of my makeup bag becau...
314026,1135,524832825,"While I like the contour, the applicator is me..."
314027,1135,524524028,Ive had this product for months and it never r...
314028,1135,524524101,Amazing and it stays! One dot goes a long way ...


In [7]:
# Select 10 reviews at random to test how the API responds to our prompt.
# A fixed random_state makes the draw repeatable, so re-running the notebook
# sends the same reviews and the saved results stay comparable between runs.
test_reviews = reviewsFocused.sample(10, random_state=42)

In [8]:
# Create prompt info for the API.
# `context` is sent as the system message; `prompt` is placed at the start of
# the user message, followed by the batch of review rows.
context = "You are a data analyst assistant for understanding reviews for products at a cosmetics company."
# The requested fields and the example both use `review_id`, the identifier
# column that is actually present in the rows sent to the model (the
# `unique_review_id` index is not part of that data). The example is written
# as strictly valid JSON (quoted keys) so it agrees with the "valid JSON"
# instruction instead of contradicting it.
prompt = '''your entire response/output is going to consist of a single JSON array, and you will NOT wrap it within JSON md markers. Response should be valid JSON.
            Given the following reviews, state for each review if the product is related to color in any way. Output should look like: review_id, topics,  color_related: boolean. 
            Example: [{ "review_id": 1, "topics": ["some topic"], "color_related": true }]
            '''

In [9]:
# Read the Azure OpenAI settings from the local .env file and build the client
# used for the API calls. override=True lets values from .env replace
# variables that are already set in the process environment.
# Note: this cell only constructs the client; no request is sent here.
load_dotenv(override=True)

client = AzureOpenAI(
    api_version=os.getenv('API_VERSION'),
    azure_endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
    api_key=os.getenv('AZURE_OPENAI_KEY'),
)

In [10]:
import json
import pandas as pd

def _parse_model_json(raw_text):
    """Decode the model's reply as JSON, tolerating a surrounding Markdown code fence."""
    text = raw_text.strip()
    if text.startswith("```"):
        # Drop the opening fence line (``` or ```json) and the closing fence.
        _, _, text = text.partition("\n")
        text = text.rsplit("```", 1)[0]
    return json.loads(text)


def send_reviews_toGPT(data, n, context, prompt, model="gpt-4o", api_client=None):
    """Send the rows of ``data`` to a chat-completion model in batches and collect the parsed replies.

    Each row is serialized as ``"column: value"`` pairs joined by ``", "`` (the
    frame's index is not included), the rows of one batch are joined by
    newlines, and the batch is appended to ``prompt`` in the user message.
    Every reply is expected to be JSON; a JSON array contributes its elements
    to the result, any other JSON value is appended as a single item.

    Processing is best-effort: a batch whose API call fails, or whose reply
    cannot be parsed as JSON, is reported with ``print`` and skipped, so the
    result may hold fewer records than ``data`` has rows.

    Args:
        data: pandas DataFrame with one review per row.
        n: Number of rows per request; must be positive.
        context: Text of the system message.
        prompt: Instruction text that precedes the batch in the user message.
        model: Model/deployment name passed to the API.
        api_client: Object exposing ``chat.completions.create(...)``. When
            omitted, the notebook-level ``client`` is used.

    Returns:
        list: Parsed records from all successful batches, in batch order.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError(f"n must be a positive batch size, got {n}")
    if api_client is None:
        # Fall back to the client created in the notebook's setup cell.
        api_client = client

    collected_contents = []
    for start in range(0, len(data), n):
        # .iloc makes the positional row slicing explicit.
        batch = data.iloc[start:start + n]
        batch_content = "\n".join(
            ", ".join(f"{column}: {value}" for column, value in row.items())
            for _, row in batch.iterrows()
        )

        try:
            response = api_client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": context},
                    {"role": "user", "content": f"{prompt}\n\nData:\n{batch_content}"},
                ],
                temperature=0.0,
            )
            reply = response.choices[0].message.content
        except Exception as e:
            # Best-effort: report the failed request and continue with the next batch.
            print(f'API Call Failed: {str(e)}')
            continue

        print(reply)
        try:
            parsed = _parse_model_json(reply)
        except (AttributeError, TypeError, ValueError) as e:
            # Covers an empty/None reply as well as malformed JSON
            # (json.JSONDecodeError is a ValueError).
            print(f'Could not parse model response as JSON: {e}')
            continue

        # extend() on a dict would add its keys, so only lists are flattened.
        if isinstance(parsed, list):
            collected_contents.extend(parsed)
        else:
            collected_contents.append(parsed)

    # Print all collected contents
    print(collected_contents)

    return collected_contents

# Trial run on the sampled reviews, sending two reviews per request.
results = send_reviews_toGPT(
    data=test_reviews,
    n=2,
    context=context,
    prompt=prompt,
)


[
    {
        "review_id": 514073427,
        "topics": ["application", "usage"],
        "color_related": false
    },
    {
        "review_id": 446482126,
        "topics": ["product quality", "customer service"],
        "color_related": false
    }
]
[
    {
        "review_id": 414037776,
        "topics": ["shade", "shimmer", "skintone", "long lasting", "double cleanse"],
        "color_related": true
    },
    {
        "review_id": 469398844,
        "topics": ["highlighter", "clients", "skin", "lasting"],
        "color_related": false
    }
]
[
    {
        "review_id": 523941344,
        "topics": ["dry skin", "hype"],
        "color_related": false
    },
    {
        "review_id": 507237170,
        "topics": ["shade", "long lasting", "concealer", "brighten effect"],
        "color_related": true
    }
]
[
    {
        "review_id": 486521818,
        "topics": ["blush", "makeup routine", "application technique"],
        "color_related": true
    },
    {
        "re

In [11]:
# Display the combined list of parsed records returned by send_reviews_toGPT.
results

[{'review_id': 514073427,
  'topics': ['application', 'usage'],
  'color_related': False},
 {'review_id': 446482126,
  'topics': ['product quality', 'customer service'],
  'color_related': False},
 {'review_id': 414037776,
  'topics': ['shade', 'shimmer', 'skintone', 'long lasting', 'double cleanse'],
  'color_related': True},
 {'review_id': 469398844,
  'topics': ['highlighter', 'clients', 'skin', 'lasting'],
  'color_related': False},
 {'review_id': 523941344,
  'topics': ['dry skin', 'hype'],
  'color_related': False},
 {'review_id': 507237170,
  'topics': ['shade', 'long lasting', 'concealer', 'brighten effect'],
  'color_related': True},
 {'review_id': 486521818,
  'topics': ['blush', 'makeup routine', 'application technique'],
  'color_related': True},
 {'review_id': 498168998,
  'topics': ['redness reduction', 'makeup setting'],
  'color_related': False},
 {'review_id': 425051392,
  'topics': ['makeup', 'texture', 'color', 'blendable', 'comfortable'],
  'color_related': True},
 

In [12]:
# Tabulate the parsed records (one row per dict, one column per key)
# and print the table as plain text.
df = pd.DataFrame(data=results)
print(df)

   review_id                                             topics  color_related
0  514073427                               [application, usage]          False
1  446482126                [product quality, customer service]          False
2  414037776  [shade, shimmer, skintone, long lasting, doubl...           True
3  469398844              [highlighter, clients, skin, lasting]          False
4  523941344                                   [dry skin, hype]          False
5  507237170  [shade, long lasting, concealer, brighten effect]           True
6  486521818     [blush, makeup routine, application technique]           True
7  498168998                [redness reduction, makeup setting]          False
8  425051392   [makeup, texture, color, blendable, comfortable]           True
9  522365720  [light coverage, sunscreen protection, natural...          False


In [13]:
# Save the test-run table to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf='test_results.csv', index=False)