## OpenAI Embedding Function

In [None]:
from openai._client import OpenAI
import pandas as pd
import tiktoken

In [28]:
# No key is passed on purpose: the client then falls back to the OPENAI_API_KEY
# environment variable, which keeps the credential out of the notebook. An empty
# string would override that fallback and make every request fail authentication.
client = OpenAI()

def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for a single piece of text.

    Newlines in ``text`` are replaced by spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model passed to the API.

    Returns:
        The full response object from ``client.embeddings.create``; the vector
        itself is read by callers via ``response.data[0].embedding``.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(model=model, input=[single_line])
    return response

In [23]:
# Load the reviews, keep the columns of interest, drop incomplete rows and
# build the "combined" text (title + content) in a single chain.
input_datapath = "archive/Reviews.csv"  # to save space, we provide a pre-filtered dataset
df = (
    pd.read_csv(input_datapath, index_col=0)
    .loc[:, ["Time", "ProductId", "UserId", "Score", "Summary", "Text"]]
    .dropna()
    .assign(
        combined=lambda reviews: (
            "Title: " + reviews["Summary"].str.strip()
            + "; Content: " + reviews["Text"].str.strip()
        )
    )
)
# Preview the first two rows.
df.head(2)

Unnamed: 0_level_0,Time,ProductId,UserId,Score,Summary,Text,combined
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,1303862400,B001E4KFG0,A3SGXH7AUHU8GW,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,Title: Good Quality Dog Food; Content: I have ...
2,1346976000,B00813GRG4,A1D87F6ZCVE5NK,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,Title: Not as Advertised; Content: Product arr...


In [29]:
# Embed the combined text of the review whose index label (Id) is 1.
first_emb = get_embedding(df.loc[1, "combined"])

In [41]:
# Display the embedding vector of the first item in the response (only one text was sent).
first_emb.data[0].embedding

[-0.0003650814469438046,
 0.002390077570453286,
 -0.012959975749254227,
 -0.0033447782043367624,
 -0.004680360667407513,
 0.026678388938307762,
 -0.004746890626847744,
 -0.04601190611720085,
 -0.03323821350932121,
 -0.026412270963191986,
 0.0022021313197910786,
 0.05205279961228371,
 -0.02195478416979313,
 -0.013492212630808353,
 -0.02561391517519951,
 0.01088425051420927,
 0.03994440287351608,
 0.006440069992095232,
 -0.007171896286308765,
 -0.007092060521245003,
 -0.027782781049609184,
 0.01454338151961565,
 0.0007085407851263881,
 0.0019127274863421917,
 -0.009487127885222435,
 0.003692395519465208,
 0.01208843756467104,
 -0.012381168082356453,
 -0.0013779953587800264,
 0.026492105796933174,
 0.026984425261616707,
 0.011443099938333035,
 -0.02087700366973877,
 -0.006879165768623352,
 0.015860669314861298,
 -0.002634574193507433,
 0.007344873156398535,
 -0.019559716805815697,
 0.028660973533988,
 -0.02206123247742653,
 0.0051826597191393375,
 0.00587456813082099,
 0.01519537251442670

In [36]:
# Number of components in the returned embedding vector.
len(first_emb.data[0].embedding)

1536

In [47]:
def num_tokens_from_string(string: str, encoding_name: str) -> tuple:
    """Tokenize a string and report how many tokens it contains.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to use, e.g. "cl100k_base".

    Returns:
        A ``(num_tokens, tokens)`` tuple: the token count followed by the
        token ids produced by the encoding.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Encode once and reuse the result for both the count and the ids.
    tokens = encoding.encode(string)
    return (len(tokens), tokens)

In [48]:
# Example: token count and token ids for a short phrase under the cl100k_base encoding.
num_tokens_from_string("I love you", "cl100k_base")

(3, [40, 3021, 499])

## OpenAI Assistant for extracting information from a JSON file

In [3]:
from openai._client import OpenAI
import time
import json
# No key is passed on purpose: the client then falls back to the OPENAI_API_KEY
# environment variable, which keeps the credential out of the notebook. An empty
# string would override that fallback and make every request fail authentication.
client = OpenAI()

In [275]:
# Function-calling schema handed to the assistant. It declares a single function,
# "parseJsonFields", with five required string parameters (product, contact,
# ingredient, other-ingredient and nested-ingredient info). Each parameter's
# description lists the fields to extract and where in the JSON they come from.
# NOTE(review): later cells json.loads() these string arguments, so the model is
# presumably expected to return JSON-encoded text in each one — confirm.
tools = [
    {
        "type": "function",
        "function": {
            "name": "parseJsonFields",
            "description": "Extract fields from a JSON file",
            "parameters": {
                "type": "object",
                "properties": {
                    "product_info_fields": {
                        "type": "string",
                        "description": """
        The following product information should be extracted from the JSON file and please use it as format:
        - Source: DSLD
        - Product Type: Extracted from the 'langualCodeDescription' field within 'productType'
        - External ID: Extracted from the 'id' field
        - Full Name: Extracted from the 'fullName' field
        - Bundle Name: Extracted from the 'bundleName' field
        - Brand Name: Extracted from the 'brandName' field
        - Servings Per Container: Extracted from the 'servingsPerContainer' field
        - Net Contents: Extracted from the 'netContents' field
        - Physical State: Extracted from the 'langualCodeDescription' field within 'physicalState'
        - Events: Extracted from the 'events' field
        - Target Groups: Extracted from the 'targetGroups' field
        - Statements: Extracted from the 'statements' field
        - User Groups: Extracted from the first item's 'dailyValueTargetGroupName' in the 'userGroups' array
        - Off Market: Extracted from the 'offMarket' field
""",
                    },
                    "contact_info_fields": {
                        "type": "string",
                        "description": """
       The following contact information should be extracted from the JSON file and please use it as format:
        - Source: DSLD
        - External ID: Extracted from the 'contactId' field
        - Name: Extracted from the 'name' field in 'contactDetails'
        - Contact Types: Extracted from the 'types' array
        - Address: Extracted from the 'streetAddress' field in 'contactDetails'
        - City: Extracted from the 'city' field in 'contactDetails'
        - State: Extracted from the 'state' field in 'contactDetails'
        - Country: Extracted from the 'country' field in 'contactDetails'
        - Zipcode: Extracted from the 'zipCode' field in 'contactDetails'
        - Phone Number: Extracted from the 'phoneNumber' field in 'contactDetails'
        - Email: Extracted from the 'email' field in 'contactDetails'
        - Web Address: Extracted from the 'webAddress' field in 'contactDetails'
        - Notes: Extracted from the 'text' field, if available
""",
                    },
                    "ingredient_info_fields": {
                        "type": "string",
                        "description": """
        The following ingredient information should be extracted from the JSON file and please use it as format:
        - Ingredient Group: Extracted from the 'ingredientGroup' field, default to "N/A" if not present.
        - External ID: Extracted from the 'ingredientId' field.
        - Full Name: Extracted from the 'name' field.
        - Description: Extracted from the 'description' field.
        - Alternate Names: Extracted from the 'alternateNames' array.
        - Category: Extracted from the 'category' field.
        - Forms: Extracted from the 'forms' field.
        - Source: Always "DSLD".
        - Approved: Always set to True.
        - Notes: Extracted from the 'notes' field, default to an empty string if not present.

""",
                    },
                    "other_ingredient_info_fields": {
                        "type": "string",
                        "description": """
        The following other ingredient information should be extracted from the JSON file and please use it as format:
        - Ingredient Group: Extracted from the 'ingredientGroup' field, default to "N/A" if not present.
        - External ID: Extracted from the 'ingredientId' field.
        - Full Name: Extracted from the 'name' field.
        - Category: Extracted from the 'category' field.
        - Forms: Extracted from the 'forms' field.
        - Source: Always "DSLD".
        - Approved: Always set to True.
        - Notes: Always "Other Ingredient".

""",
                    },
                    "nested_ingredient_info_fields": {
                        "type": "string",
                        "description": """
        The following nested ingredient information should be extracted from the JSON file and please use it as format:
        - Ingredient Group: Extracted from the 'ingredientGroup' field, default to "N/A" if not present.
        - External ID: Extracted from the 'ingredientId' field.
        - Full Name: Extracted from the 'name' field.
        - Description: Extracted from the 'description' field.
        - Alternate Names: Extracted from the 'alternateNames' array.
        - Category: Extracted from the 'category' field.
        - Parent Ingredient External ID: Extracted from the 'ingredientId' field where the nestedRows is.
        - Parent Ingredient Full Name: Extracted from the 'name' field where the nestedRows is.
        - Forms: Extracted from the 'forms' field.
        - Source: Always "DSLD".
        - Approved: Always set to True.
        - Notes: Extracted from the 'notes' field, default to an empty string if not present.

""",
                    },
                },
                "required": [
                    "product_info_fields",
                    "contact_info_fields",
                    "ingredient_info_fields",
                    "other_ingredient_info_fields",
                    "nested_ingredient_info_fields"
                ],
            },
        },
    }
]

In [276]:
# Register an assistant that is given the function schema in `tools` and a
# system-style instruction describing the extraction task.
assistant = client.beta.assistants.create(
    model="gpt-4-1106-preview",
    name="JSON File Parse Assistant",
    tools=tools,
    instructions="You are an assistant designed to extract specific details from JSON files. Focus on extracting main ingredients, other ingrdients, nested ingredients, contact information, and finished product details.",
)

In [261]:
json_file_path = "formula_#83020.json"  # Replace with the actual file path
# Read as UTF-8 explicitly so non-ASCII characters in the label data (e.g. "®")
# decode the same way regardless of the platform's default encoding.
with open(json_file_path, "r", encoding="utf-8") as file:
    formula_json = json.load(file)

In [277]:
# Create a new, empty thread; the user message and the run below are attached to it.
thread = client.beta.threads.create()

In [278]:
# Post the request together with the document. The dict is serialized with
# json.dumps so the model receives real JSON (double quotes, null/true/false)
# rather than the Python repr of the dict; ensure_ascii=False keeps non-ASCII
# characters readable instead of \u-escaped.
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content= f"I need to extract product information, contact information, main ingredient information, other ingredient information, nested ingredient information from the JSON file:  \n{json.dumps(formula_json, ensure_ascii=False)}\n"
)

In [279]:
# Start a run of the assistant on the thread, with extra per-run instructions.
run = client.beta.threads.runs.create(
    assistant_id=assistant.id,
    thread_id=thread.id,
    instructions="Please address the user as Jason Sun. The user has a premium account.",
)

In [265]:
# Re-fetch the run and show its current status.
run = client.beta.threads.runs.retrieve(run_id=run.id, thread_id=thread.id)
run.status

'in_progress'

In [280]:
import time

# Poll every 5 seconds while the run is still pending, printing the elapsed
# seconds and the status each time. The loop stops on any non-pending status
# (not only "requires_action"), so a run that ends as completed, failed,
# cancelled or expired no longer makes this cell spin forever.
pending_statuses = ("queued", "in_progress", "cancelling")
i = 0
run = client.beta.threads.runs.retrieve(
    thread_id=thread.id,
    run_id=run.id,
)
print(i*5, run.status)
while run.status in pending_statuses:
    time.sleep(5)
    i += 1
    run = client.beta.threads.runs.retrieve(
        thread_id=thread.id,
        run_id=run.id,
    )
    print(i*5, run.status)

0 in_progress
0 in_progress
5 in_progress
10 in_progress
15 in_progress
20 in_progress
25 in_progress
30 in_progress
35 in_progress
40 in_progress
45 requires_action


In [281]:
# Tool calls the run is waiting on (despite the singular name, this is indexed as a sequence below).
tool_call = run.required_action.submit_tool_outputs.tool_calls

In [290]:
# The fifth tool call's arguments are JSON, and the "nested_ingredient_info_fields"
# value inside them is itself a JSON-encoded string, hence the two json.loads calls.
json.loads(json.loads(tool_call[4].function.arguments)["nested_ingredient_info_fields"])

{'Ingredient Group': 'Proprietary Blend',
 'External ID': 42920,
 'Full Name': 'Proprietary Blend',
 'Description': 'Proprietary Blend (Form: Garlic PlantPart: bulb, Onions PlantPart: bulb, Chives Note: herb, and Scallions PlantPart: entire plant)',
 'Alternate Names': [],
 'Category': 'blend',
 'Parent Ingredient External ID': 42920,
 'Parent Ingredient Full Name': 'Proprietary Blend',
 'Forms': [{'order': 1,
   'ingredientId': 42922,
   'prefix': None,
   'percent': None,
   'name': 'Chives'},
  {'order': 2,
   'ingredientId': 12443,
   'prefix': None,
   'percent': None,
   'name': 'Garlic'},
  {'order': 3,
   'ingredientId': 42921,
   'prefix': None,
   'percent': None,
   'name': 'Onions'},
  {'order': 4,
   'ingredientId': 42923,
   'prefix': 'and',
   'percent': None,
   'name': 'Scallions'}],
 'Source': 'DSLD',
 'Approved': True,
 'Notes': ''}

In [283]:
# Acknowledge every tool call the run is waiting on with the same fixed output.
# The list is built from tool_call itself rather than hardcoded indices 0-4, so
# it matches however many calls the model actually issued.
run = client.beta.threads.runs.submit_tool_outputs(
  thread_id=thread.id,
  run_id=run.id,
  tool_outputs=[
      {
        "tool_call_id": call.id,
        "output": "The result is correct",
      }
      for call in tool_call
    ]
)


In [284]:
# Re-fetch the run after submitting the tool outputs and show its status.
run = client.beta.threads.runs.retrieve(run_id=run.id, thread_id=thread.id)
run.status

'completed'

In [285]:
# List the thread's messages and display the text of the first one returned.
messages = client.beta.threads.messages.list(thread_id=thread.id)
messages.data[0].content[0].text.value

'I have successfully extracted the required information from the JSON file. Here are the details:\n\n**Product Information:**\n- Source: DSLD\n- Product Type: Other Combinations\n- External ID: 10118\n- Full Name: Ultra MFP\n- Bundle Name: \n- Brand Name: Douglas Laboratories\n- Servings Per Container: 60\n- Net Contents: 120 Capsule(s)\n- Physical State: Capsule\n- Events: Date entered into DSLD on 2012-06-25\n- Target Groups: Adult (18 - 50 Years), Dairy Free, Sugar Free\n- Statements: \n  - A Dietary Supplement (FDA Statement of Identity)\n  - This product contains NO yeast, wheat gluten, soy protein, milk/dairy, corn, sodium, sugar, starch, artificial coloring, preservatives, or flavoring. (Formulation re: Does NOT Contain)\n  - Suggested Usage: As a dietary supplement, adults take 1 to 2 capsules daily or as directed by your healthcare professional. (Suggested/Recommended/Usage/Directions)\n  - KEEP OUT OF REACH OF CHILDREN. (Precautions re: Children)\n  - For optimal storage cond

In [108]:
# Two function-calling schemas (this rebinds `tools`):
#   - "process_file_finish_product": one parameter per finished-product field;
#     only "fullName" is required.
#   - "extract_product_details": a single required "json_file" string holding
#     the path or content of the JSON file.
tools = [
    {
        "type": "function",
        "function": {
            "name": "process_file_finish_product",
            "description": "Extracts finished product information from the provided JSON file.",
            "parameters": {
                "type": "object",
                "properties": {
                    "source": {
                        "type": "string",
                        "description": "DSLD",
                    },
                    "productType": {
                        "type": "string",
                        "description": "productType",
                    },
                    "externalId": {
                        "type": "string",
                        "description": "externalId",
                    },
                    "fullName": {
                        "type": "string",
                        "description": "fullName",
                    },
                    "bundleName": {
                        "type": "string",
                        "description": "bundleName",
                    },
                    "brandName": {
                        "type": "string",
                        "description": "brandName",
                    },
                    "servingsPerContainer": {
                        "type": "string",
                        "description": "servingsPerContainer",
                    },
                    "netContents": {
                        "type": "object",
                        "description": "netContents",
                    },
                    "physicalState": {
                        "type": "string",
                        "description": "physicalState",
                    },
                    "events": {
                        "type": "object",
                        "description": "events",
                    },
                    "targetGroups": {
                        "type": "string",
                        "description": "targetGroups",
                    },
                    "statements": {
                        "type": "object",
                        "description": "statements",
                    },
                    "userGroups": {
                        "type": "string",
                        "description": "userGroups",
                    },
                    "offMarket": {
                        "type": "string",
                        "description": "offMarket",
                    },
                },
                "required": ["fullName"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "extract_product_details",
            "description": "Extracts ingredient, other ingredient, and contact information from a JSON file.",
            "parameters": {
                "type": "object",
                "properties": {
                    "json_file": {
                        "type": "string",
                        "description": "The path or content of the JSON file from which to extract information.",
                    }
                },
                "required": ["json_file"],
            },
        },
    },
]

In [109]:
# Register a second assistant that is given the two function schemas in `tools`.
assistant = client.beta.assistants.create(
    model="gpt-4-1106-preview",
    name="Test Functions Assistant",
    tools=tools,
    instructions="You are an assistant that parses JSON files. Use the provided function to extract fields from JSON.",
)

In [110]:
json_file_path = "formula_#83020.json"  # Replace with the actual file path
# Read as UTF-8 explicitly so non-ASCII characters in the label data (e.g. "®")
# decode the same way regardless of the platform's default encoding.
with open(json_file_path, "r", encoding="utf-8") as file:
    formula_json = json.load(file)

In [111]:
# Create a fresh, empty thread for this second experiment (rebinds `thread`).
thread = client.beta.threads.create()

In [112]:
# Post the request together with the document. The dict is serialized with
# json.dumps so the model receives real JSON (double quotes, null/true/false)
# rather than the Python repr of the dict; ensure_ascii=False keeps non-ASCII
# characters readable instead of \u-escaped.
message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content= f"Please extract the ingredient, other ingredient, contact, and finished product information (other information) from the JSON file: {json.dumps(formula_json, ensure_ascii=False)}"
)

In [113]:
# Run the assistant created in the preceding cell. Using assistant.id instead of
# a hardcoded "asst_..." ID keeps the notebook reproducible on other accounts and
# guarantees the run uses the tools that were just registered.
run = client.beta.threads.runs.create(
  thread_id=thread.id,
  assistant_id=assistant.id,
  instructions="Please address the user as Jason Sun. The user has a premium account."
)

In [116]:
# Re-fetch the run and show its current status.
run = client.beta.threads.runs.retrieve(run_id=run.id, thread_id=thread.id)
run.status

'requires_action'

In [132]:
# Read the pending tool calls from the current run; otherwise this cell depends
# on a tool_call variable left over from an earlier run.
tool_call = run.required_action.submit_tool_outputs.tool_calls
# Decode the arguments the model supplied for the second tool call.
json.loads(tool_call[1].function.arguments)

{'fullName': 'Ultra MFP',
 'brandName': 'Douglas Laboratories',
 'servingsPerContainer': '60',
 'physicalState': 'Capsule',
 'targetGroups': 'Adult (18 - 50 Years), Dairy Free, Sugar Free',
 'userGroups': 'Adults and children 4 or more years of age'}

In [129]:
# The first tool call's arguments are JSON, and their "json_file" value is itself
# a JSON-encoded string, hence the two json.loads calls.
json.loads(json.loads(tool_call[0].function.arguments)["json_file"])

{'src': '01-raw/2022-10-25/data/label_717.json',
 'id': 10118,
 'nhanesId': '',
 'bundleName': '',
 'fullName': 'Ultra MFP',
 'brandName': 'Douglas Laboratories',
 'brandIpSymbol': '®',
 'upcSku': '',
 'productVersionCode': 'Formula #83020',
 'pdf': '',
 'thumbnail': '',
 'servingsPerContainer': '60',
 'hasOuterCarton': False,
 'percentDvFootnote': 'Not Present',
 'labelRelationships': [],
 'contacts': [{'contactId': 2612,
   'text': 'Manufactured in the USA by',
   'types': ['Manufacturer'],
   'contactDetails': {'src': '01-raw/2022-10-25/data/contact_1.json',
    'id': 2612,
    'name': 'Douglas Laboratories',
    'streetAddress': '600 Boyce Road',
    'city': 'Pittsburgh',
    'state': 'PA',
    'country': 'USA',
    'zipCode': '15205',
    'phoneNumber': '',
    'email': '',
    'webAddress': 'www.douglaslabs.com'}}],
 'netContents': [{'order': 1,
   'quantity': 120,
   'unit': 'Capsule(s)',
   'display': '120 Capsule(s)'}],
 'physicalState': {'langualCode': 'E0159',
  'langualCode