# Extract info from Product Description

In [16]:
from IPython.display import display, Markdown

In [None]:
import pandas as pd

## Load dataset

In [None]:
# Read the "EXAMPLE" sheet of the workbook into a DataFrame
df = pd.read_excel(
    "../data/maverick-data.xlsx",
    sheet_name="EXAMPLE",
)

In [None]:
# Preview the loaded frame: cap the output at 50 rows but show every column
with pd.option_context(
    'display.max_rows', 50,
    'display.max_columns', None,
):
    display(df)

In [None]:
# Boolean mask: True for rows where at least one of the five target columns is filled in
mask_rows = (
    df[["Brand Name", "Liquer Volume", "PACK SIZE", "Category", "Flavor"]]
    .notna()
    .any(axis=1)
)

In [None]:
# Preview only the rows selected by mask_rows: up to 150 rows, every column
with pd.option_context(
    'display.max_rows', 150,
    'display.max_columns', None,
):
    display(df.loc[mask_rows])

# 1️⃣ 1st Approach: Extract info manually (regex)

In [None]:
import pandas as pd
import re

In [None]:
# Build a small sample frame of raw descriptions and add the five target
# columns, each initialised to an empty string, in a single expression.
df = pd.DataFrame({
    "Product Description": [
        "CASA DRAGONES BLANCO TEQUILA",
        "CASA DRAGONES TEQUILA   ANEJO 3/CS  750ML",
        "Casa Dragones Tequila Blanco",
        "CASE   AQUAFINA WATER   SINGLE 16.9OZ  16 O",
        "CASE   BACARDI COCKTAILS   BAHAMA MAMA 4PK-12O  48 OZ",
        "CASE   BACARDI COCKTAILS   LEMONADE 4PK-355ML  48 OZ",
        "CASE   BACARDI COCKTAILS   LIME & SODA 4PK-355  48 OZ",
    ],
}).assign(**dict.fromkeys(
    ['Brand Name', 'Liquer Volume', 'PACK SIZE', 'Category', 'Flavor'],
    '',
))

In [None]:
# Preview the sample frame (when cells are run in order, the five target columns are still empty strings here)
df

In [None]:
# Regular expressions used to pull each field out of a free-text description.

# Brand: everything (lazily) up to the first " <digit>"; descriptions that
# contain no such sequence produce no brand.
brand_pattern = r'(.+?)(?= \d)'

# Volume: a number with optional decimals, an optional space, then a unit.
# NOTE(review): there is no trailing word boundary, so e.g. "12 LAGER" would
# match as "12 L" — left unchanged because units such as "LTR" may rely on it.
liquor_pattern = r'\d+(?:\.\d+)?\s?(?:LT|L|ML|OZ|PT)'

# Pack size: optional count (e.g. "4" or "6/4") followed by a pack unit.
# The whitespace between count and unit is optional so "4PK" keeps its count
# (a mandatory space would reduce it to "PK"); the word boundaries stop the
# unit from matching inside ordinary words such as "PUMPKIN".
pack_size_pattern = r'\b(?:\d+(?:/\d+)?\s?)?(?:KE?G|PK|BBL)\b'

# Category: one of a fixed list of beverage types, matched case-insensitively.
category_pattern = r'\b(?:VODKA|LIQUEUR|BEER|WINE|TEQUILA|WHISKEY|BOURBON|ALE|LAGER)\b'

# Flavor: the first run of all-uppercase words.
# NOTE(review): this normally matches the start of the description (i.e. the
# brand words) rather than a flavor — confirm what this field should capture.
flavor_pattern = r'(?:\b[A-Z]+\b\s?)+'


def _first_match(pattern, text, group=0, flags=0):
    """Return the stripped text of the first match of `pattern` in `text`, or None if there is none."""
    match = re.search(pattern, text, flags=flags)
    return match.group(group).strip() if match else None


# Fill the target columns row by row; a column is only written when its
# pattern matched, so unmatched fields keep whatever value they already had.
for index, description in df['Product Description'].items():
    # Missing descriptions (NaN/None) cannot be searched; leave the row untouched.
    if not isinstance(description, str):
        continue

    category = _first_match(category_pattern, description, flags=re.IGNORECASE)
    extracted = {
        'Brand Name': _first_match(brand_pattern, description, group=1),
        'Liquer Volume': _first_match(liquor_pattern, description),
        'PACK SIZE': _first_match(pack_size_pattern, description),
        # Normalise the category to upper case since it is matched case-insensitively.
        'Category': category.upper() if category is not None else None,
        'Flavor': _first_match(flavor_pattern, description),
    }

    for column, value in extracted.items():
        if value is not None:
            df.at[index, column] = value



In [None]:
# Display the updated DataFrame (shows whatever the regex extraction loop wrote into the target columns)
display(df)

# 2️⃣ 2nd Approach: LLMs

## OpenAI

In [1]:
# Load API key from .env file
# The key is looked up by subscript, so a missing OPENAI_API_KEY entry
# surfaces as a KeyError on the last line of this cell.
from dotenv import dotenv_values
secrets = dotenv_values("../.env")
OPENAI_API_KEY = secrets['OPENAI_API_KEY']


In [27]:
from langchain.llms import OpenAI

# LLM used for the extraction prompt; authenticates with OPENAI_API_KEY
llm = OpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name="text-davinci-003",
    temperature=0.5,
    max_tokens=120,
)

In [30]:
from langchain import PromptTemplate

# Extraction prompt: lists the five keys to return and has a single
# placeholder, {product}, for the raw product description.
# NOTE(review): the key list shown to the model is not itself valid JSON
# (no braces/commas) — consider showing the exact object expected.
template = """Given the following product description, output a JSON with the following structure.

Brand Name:"",
Liquor Volume:"",
Pack Size:"",
Category:"",
Flavor:""

Always include all the keys even if the value is empty.

Product Description:
{product}

JSON object:
"""

# Wrap the raw string so the chain can fill in the "product" variable.
prompt = PromptTemplate(template=template,
                        input_variables=["product"])

In [31]:
from langchain import LLMChain
# Combine the current `prompt` and `llm` objects into one chain
llm_chain = LLMChain(prompt=prompt, llm=llm)

In [33]:
# Run the chain on one example description and render the model's reply as Markdown
product = "EL TESORO TEQ PARADISO ANEJO 80 750ML"
answer = llm_chain.run(product)
display(Markdown(answer))

{
    "Brand Name": "EL TESORO",
    "Liquor Volume": "80",
    "Pack Size": "750ML",
    "Category": "TEQ",
    "Flavor": "PARADISO ANEJO"
}

## GPT4All

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import GPT4All

## HuggingFace

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain import HuggingFaceHub

In [None]:
import os

from dotenv import dotenv_values

# Candidate Hub repositories tried for this task; only the uncommented one is active.
# repo_id = "stabilityai/stablelm-tuned-alpha-3b"
# repo_id = "google/flan-t5-xl"
# repo_id = "databricks/dolly-v2-3b"
repo_id = "ysharma/ChatGPT4"

# Never hard-code access tokens in a notebook: read the Hugging Face token
# from the project's .env file (same mechanism as the OpenAI key), falling
# back to the process environment. Any token that was previously committed
# in this cell must be treated as compromised and revoked.
HUGGINGFACEHUB_API_TOKEN = (
    dotenv_values("../.env").get("HUGGINGFACEHUB_API_TOKEN")
    or os.environ.get("HUGGINGFACEHUB_API_TOKEN")
)
if not HUGGINGFACEHUB_API_TOKEN:
    raise RuntimeError(
        "HUGGINGFACEHUB_API_TOKEN is not set; add it to ../.env or export it "
        "in the environment before running this cell."
    )

llm = HuggingFaceHub(repo_id=repo_id,
                     model_kwargs={"temperature": 0, "max_length": 64},
                     huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN)

In [None]:
from langchain import PromptTemplate, LLMChain

# Generic question/answer prompt used to check that the current `llm` responds at all
template = """Question: {question}

Answer: Let's think step by step."""
prompt = PromptTemplate(
    input_variables=["question"],
    template=template,
)
llm_chain = LLMChain(llm=llm, prompt=prompt)

question = "Who won the FIFA World Cup last year? "

print(llm_chain.run(question))

In [None]:
from langchain import PromptTemplate, LLMChain

# Extraction prompt for the current `llm`.
# The description is labelled as a product description (not a "Question"),
# and the prompt ends with a "JSON object:" cue instead of a
# "think step by step" cue, which contradicted the instruction to return
# only a JSON object. The input variable keeps the name "question".
template = """Product Description:
{question}

Given the Product Description above, extract the following information; Brand Name, Liquor Volume, Pack Size, Category, and Flavor.
Return only a JSON object and nothing else.

JSON object:
"""

prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Example raw description to extract from
question = "CASE BACARDI COCKTAILS LIME & SODA 4PK-355 48 OZ"

print(llm_chain.run(question))