# Unstructured Data Extraction 
from a CSV using `PydanticOutputParser`

Imports

In [1]:
from pathlib import Path
from typing import Optional, Sequence

import pandas as pd
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel

Load Your Data

In [2]:
# Read the tab-separated input file into a DataFrame
df = pd.read_csv("data.tsv", delimiter="\t")

Create Pydantic Class for your Use Case

In [3]:
# Pydantic models for competitive intelligence
# One competitor record to be extracted from a piece of intel text.
# All five fields are required strings. No per-field descriptions are declared,
# so the field names themselves are the only hint the schema gives about what
# each value should contain. The same names are used as DataFrame columns in
# process_row.
# NOTE(review): documented with comments rather than a class docstring because a
# docstring would presumably surface in the parser's format instructions and
# alter the prompt -- confirm before converting.
class Competitor(BaseModel):
    company: str  # competitor's name
    offering: str
    advantage: str
    products_and_services: str
    additional_details: str

# Top-level schema handed to PydanticOutputParser: a wrapper holding every
# Competitor found in one text.
class Company(BaseModel):
    """Identifying information about all competitive intelligence in a text."""
    # Despite the singular name, this is a sequence of Competitor entries;
    # process_row iterates over it via model_dump()['company'].
    company: Sequence[Competitor]

# Parser that turns raw LLM text into a validated Company object
parser = PydanticOutputParser(pydantic_object=Company)

# Prompt layout: fixed instruction, then the parser's format instructions,
# then the text to analyse. The format instructions are bound once here, so
# only `query` has to be supplied on each call.
prompt = PromptTemplate(
    input_variables=["query"],
    partial_variables=dict(format_instructions=parser.get_format_instructions()),
    template="Answer the user query.\n{format_instructions}\n{query}\n",
)

Define a Function to Process Each Row of Data and Extract Information

In [4]:
# Function to process each row and extract information
def process_row(row, model=None):
    """Extract competitor records from one row's ``INTEL`` text.

    The row's ``INTEL`` value is inserted into the module-level ``prompt``,
    sent to a completion model, and the reply is parsed with the module-level
    ``parser``. Each competitor in the parsed result becomes one row of the
    returned frame, tagged with the source ``INTEL`` text.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with an ``INTEL`` entry.
        model: Optional LLM whose ``invoke`` returns the completion as text.
            Defaults to ``OpenAI(temperature=0)``. Pass one shared instance
            to avoid constructing a new client for every row.

    Returns:
        DataFrame with columns ``INTEL``, ``company``, ``offering``,
        ``advantage``, ``products_and_services`` and ``additional_details``;
        one row per extracted competitor (zero rows if none were extracted).

    Raises:
        Any exception raised by the model call or by ``parser.parse`` is
        propagated unchanged.
    """
    competitor_fields = ['company', 'offering', 'advantage', 'products_and_services', 'additional_details']

    if model is None:
        model = OpenAI(temperature=0)

    intel_text = row['INTEL']
    _input = prompt.format_prompt(query=intel_text)
    # Use `invoke`: calling the model object directly is deprecated in LangChain
    output = model.invoke(_input.to_string())
    result = parser.parse(output)

    # One flat record per competitor, each carrying the originating text
    records = [
        {'INTEL': intel_text, **{field: entry[field] for field in competitor_fields}}
        for entry in result.model_dump()['company']
    ]

    # Explicit columns keep the layout stable even when no competitor was found
    return pd.DataFrame(records, columns=['INTEL'] + competitor_fields)

Apply the function to each row and concatenate the results

In [5]:
# Run the extraction on every input row, then stack the per-row frames.
# An explicit loop is used instead of DataFrame.apply, whose job is to combine
# results rather than to carry whole DataFrame objects.
intel_frames = [process_row(row) for _, row in df.iterrows()]
# pd.concat raises on an empty list, so an empty input yields an empty frame
intel_df = pd.concat(intel_frames, ignore_index=True) if intel_frames else pd.DataFrame()

  warn_deprecated(
  warn_deprecated(


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

Display the Data

In [6]:
# Preview the first ten extracted records
intel_df.head(n=10)

Unnamed: 0,INTEL,company,offering,advantage,products_and_services,additional_details
0,Coco candy co is using a 77Tyrbo Choco machine...,Coco candy co,77Tyrbo Choco machine,coating candy gummies,candy gummies,using 77Tyrbo Choco machine to coat their cand...
1,Sugar & Spice Cookies experiments with edible ...,Sugar & Spice Cookies,edible flower decorations,PetalPrint technique,cookies,enhancing cookie appeal
2,Heavenly Sweets Patisserie streamlines pastry ...,EffiPastry,EffiPastry machine,streamlines pastry assembly,pastry assembly machines,EffiPastry machine is a game changer in the pa...
3,Cinnamon Bliss Bakery adds a secret touch of c...,Cinnamon Bliss Bakery,Chocolate Brownies,CinnaMagic ingredient,Baked Goods,Distinctive flavor with secret touch of cinnamon
4,Choco Haven factory uses organic and locally s...,Choco Haven,chocolates,organic and locally sourced ingredients,EcoCocoa brand,Shifting towards sustainable and high-quality ...
5,Decadent Delights Bakery hosts pastry decorati...,DecorPro,Decorating Classes,High-quality decorating tools,DecorPro decorating tools,Decadent Delights Bakery is considering offeri...
6,"At Velvet Frosting Cupcakes, our team learned ...",SeasonalJoy,Seasonal Pastry Menu,Rotating seasonal menu,Subscription platform,Changes monthly
7,"At Velvet Frosting Cupcakes, our team learned ...",FloralStamp,Cookie Stamper,Special touch to cookies,Cookie stamper,Customizable designs


### Save the Data to a CSV file.

Note: this CSV file can be used as input for the `unstructured_analyze_agent.ipynb` notebook.


In [8]:
# Save the DataFrame to a CSV file
# Create the output directory from Python instead of shelling out to `mkdir -p`,
# so the cell does not depend on a POSIX shell; no error if it already exists.
Path("data").mkdir(parents=True, exist_ok=True)
intel_df.to_csv('data/intel.csv', index=False)