In [31]:
from dotenv import load_dotenv

load_dotenv()

True

In [32]:
from typing import List, Optional

from langchain.callbacks import get_openai_callback
from langchain.chat_models import ChatOpenAI

from kor.extraction import create_extraction_chain
from kor.nodes import Object, Text, Number

import pandas as pd
from pydantic import BaseModel, Field, validator
from kor import extract_from_documents, from_pydantic, create_extraction_chain


from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from utils import load_conversation

In [33]:
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

In [34]:
conversation = load_conversation('/workspaces/contact-extraction/emails/sample_4.txt')
doc = Document(page_content=conversation)
split_docs = RecursiveCharacterTextSplitter().split_documents([doc])

In [35]:
schema = Object(
    id="personal_info",
    description="Personal information about a given person.",
    attributes=[
        Text(
            id="first_name",
            description="The first name of the person",
            examples=[("John Smith went to the store", "John")],
        ),
        Text(
            id="last_name",
            description="The last name of the person",
            examples=[("John Smith went to the store", "Smith")],
        ),
        Text(
            id="job_title",
            description="The job title of the person",
            examples=[("John Smith is a sales associate at a local store", "sales associate")],
        ),
        Text(
            id="company_name",
            description="The company name the person works for",
            examples=[("John Smith is a sales associate at a walmart", "walmart")],
        ),
        Text(
            id="phone_number",
            description="The phone number of the person",
            examples=[("John Smith is a sales associate and his phone number is 719-239-0231", "719-239-0231")],
        ),
        Text(
            id="email",
            description="The email of the person",
            examples=[("John Smith is a sales associate and his email is john.smith@email.com", "john.smith@email.com")],
        ),
        Text(
            id="address",
            description="The address of the company the person works for",
            examples=[("John Smith works at Data Axle 123 located at Main St, New York, NY 10001", "john.smith@email.com")],
        )        
    ],
    examples=[
        (
            """
              John Smith
              Senior Sales & Marketing Director
              
              719-239-0231
              john.smith@email.com

              Data Axle
              123 Main St, New York, NY 10001
            
              Jane Doe
              Sales Executive
              
              719-239-9999
              jane.doe@email.com

              KPMG
              123 Main St, San Franciso, CA 90909
            """,
            [
                {
                    "first_name": "John", 
                    "last_name": "Smith", 
                    "job_title": "Senior Sales & Marketing Director",
                    "company_name": "Data Axle",
                    "phone_number": "719-239-0231",
                    "email": "john.smith@email.com",
                    "address": "123 Main St, New York, NY 10001"
                },
                {
                    "first_name": "Jane", 
                    "last_name": "Doe", 
                    "job_title": "Sales Executive",
                    "company_name": "KPMG",
                    "phone_number": "719-239-9999",
                    "email": "jane.doe@email.com",
                    "address": "123 Main St, San Franciso, CA 90909"                    
                },
            ],
        )
    ],
    many=True,
)


In [36]:
chain = create_extraction_chain(
    llm,
    schema,
    encoder_or_encoder_class="csv",
    input_formatter="triple_quotes",
)

In [37]:
with get_openai_callback() as cb:
    document_extraction_results = await extract_from_documents(
        chain, split_docs, max_concurrency=5, use_uid=False, return_exceptions=True
    )
    print(f"Total Tokens: {cb.total_tokens}")
    print(f"Prompt Tokens: {cb.prompt_tokens}")
    print(f"Completion Tokens: {cb.completion_tokens}")
    print(f"Successful Requests: {cb.successful_requests}")
    print(f"Total Cost (USD): ${cb.total_cost}")

Total Tokens: 7087
Prompt Tokens: 6511
Completion Tokens: 576
Successful Requests: 4
Total Cost (USD): $0.0109185


In [None]:
import pandas as pd

def generate_dataframe(json_data):
    # Prepare an empty list to store all restaurant data
    data = []

    for record in json_data:
        restaurant_list = record.get('data', {}).get('personal_info', [])
        for restaurant in restaurant_list:
            # Get details for each restaurant and append it to data
            data.append([
                restaurant.get('first_name', ''),
                restaurant.get('last_name', ''),
                restaurant.get('job_title', ''),
                restaurant.get('company_name', ''),
                restaurant.get('phone_number', ''),
                restaurant.get('email', ''),
                restaurant.get('address', ''),

            ])

    # Convert the list into a DataFrame
    df = pd.DataFrame(data, columns=['first_name', 'last_name', 'job_title', 'company_name', 'phone_number', 'email', 'address'])

    return df

# Usage:
df = generate_dataframe(document_extraction_results)

df