# Schema.org generation

- Input: Existing scrape data, possibly just the summarized markdown
- Output:
    - Ideally, figure out the right schema to use with chain of thought
    - Maximally instantiate in Schema.org for the company

In [1]:
import json
from unified import UnifiedResult

with open("../output/data/98point6.json", "r") as f:
    data = UnifiedResult(**json.load(f))


In [3]:
dir(data)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_validator__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_a

In [4]:
from core import init, Seed

init()

In [34]:
from schemaorg.main import Schema

organization_schema = Schema("Organization")
organization_properties = organization_schema.type_spec["properties"]
# allowed_properties

Specification base set to https://www.schema.org
Using Version 12.0
Found https://www.schema.org/Organization
Organization: found 76 properties


In [37]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate


_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
f"""
You'll read information about a company and generate a json-ld representation that uses schema.org vocabulary.

These are the schema.org properties for an Organization:
{organization_properties}

When generating the json-ld representation, do not include any placeholder values; only include the properties that have values in the human input.
""" 
        ),
        (
            "human",
f"""
Company Name: {data.target.company}
Domain: {data.target.domain}

Summary:
{data.summary_markdown}

Crunchbase:
{data.crunchbase_markdown}

General search results:
{data.general_search_markdown}

Glassdoor summary:
{data.glassdoor_markdown}

Customer experience summary:
{data.customer_experience_markdown}
""",
        ),
    ]
)


llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
runnable = _prompt | llm
result = runnable.invoke({})

print(result.content)

```json
{
  "@context": "https://schema.org",
  "@type": "Organization",
  "name": "98point6 Technologies",
  "url": "http://www.98point6.com",
  "logo": "https://www.98point6.com/logo.png",  // Placeholder for logo URL
  "foundingDate": "2015-03-01",
  "founders": [
    {
      "@type": "Person",
      "name": "Jeff Greenstein"
    }
  ],
  "numberOfEmployees": 398,
  "description": "98point6 Technologies specializes in digital healthcare solutions, providing a cloud-based virtual care platform that integrates artificial intelligence with board-certified physicians to deliver primary care services.",
  "address": {
    "@type": "PostalAddress",
    "addressLocality": "Seattle",
    "addressRegion": "WA",
    "addressCountry": "USA"
  },
  "areaServed": "USA",
  "contactPoint": {
    "@type": "ContactPoint",
    "telephone": "+1-800-123-4567",  // Placeholder for telephone number
    "contactType": "Customer Service"
  },
  "employee": [
    {
      "@type": "Person",
      "name": "Ja

Specification base set to https://www.schema.org
Using Version 12.0
Found https://www.schema.org/Organization
Organization: found 76 properties


'https://schema.org/actionableFeedbackPolicy, https://schema.org/additionalType, https://schema.org/address, https://schema.org/aggregateRating, https://schema.org/alternateName, https://schema.org/alumni, https://schema.org/areaServed, https://schema.org/award, https://schema.org/awards, https://schema.org/brand, https://schema.org/contactPoint, https://schema.org/contactPoints, https://schema.org/correctionsPolicy, https://schema.org/department, https://schema.org/description, https://schema.org/disambiguatingDescription, https://schema.org/dissolutionDate, https://schema.org/diversityPolicy, https://schema.org/diversityStaffingReport, https://schema.org/duns, https://schema.org/email, https://schema.org/employee, https://schema.org/employees, https://schema.org/ethicsPolicy, https://schema.org/event, https://schema.org/events, https://schema.org/faxNumber, https://schema.org/founder, https://schema.org/founders, https://schema.org/foundingDate, https://schema.org/foundingLocation, h

'https://schema.org/actionableFeedbackPolicy, https://schema.org/additionalType, https://schema.org/address, https://schema.org/aggregateRating, https://schema.org/alternateName, https://schema.org/alumni, https://schema.org/areaServed, https://schema.org/award, https://schema.org/awards, https://schema.org/brand, https://schema.org/contactPoint, https://schema.org/contactPoints, https://schema.org/correctionsPolicy, https://schema.org/department, https://schema.org/description, https://schema.org/disambiguatingDescription, https://schema.org/dissolutionDate, https://schema.org/diversityPolicy, https://schema.org/diversityStaffingReport, https://schema.org/duns, https://schema.org/email, https://schema.org/employee, https://schema.org/employees, https://schema.org/ethicsPolicy, https://schema.org/event, https://schema.org/events, https://schema.org/faxNumber, https://schema.org/founder, https://schema.org/founders, https://schema.org/foundingDate, https://schema.org/foundingLocation, h