# Schema.org generation

- Input: Existing scrape data, possibly just the summarized markdown
- Output:
    - Ideally, determine the most appropriate Schema.org type for the company using chain-of-thought reasoning
    - A JSON-LD representation of the company that fills in as many Schema.org properties as the input data supports

In [1]:
import json
from unified import UnifiedResult

# Load the saved scrape result for one company and unpack the JSON object into a UnifiedResult.
# NOTE(review): the path is relative to the kernel's working directory — confirm where this notebook is run from.
with open("../output/data/98point6.json", "r") as f:
    data = UnifiedResult(**json.load(f))


In [3]:
# Exploratory: list every attribute name available on the loaded result object.
dir(data)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__class_vars__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__fields__',
 '__fields_set__',
 '__format__',
 '__ge__',
 '__get_pydantic_core_schema__',
 '__get_pydantic_json_schema__',
 '__getattr__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__pretty__',
 '__private_attributes__',
 '__pydantic_complete__',
 '__pydantic_core_schema__',
 '__pydantic_custom_init__',
 '__pydantic_decorators__',
 '__pydantic_extra__',
 '__pydantic_fields_set__',
 '__pydantic_generic_metadata__',
 '__pydantic_init_subclass__',
 '__pydantic_parent_namespace__',
 '__pydantic_post_init__',
 '__pydantic_private__',
 '__pydantic_root_model__',
 '__pydantic_serializer__',
 '__pydantic_validator__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__repr_a

In [4]:
from core import init, Seed

# NOTE(review): presumably performs project-level setup needed by later cells (e.g. configuration) — confirm in `core`.
init()

In [4]:
from schemaorg.main import Schema

# Look up the schema.org "Organization" type via the schemaorg package.
organization_schema = Schema("Organization")
# The type's property URIs (appears to be one comma-separated string); used in the LLM system prompt further down.
organization_properties = organization_schema.type_spec["properties"]

print(organization_properties)

Specification base set to https://www.schema.org
Using Version 12.0
Found https://www.schema.org/Organization
Organization: found 76 properties
https://schema.org/actionableFeedbackPolicy, https://schema.org/additionalType, https://schema.org/address, https://schema.org/aggregateRating, https://schema.org/alternateName, https://schema.org/alumni, https://schema.org/areaServed, https://schema.org/award, https://schema.org/awards, https://schema.org/brand, https://schema.org/contactPoint, https://schema.org/contactPoints, https://schema.org/correctionsPolicy, https://schema.org/department, https://schema.org/description, https://schema.org/disambiguatingDescription, https://schema.org/dissolutionDate, https://schema.org/diversityPolicy, https://schema.org/diversityStaffingReport, https://schema.org/duns, https://schema.org/email, https://schema.org/employee, https://schema.org/employees, https://schema.org/ethicsPolicy, https://schema.org/event, https://schema.org/events, https://schema.

In [6]:
# NOTE: this raises AttributeError — Schema does not expose properties as attributes;
# the property list is only available through type_spec["properties"].
organization_schema.actionableFeedbackPolicy

AttributeError: 'Schema' object has no attribute 'actionableFeedbackPolicy'

In [5]:
# Print the schema.org type names that resemble "Organization" to see which more specific types exist.
organization_schema.print_similar_types()

Organization
Did you mean:
ArchiveOrganization
CharitableIncorporatedOrganization
EducationalOrganization
GovernmentOrganization
MedicalOrganization
NewsMediaOrganization
Organization
OrganizationRole
SportsOrganization


In [1]:
from schemaorg.data import find_similar_types

# Module-level helper: returns the type names similar to the query as a list.
find_similar_types("organization")

organization


['ArchiveOrganization',
 'CharitableIncorporatedOrganization',
 'EducationalOrganization',
 'GovernmentOrganization',
 'MedicalOrganization',
 'NewsMediaOrganization',
 'Organization',
 'OrganizationRole',
 'SportsOrganization']

In [37]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate


# Build the prompt with named template variables instead of f-string interpolation.
# Interpolating the scraped markdown directly into the template text means any "{" or "}"
# in that content (JSON snippets, code samples, ...) is re-parsed by ChatPromptTemplate as a
# template variable, which makes `invoke({})` fail. Passing the values at invoke time keeps
# the rendered prompt identical while treating the content as plain data.
_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
"""
You'll read information about a company and generate a json-ld representation that uses schema.org vocabulary.

These are the schema.org properties for an Organization:
{organization_properties}

When generating the json-ld representation, do not include any placeholder values; only include the properties that have values in the human input.
"""
        ),
        (
            "human",
"""
Company Name: {company}
Domain: {domain}

Summary:
{summary_markdown}

Crunchbase:
{crunchbase_markdown}

General search results:
{general_search_markdown}

Glassdoor summary:
{glassdoor_markdown}

Customer experience summary:
{customer_experience_markdown}
""",
        ),
    ]
)

# Values substituted into the template variables above; all come from the loaded scrape result
# except the schema.org property list.
_prompt_inputs = {
    "organization_properties": organization_properties,
    "company": data.target.company,
    "domain": data.target.domain,
    "summary_markdown": data.summary_markdown,
    "crunchbase_markdown": data.crunchbase_markdown,
    "general_search_markdown": data.general_search_markdown,
    "glassdoor_markdown": data.glassdoor_markdown,
    "customer_experience_markdown": data.customer_experience_markdown,
}

# temperature=0 to keep the generated JSON-LD as repeatable as possible between runs.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
runnable = _prompt | llm
result = runnable.invoke(_prompt_inputs)

print(result.content)

```json
{
  "@context": "https://schema.org",
  "@type": "Organization",
  "name": "98point6 Technologies",
  "url": "http://www.98point6.com",
  "logo": "https://www.98point6.com/logo.png",  // Placeholder for logo URL
  "foundingDate": "2015-03-01",
  "founders": [
    {
      "@type": "Person",
      "name": "Jeff Greenstein"
    }
  ],
  "numberOfEmployees": 398,
  "description": "98point6 Technologies specializes in digital healthcare solutions, providing a cloud-based virtual care platform that integrates artificial intelligence with board-certified physicians to deliver primary care services.",
  "address": {
    "@type": "PostalAddress",
    "addressLocality": "Seattle",
    "addressRegion": "WA",
    "addressCountry": "USA"
  },
  "areaServed": "USA",
  "contactPoint": {
    "@type": "ContactPoint",
    "telephone": "+1-800-123-4567",  // Placeholder for telephone number
    "contactType": "Customer Service"
  },
  "employee": [
    {
      "@type": "Person",
      "name": "Ja

Specification base set to https://www.schema.org
Using Version 12.0
Found https://www.schema.org/Organization
Organization: found 76 properties


'https://schema.org/actionableFeedbackPolicy, https://schema.org/additionalType, https://schema.org/address, https://schema.org/aggregateRating, https://schema.org/alternateName, https://schema.org/alumni, https://schema.org/areaServed, https://schema.org/award, https://schema.org/awards, https://schema.org/brand, https://schema.org/contactPoint, https://schema.org/contactPoints, https://schema.org/correctionsPolicy, https://schema.org/department, https://schema.org/description, https://schema.org/disambiguatingDescription, https://schema.org/dissolutionDate, https://schema.org/diversityPolicy, https://schema.org/diversityStaffingReport, https://schema.org/duns, https://schema.org/email, https://schema.org/employee, https://schema.org/employees, https://schema.org/ethicsPolicy, https://schema.org/event, https://schema.org/events, https://schema.org/faxNumber, https://schema.org/founder, https://schema.org/founders, https://schema.org/foundingDate, https://schema.org/foundingLocation, h

'https://schema.org/actionableFeedbackPolicy, https://schema.org/additionalType, https://schema.org/address, https://schema.org/aggregateRating, https://schema.org/alternateName, https://schema.org/alumni, https://schema.org/areaServed, https://schema.org/award, https://schema.org/awards, https://schema.org/brand, https://schema.org/contactPoint, https://schema.org/contactPoints, https://schema.org/correctionsPolicy, https://schema.org/department, https://schema.org/description, https://schema.org/disambiguatingDescription, https://schema.org/dissolutionDate, https://schema.org/diversityPolicy, https://schema.org/diversityStaffingReport, https://schema.org/duns, https://schema.org/email, https://schema.org/employee, https://schema.org/employees, https://schema.org/ethicsPolicy, https://schema.org/event, https://schema.org/events, https://schema.org/faxNumber, https://schema.org/founder, https://schema.org/founders, https://schema.org/foundingDate, https://schema.org/foundingLocation, h

In [8]:
import pandas as pd

# Load the schema.org type definitions; index_col=0 makes the first column (the type URI) the index
# so later cells can look rows up with .loc["https://schema.org/<Type>"].
# NOTE(review): this path looks repo-root-relative while the JSON load earlier uses "../output/..." — confirm the working directory.
types_df = pd.read_csv("notebooks/poc/schemaorg-current-https-types.csv", index_col=0)
types_df

Unnamed: 0_level_0,label,comment,subTypeOf,enumerationtype,equivalentClass,properties,subTypes,supersedes,supersededBy,isPartOf
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
https://schema.org/3DModel,3DModel,"A 3D model represents some kind of 3D content,...",https://schema.org/MediaObject,,,"https://schema.org/about, https://schema.org/a...",,,,https://pending.schema.org
https://schema.org/AMRadioChannel,AMRadioChannel,A radio channel that uses AM.,https://schema.org/RadioChannel,,,"https://schema.org/additionalType, https://sch...",,,,
https://schema.org/APIReference,APIReference,Reference documentation for application progra...,https://schema.org/TechArticle,,,"https://schema.org/about, https://schema.org/a...",,,,
https://schema.org/Abdomen,Abdomen,Abdomen clinical examination.,https://schema.org/PhysicalExam,https://schema.org/PhysicalExam,,,,,,
https://schema.org/AboutPage,AboutPage,Web page type: About page.,https://schema.org/WebPage,,,"https://schema.org/about, https://schema.org/a...",,,,
...,...,...,...,...,...,...,...,...,...,...
https://schema.org/WritePermission,WritePermission,Permission to write or edit the document.,https://schema.org/DigitalDocumentPermissionType,https://schema.org/DigitalDocumentPermissionType,,,,,,
https://schema.org/XPathType,XPathType,Text representing an XPath (typically but not ...,https://schema.org/Text,,,,,,,https://pending.schema.org
https://schema.org/XRay,XRay,X-ray imaging.,https://schema.org/MedicalImagingTechnique,https://schema.org/MedicalImagingTechnique,,,,,,
https://schema.org/ZoneBoardingPolicy,ZoneBoardingPolicy,The airline boards by zones of the plane.,https://schema.org/BoardingPolicyType,https://schema.org/BoardingPolicyType,,,,,,


In [9]:
# Load the schema.org property definitions; index_col=0 makes the first column (the property URI) the index
# so later cells can look rows up with .loc["https://schema.org/<property>"].
# NOTE(review): this path looks repo-root-relative while the JSON load earlier uses "../output/..." — confirm the working directory.
properties_df = pd.read_csv("notebooks/poc/schemaorg-current-https-properties.csv", index_col=0)
properties_df

Unnamed: 0_level_0,label,comment,subPropertyOf,equivalentProperty,subproperties,domainIncludes,rangeIncludes,inverseOf,supersedes,supersededBy,isPartOf
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
https://schema.org/about,about,The subject matter of the content.,,,https://schema.org/mainEntity,"https://schema.org/Certification, https://sche...",https://schema.org/Thing,https://schema.org/subjectOf,,,
https://schema.org/abridged,abridged,Indicates whether the book is an abridged edit...,,,,https://schema.org/Book,https://schema.org/Boolean,,,,https://bib.schema.org
https://schema.org/abstract,abstract,An abstract is a short description that summar...,,,,https://schema.org/CreativeWork,https://schema.org/Text,,,,https://pending.schema.org
https://schema.org/accelerationTime,accelerationTime,The time needed to accelerate the vehicle from...,,,,https://schema.org/Vehicle,https://schema.org/QuantitativeValue,,,,https://auto.schema.org
https://schema.org/acceptedAnswer,acceptedAnswer,"The answer(s) that has been accepted as best, ...",https://schema.org/suggestedAnswer,,,https://schema.org/Question,"https://schema.org/Answer, https://schema.org/...",,,,
...,...,...,...,...,...,...,...,...,...,...,...
https://schema.org/xpath,xpath,"An XPath, e.g. of a <a class=""localLink"" href=...",,,,"https://schema.org/SpeakableSpecification, htt...",https://schema.org/XPathType,,,,
https://schema.org/yearBuilt,yearBuilt,"The year an <a class=""localLink"" href=""/Accomm...",,,,https://schema.org/Accommodation,https://schema.org/Number,,,,https://pending.schema.org
https://schema.org/yearlyRevenue,yearlyRevenue,The size of the business in annual revenue.,,,,https://schema.org/BusinessAudience,https://schema.org/QuantitativeValue,,,,
https://schema.org/yearsInOperation,yearsInOperation,The age of the business.,,,,https://schema.org/BusinessAudience,https://schema.org/QuantitativeValue,,,,


In [11]:
# Inspect the numberOfEmployees property, in particular which value types it accepts (rangeIncludes).
properties_df.loc["https://schema.org/numberOfEmployees"]

label                                                 numberOfEmployees
comment               The number of employees in an organization, e....
subPropertyOf                                                       NaN
equivalentProperty                                                  NaN
subproperties                                                       NaN
domainIncludes        https://schema.org/BusinessAudience, https://s...
rangeIncludes                      https://schema.org/QuantitativeValue
inverseOf                                                           NaN
supersedes                                                          NaN
supersededBy                                                        NaN
isPartOf                                                            NaN
Name: https://schema.org/numberOfEmployees, dtype: object

In [14]:
# List the properties defined for the QuantitativeValue type.
types_df.loc["https://schema.org/QuantitativeValue"].properties

'https://schema.org/additionalProperty, https://schema.org/additionalType, https://schema.org/alternateName, https://schema.org/description, https://schema.org/disambiguatingDescription, https://schema.org/identifier, https://schema.org/image, https://schema.org/mainEntityOfPage, https://schema.org/maxValue, https://schema.org/minValue, https://schema.org/name, https://schema.org/potentialAction, https://schema.org/sameAs, https://schema.org/subjectOf, https://schema.org/unitCode, https://schema.org/unitText, https://schema.org/url, https://schema.org/value, https://schema.org/valueReference'

In [16]:
# Check which value types the "value" property accepts.
properties_df.loc["https://schema.org/value"].rangeIncludes

'https://schema.org/Boolean, https://schema.org/Number, https://schema.org/StructuredValue, https://schema.org/Text'

In [18]:
# Look up the direct subtypes of the Number type.
types_df.loc["https://schema.org/Number"].subTypes

'https://schema.org/Float, https://schema.org/Integer'

In [40]:
import numpy as np

def display_type_hierarchy(uri, indent = ''):
    """Print a type's label and, nested beneath it, the labels of all its subtypes.

    Output is a bullet list in depth-first pre-order; each level is indented two
    spaces deeper than its parent.

    Args:
        uri: Index key of the root type in ``types_df``.
        indent: Whitespace prefix for the root line.

    Raises:
        KeyError: If ``uri`` or any listed subtype is not in the ``types_df`` index.
    """
    # Explicit stack of (uri, indent) pairs still to be printed.
    pending = [(uri, indent)]
    while pending:
        current_uri, current_indent = pending.pop()
        row = types_df.loc[current_uri]
        print(f"{current_indent}- {row.label}")

        # Leaf types have NaN in the subTypes column.
        if pd.isna(row.subTypes):
            continue

        child_indent = current_indent + '  '
        children = row.subTypes.split(", ")
        # Push in reverse so children are popped, and printed, in their listed order.
        pending.extend((child_uri, child_indent) for child_uri in reversed(children))

def display_type(type_name):
    """Print a markdown-style summary of a schema.org type.

    The summary contains the type's description, each current (non-superseded)
    property with its description and accepted value types, and the tree of
    subtypes rooted at the type.

    Args:
        type_name: Bare type name such as ``"Corporation"``; it is prefixed with
            ``https://schema.org/`` to form the key looked up in ``types_df``.

    Raises:
        KeyError: If the type is not present in the ``types_df`` index.
    """
    type_id = f"https://schema.org/{type_name}"
    type_row = types_df.loc[type_id]

    print(f"# {type_name}")
    print(type_row.comment)

    print("## Properties")
    # Some rows (e.g. enumeration members) have an empty properties cell, which pandas
    # reads as NaN; calling .split on it would raise AttributeError.
    if pd.isna(type_row.properties):
        property_uris = []
    else:
        property_uris = type_row.properties.split(", ")

    if not property_uris:
        print("(none)")

    for property_uri in property_uris:
        # Only the lookup is guarded, so an unrelated KeyError is not mislabelled as UNKNOWN.
        try:
            property_row = properties_df.loc[property_uri]
        except KeyError:
            print(f"- {property_uri} (UNKNOWN)")
            continue

        # Skip properties that have been replaced by a newer one.
        if not pd.isna(property_row.supersededBy):
            continue
        print(f"- {property_row.label}: {property_row.comment}")
        print(f"  - Range: {property_row.rangeIncludes}")

    print("## Subtypes")
    display_type_hierarchy(type_id)

# Example: print the summary (description, properties, subtype tree) for the Corporation type.
display_type("Corporation")

# Corporation
Organization: A business corporation.
## Properties
- acceptedPaymentMethod: The payment method(s) that are accepted in general by an organization, or for some specific demand or offer.
  - Range: https://schema.org/LoanOrCredit, https://schema.org/PaymentMethod
- actionableFeedbackPolicy: For a <a class="localLink" href="/NewsMediaOrganization">NewsMediaOrganization</a> or other news-related <a class="localLink" href="/Organization">Organization</a>, a statement about public engagement activities (for news media, the newsroom’s), including involving the public - digitally or otherwise -- in coverage decisions, reporting and activities after publication.
  - Range: https://schema.org/CreativeWork, https://schema.org/URL
- additionalType: An additional type for the item, typically used for adding more specific types from external vocabularies in microdata syntax. This is a relationship between something and a class that the thing is in. Typically the value is a URI-identif

In [33]:
# Direct subtypes of Organization — presumably the candidate types when choosing the best fit for a company.
types_df.loc["https://schema.org/Organization"].subTypes

'https://schema.org/Airline, https://schema.org/Consortium, https://schema.org/Corporation, https://schema.org/EducationalOrganization, https://schema.org/FundingScheme, https://schema.org/GovernmentOrganization, https://schema.org/LibrarySystem, https://schema.org/LocalBusiness, https://schema.org/MedicalOrganization, https://schema.org/NGO, https://schema.org/NewsMediaOrganization, https://schema.org/OnlineBusiness, https://schema.org/PerformingGroup, https://schema.org/PoliticalParty, https://schema.org/Project, https://schema.org/ResearchOrganization, https://schema.org/SearchRescueOrganization, https://schema.org/SportsOrganization, https://schema.org/WorkersUnion'