In [2]:
from pydantic import BaseModel, ValidationError
from typing_extensions import Annotated
from pydantic import AfterValidator


def name_must_contain_space(v: str) -> str:
    if " " not in v:
        raise ValueError("Name must contain a space.")
    return v.lower()


class UserDetail(BaseModel):
    age: int
    name: Annotated[str, AfterValidator(name_must_contain_space)]


person = UserDetail(age=29, name="Jason")

ValidationError: 1 validation error for UserDetail
name
  Value error, Name must contain a space. [type=value_error, input_value='Jason', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error

In [30]:
import os
from dotenv import load_dotenv, find_dotenv
from rich import print
load_dotenv(find_dotenv())

%load_ext rich

In [5]:
import instructor
from openai import OpenAI

client = instructor.patch(OpenAI())

In [9]:
blacklist = {
    "rob",
    "steal",
    "hurt",
    "kill",
    "attack",
}

from pydantic import BaseModel, ValidationError, field_validator
from pydantic.fields import Field


class Response(BaseModel):
    message: str

    @field_validator("message")
    def message_cannot_have_blacklisted_words(cls, v: str) -> str:
        for word in v.split():
            if word.lower() in blacklist:
                raise ValueError(f"`{word}` was found in the message `{v}`")
        return v


Response(message="I will kill him")

ValidationError: 1 validation error for Response
message
  Value error, `kill` was found in the message `I will kill him` [type=value_error, input_value='I will kill him', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error

In [12]:
from pydantic import ValidationInfo


class AnswerWithCitation(BaseModel):
    answer: str
    citation: str

    @field_validator("citation")
    @classmethod
    def citation_exists(cls, v: str, info: ValidationInfo):
        context = info.context
        if context:
            context = context.get("text_chunk")
            if v not in context:
                raise ValueError(f"Citation `{v}` not found in text")
        return v


AnswerWithCitation.model_validate(
    {
        "answer": "Blueberries are packed with protein",
        "citation": "Blueberries contain high levels of protein",
    },
    context={"text_chunk": "Blueberries contain high levels of protein"},
)

AnswerWithCitation(answer='Blueberries are packed with protein', citation='Blueberries contain high levels of protein')

In [25]:
from instructor import llm_validator


class AssistantMessage(BaseModel):
    message: Annotated[
        str,
        AfterValidator(
            llm_validator(
                "don't talk about any other topic except going to sicily in winter weather",
                openai_client=client,
            )
        ),
    ]


AssistantMessage(
    message="I would suggest you to visit Sicily as they say it is very nice in winter."
)

ValidationError: 1 validation error for AssistantMessage
message
  Assertion failed, The statement mentions visiting Sicily in winter weather, but it also includes a suggestion to visit Sicily. This violates the rule of not talking about any other topic except going to Sicily in winter weather. [type=assertion_error, input_value='I would suggest you to v...is very nice in winter.', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/assertion_error

In [31]:
from typing import Optional


class Validation(BaseModel):
    is_valid: bool = Field(
        ..., description="Whether the value is valid based on the rules"
    )
    error_message: Optional[str] = Field(
        ...,
        description="The error message if the value is not valid, to be used for re-asking the model",
    )


def validate_chain_of_thought(values):
    chain_of_thought = values["chain_of_thought"]
    answer = values["answer"]
    resp = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {
                "role": "system",
                "content": "You are a validator. Determine if the value follows from the statement. If it is not, explain why.",
            },
            {
                "role": "user",
                "content": f"Verify that `{answer}` follows the chain of thought: {chain_of_thought}",
            },
        ],
        response_model=Validation,
    )
    print(resp)
    if not resp.is_valid:
        raise ValueError(resp.error_message)
    return values


from typing import Any
from pydantic import model_validator


class AIResponse(BaseModel):
    chain_of_thought: str
    answer: str

    @model_validator(mode="before")
    @classmethod
    def chain_of_thought_makes_sense(cls, data: Any) -> Any:
        # here we assume data is the dict representation of the model
        # since we use 'before' mode.
        return validate_chain_of_thought(data)

In [34]:
AIResponse(
    chain_of_thought="The user suffers from diabetes and fell down the stairs due to weakness and broke their leg",
    answer="The user has a broken leg.",
)


[1;35mAIResponse[0m[1m([0m
    [33mchain_of_thought[0m=[32m'The user suffers from diabetes and fell down the stairs due to weakness and broke their leg'[0m,
    [33manswer[0m=[32m'The user has a broken leg.'[0m
[1m)[0m

In [35]:
class QuestionAnswer(BaseModel):
    question: str
    answer: str


question = "What is the meaning of life?"
context = (
    "The according to the devil the meaning of life is a life of sin and debauchery."
)


resp = client.chat.completions.create(
    model="gpt-4-1106-preview",
    response_model=QuestionAnswer,
    messages=[
        {
            "role": "system",
            "content": "You are a system that answers questions based on the context. answer exactly what the question asks using the context.",
        },
        {
            "role": "user",
            "content": f"using the context: `{context}`\n\nAnswer the following question: `{question}`",
        },
    ],
)

resp.answer

[32m'a life of sin and debauchery.'[0m

In [37]:
from pydantic import BeforeValidator


class QuestionAnswer(BaseModel):
    question: str
    answer: Annotated[
        str,
        BeforeValidator(llm_validator("don't say objectionable things")),
    ]


resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=QuestionAnswer,
    max_retries=2,
    messages=[
        {
            "role": "system",
            "content": "You are a system that answers questions based on the context. answer exactly what the question asks using the context.",
        },
        {
            "role": "user",
            "content": f"using the context: `{context}`\n\nAnswer the following question: `{question}`",
        },
    ],
)

resp.answer

In [38]:
from typing import List
from openai import OpenAI
from pydantic import BaseModel, ValidationInfo, field_validator
import instructor

client = instructor.patch(OpenAI())


class Statements(BaseModel):
    body: str
    substring_quote: str

    @field_validator("substring_quote")
    @classmethod
    def substring_quote_exists(cls, v: str, info: ValidationInfo):
        context = info.context.get("text_chunks", None)

        for text_chunk in context.values():
            if v in text_chunk:  #
                return v
        raise ValueError("Could not find substring_quote `{v}` in contexts")


class AnswerWithCitaton(BaseModel):
    question: str
    answer: List[Statements]

In [42]:
AnswerWithCitaton.model_validate(
    {
        "question": "What is the capital of France?",
        "answer": [
            {"body": "Paris", "substring_quote": "Paris is not the capital of France"},
        ],
    },
    context={
        "text_chunks": {
            1: "Jason is a pirate",
            2: "Paris is not the capital of France",
            3: "Irrelevant data",
        }
    },
)


[1;35mAnswerWithCitaton[0m[1m([0m
    [33mquestion[0m=[32m'What is the capital of France?'[0m,
    [33manswer[0m=[1m[[0m[1;35mStatements[0m[1m([0m[33mbody[0m=[32m'Paris'[0m, [33msubstring_quote[0m=[32m'Paris is not the capital of France'[0m[1m)[0m[1m][0m
[1m)[0m

In [45]:
class Validation(BaseModel):
    is_valid: bool
    error_messages: Optional[str] = Field(None, description="Error messages if any")


class Statements(BaseModel):
    body: str
    substring_quote: str

    @model_validator(mode="after")
    def substring_quote_exists(self, info: ValidationInfo):
        context = info.context.get("text_chunks", None)

        resp: Validation = client.chat.completions.create(
            response_model=Validation,
            messages=[
                {
                    "role": "user",
                    "content": f"Does the following citation exist in the following context?\n\nCitation: {self.substring_quote}\n\nContext: {context}",
                }
            ],
            model="gpt-3.5-turbo",
        )

        if resp.is_valid:
            return self

        raise ValueError(resp.error_messages)


class AnswerWithCitaton(BaseModel):
    question: str
    answer: List[Statements]


resp = AnswerWithCitaton.model_validate(
    {
        "question": "What is the capital of France?",
        "answer": [
            {"body": "Paris", "substring_quote": "Paris is the capital of France"},
        ],
    },
    context={
        "text_chunks": {
            1: "Jason is a pirate",
            2: "Paris is the capital of India",
            3: "Irrelevant data",
        }
    },
)
print(resp.model_dump_json(indent=2))

## Job-Find tests

In [81]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [92]:
from typing import List, Optional
from pydantic import BaseModel, Field, ConfigDict, AfterValidator, BeforeValidator
from instructor import llm_validator
from enum import Enum

In [51]:
class JobType(str, Enum):
    FULL_TIME = "Full-time"
    PART_TIME = "Part-time"
    CONTRACT = "Contract"
    TEMPORARY = "Temporary"
    INTERN = "Intern"
    VOLUNTEER = "Volunteer"
    REMOTE = "Remote"
    HYBRID = "Hybrid"


class SkillType(str, Enum):
    TECHNICAL = "Technical"
    SOFT = "Soft"
    LANGUAGE = "Language"
    CERTIFICATION = "Certification"
    OTHER = "Other"

In [52]:
class Skill(BaseModel):
    index: int = Field(description="Unique identifier for the skill.")

    skill_type: SkillType = Field(
        description="The type of skill (e.g., Technical, Soft, Language, Certification, Other)."
    )

    name: str = Field(..., description="The name of the skill.")

    proficiency_level: Optional[str] = Field(
        None,
        description="Proficiency level of the skill, if applicable. (e.g., Beginner, Intermediate, Advanced)",
    )

    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {
                    "index": 1,
                    "skill_type": "Technical",
                    "name": "Python",
                    "proficiency_level": "Intermediate",
                },
                {
                    "index": 2,
                    "skill_type": "Soft",
                    "name": "Communication",
                    "proficiency_level": "Advanced",
                },
                {
                    "index": 3,
                    "skill_type": "Language",
                    "name": "Spanish",
                    "proficiency_level": "Beginner",
                },
                {
                    "index": 4,
                    "skill_type": "Technical",
                    "name": "Cloud Services",
                    "proficiency_level": "Advanced",
                },
            ]
        }
    )

In [53]:
class Responsibility(BaseModel):
    index: int = Field(description="Unique identifier for the responsibility.")
    description: str = Field(description="Job responsibility or task summary")


class Qualification(BaseModel):
    index: int = Field(description="Unique identifier for the qualification.")
    description: str = Field(description="Job qualification or requirement")


class ExperienceRequirement(BaseModel):
    index: int = Field(description="Unique identifier for the experience requirement.")
    description: str = Field(description="Job experience requirement or summary")


class Benefit(BaseModel):
    index: int = Field(description="Unique identifier for the benefit.")
    description: str = Field(description="A detailed description of a job benefit.")

In [54]:
class AdditionalRequirement(BaseModel):
    index: int = Field(description="Unique identifier for the additional requirement.")
    description: str = Field(
        description="A detailed description of an additional requirement. This could include specific certifications, specific software knowledge, specific tools, etc."
    )

    model_config = ConfigDict(
        json_schema_extra={
            "examples": [
                {"index": 1, "description": "Salesforce CRM"},
                {"index": 2, "description": "Azure"},
                {"index": 3, "description": "Databricks"},
            ]
        }
    )

In [55]:
class EducationRequirement(BaseModel):
    level: str = Field(description="The required level of education.")
    field_of_study: Optional[str] = Field(
        None, description="The required field of study, if applicable."
    )

In [56]:
class ContactInformation(BaseModel):
    email: Optional[str] = Field(
        description="The email address to send applications to.",
        example="hr@acmecorp.com",
        default=None,
    )

    telephone: Optional[str] = Field(
        description="The telephone number to contact for inquiries.",
        example="+1 (123) 456-7890",
        default=None,
    )

    website: Optional[str] = Field(
        description="The company's website for more information.",
        example="https://www.acmecorp.com",
        default=None,
    )

    other: Optional[str] = Field(
        description="Any other contact information.",
        example="LinkedIn: acmecorp",
        default=None,
    )

In [57]:
class SalaryRange(BaseModel):
    minimum: Optional[float] = Field(
        None, description="The minimum salary for the job.", example=80000.00
    )
    maximum: Optional[float] = Field(
        None, description="The maximum salary for the job.", example=120000.00
    )
    currency: Optional[str] = Field(
        None, description="The currency of the salary.", example="USD"
    )


In [59]:
class JobDescription(BaseModel):
    title: str = Field(
        description="The title of the job position.", example="Software Engineer"
    )

    company_information: str = Field(
        description="Information about the company.",
        example="ACME Corp is a leading innovator in the tech industry.",
    )

    industry: Optional[str] = Field(
        description="The industry sector the job belongs to.", example="Technology"
    )

    location: Optional[str] = Field(
        description="The geographical location of the job in the format City, State, Country.",
        example="San Francisco, CA, USA",
    )

    job_type: JobType = Field(
        description="The type of employment.", example="FULL_TIME"
    )

    years_of_experience_required: Optional[int] = Field(
        None, description="The minimum years of experience required.", example=3
    )

    years_of_experience_preferred: Optional[int] = Field(
        None, description="The preferred years of experience.", example=5
    )

    education: List[EducationRequirement] = Field(
        [], description="List of educational requirements."
    )

    salary_range: SalaryRange = Field(None, description="The salary range")

    responsibilities: List[Responsibility] = Field(
        [], description="List of job responsibilities."
    )

    qualifications_required: List[Qualification] = Field(
        [], description="List of required qualifications."
    )

    qualifications_preferred: List[Qualification] = Field(
        [], description="List of preferred qualifications."
    )

    experience: List[ExperienceRequirement] = Field(
        [], description="List of required experiences."
    )

    benefits: List[Benefit] = Field([], description="List of job benefits.")

    culture: Optional[str] = Field(
        None,
        description="Description of the company culture.",
        example="Innovative, collaborative, and employee-focused.",
    )

    skills: List[Skill] = Field(
        [],
        description="List of skills required for the job position.",
    )

    additional_requirements: List[AdditionalRequirement] = Field(
        [], description="Any additional requirements for the job."
    )
    contact_information: Optional[ContactInformation] = Field(
        None, description="Contact information for job inquiries."
    )



In [137]:
from sample_jd import job_description as sample_jd

In [138]:
print(sample_jd)

In [146]:
def job_description_contains_value(v: str, info: ValidationInfo) -> str:
    context = info.context
    if context:
        print(context, v)
        context_text = context.get("text_chunk", "")
        if v.lower() not in context_text.lower():
            raise ValueError(f"`{v}` not found in text. Please provide a valid value or return None.")
    return v


class TestJobDescription(BaseModel):
    title: Annotated[
        str, AfterValidator(job_description_contains_value)
    ] = Field(..., description="The title of the job position.")
    company_name: str = Field(..., description="The name of the company.")

In [147]:
client = instructor.patch(OpenAI())

resp = client.chat.completions.create(
    model="gpt-4-turbo-preview",
    response_model=TestJobDescription,
    max_retries=2,
    messages=[
        {
            "role": "system",
            "content": "Extract verbatim details about the job description:",
        },
        {"role": "user", "content": sample_jd},
    ],
    temperature=0.0,
    validation_context={"text_chunk": sample_jd},
)

In [143]:
resp

[1;35mTestJobDescription[0m[1m([0m[33mtitle[0m=[32m' '[0m, [33mcompany_name[0m=[32m'Desjardins'[0m[1m)[0m