In [1]:
from openai import OpenAI
from pydantic import BaseModel
import instructor


# Structured-output target for the first demo: it is passed as `response_model`
# in the `create` call further down this cell.
class UserDetail(BaseModel):
    name: str  # the user's name
    age: int  # the user's age


# Patch the OpenAI client so that `create` accepts `response_model`.
# Requests are sent to the endpoint at localhost:11434 in JSON mode.
client = instructor.patch(
    OpenAI(api_key="ollama", base_url="http://localhost:11434/v1"),  # key is required, but unused
    mode=instructor.Mode.JSON,
)

# Extract a `UserDetail` from a single free-text sentence.
user = client.chat.completions.create(
    response_model=UserDetail,
    model="adrienbrault/nous-hermes2pro:Q8_0",
    messages=[{"role": "user", "content": "Jason is 30 years old"}],
)

print(user)
#> name='Jason' age=30

name='Jason' age=30


In [2]:
from typing import Iterable

def generate_fake_users(count: int) -> Iterable[UserDetail]:
    """Ask the model for synthetic users.

    Args:
        count: Number of users to request; it is interpolated into the prompt.

    Returns:
        The result of the patched ``create`` call made with
        ``response_model=Iterable[UserDetail]``.
    """
    prompt = f"Generate a {count} synthetic users"
    user_message = {"role": "user", "content": prompt}
    return client.chat.completions.create(
        response_model=Iterable[UserDetail],
        model="adrienbrault/nous-hermes2pro:Q8_0",
        messages=[user_message],
    )


# Request five synthetic users, then print each one on its own line.
fake_users = generate_fake_users(5)
for user in fake_users:
    print(user)

name='John Doe' age=32
name='Jane Smith' age=27
name='Alexander Brown' age=45
name='Alice Johnson' age=30
name='Tom Green' age=19


In [4]:
from pydantic import Field, BaseModel, model_validator, ValidationInfo
from typing import List
import re


class Fact(BaseModel):
    # A statement together with the quotes from the source text that back it up.
    fact: str = Field(...)
    substring_quote: List[str] = Field(...)

    @model_validator(mode="after")
    def validate_sources(self, info: ValidationInfo) -> "Fact":
        """Replace each quote with its exact occurrences in the source text.

        The source text is read from the validation context under the
        ``"text_chunk"`` key. Quotes that do not occur in it contribute
        nothing, so a fact without any matching quote ends up with an empty
        ``substring_quote`` list.

        Raises:
            ValueError: If the validation context is missing or has no
                ``"text_chunk"`` entry, since the quotes cannot be checked.
        """
        # `info.context` is None when the caller passed no validation context.
        text_chunks = (info.context or {}).get("text_chunk")
        if text_chunks is None:
            raise ValueError(
                "Fact validation requires a validation context with a 'text_chunk' entry"
            )
        self.substring_quote = [
            text_chunks[start:end] for start, end in self.get_spans(text_chunks)
        ]
        return self

    def get_spans(self, context):
        """Yield a ``(start, end)`` span for every occurrence of every quote in ``context``."""
        for quote in self.substring_quote:
            yield from self._get_span(quote, context)

    def _get_span(self, quote, context):
        """Yield a ``(start, end)`` span for each literal occurrence of ``quote`` in ``context``."""
        if not quote:
            # An empty pattern matches at every position of the text, which
            # would turn an empty quote into many empty-string "citations".
            return
        for match in re.finditer(re.escape(quote), context):
            yield match.span()

In [5]:
class QuestionAnswer(BaseModel):
    # A question together with the list of facts that answer it.
    question: str = Field(...)
    answer: List[Fact] = Field(...)

    @model_validator(mode="after")
    def validate_sources(self) -> "QuestionAnswer":
        """Keep only the facts that still carry at least one quote."""
        supported = []
        for fact in self.answer:
            if fact.substring_quote:
                supported.append(fact)
        self.answer = supported
        return self

In [10]:
# Display the JSON schema generated for the nested QuestionAnswer / Fact models.
QuestionAnswer.model_json_schema()

{'$defs': {'Fact': {'properties': {'fact': {'title': 'Fact', 'type': 'string'},
    'substring_quote': {'items': {'type': 'string'},
     'title': 'Substring Quote',
     'type': 'array'}},
   'required': ['fact', 'substring_quote'],
   'title': 'Fact',
   'type': 'object'}},
 'properties': {'question': {'title': 'Question', 'type': 'string'},
  'answer': {'items': {'$ref': '#/$defs/Fact'},
   'title': 'Answer',
   'type': 'array'}},
 'required': ['question', 'answer'],
 'title': 'QuestionAnswer',
 'type': 'object'}

### Looks like complicated cases are not handled well

Why? OpenAI models are really good at handling cases with a complicated JSON structure.


In [11]:
# Flattened variant of QuestionAnswer: a plain string answer plus a list of
# citation strings, instead of a list of nested Fact objects.
class QuestionAnswer(BaseModel):
    question: str
    answer: str
    citations: List[str]


In [15]:
# Display the JSON schema generated for the flattened QuestionAnswer model.
QuestionAnswer.model_json_schema()

{'properties': {'question': {'title': 'Question', 'type': 'string'},
  'answer': {'title': 'Answer', 'type': 'string'},
  'citations': {'items': {'type': 'string'},
   'title': 'Citations',
   'type': 'array'}},
 'required': ['question', 'answer', 'citations'],
 'title': 'QuestionAnswer',
 'type': 'object'}

In [12]:

def ask_ai(question: str, context: str) -> QuestionAnswer:
    """Answer ``question`` from ``context`` and return a ``QuestionAnswer``.

    The context and the question are sent as two separate user messages after
    the system prompt. ``context`` is also forwarded to the call as
    ``validation_context["text_chunk"]``.
    """
    system_message = {
        "role": "system",
        "content": "You are a world class algorithm to answer questions with correct and exact citations.",
    }
    context_message = {"role": "user", "content": f"{context}"}
    question_message = {"role": "user", "content": f"Question: {question}"}
    return client.chat.completions.create(
        response_model=QuestionAnswer,
        model="adrienbrault/nous-hermes2pro:Q8_0",
        temperature=0.1,
        messages=[system_message, context_message, question_message],
        validation_context={"text_chunk": context},
    )

In [14]:
# Sample question and the source text the answer has to be drawn from.
question = "What did the author do during college?"
context = """
My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.
I went to an arts high school but in university I studied Computational Mathematics and physics.
As part of coop I worked at many companies including Stitchfix, Facebook.
I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.
"""

# Run the question through the model; `response` is inspected in the next cells.
response = ask_ai(question, context)

In [16]:
# Show the `question` field of the response.
response.question

'What did the author do during college?'

In [18]:
from IPython.display import display
# Show the `answer` field of the response.
display(response.answer)


'During college, the author studied Computational Mathematics and physics and worked at many companies including Stitchfix and Facebook. They also started the Data Science club at the University of Waterloo and served as its president for 2 years.'

In [19]:
from IPython.display import display
# Show the `citations` field of the response.
display(response.citations)

['My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.',
 'I went to an arts high school but in university I studied Computational Mathematics and physics.',
 'As part of coop I worked at many companies including Stitchfix, Facebook.',
 'I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.']

In [20]:
# Five more question/context pairs (question1..question5 with context1..context5);
# each context is a short biography and each question asks about one detail in it.
question1 = "Where was John born?"
context1 = """
John Doe is a software engineer who was born in New York, USA. 
He studied Computer Science at the Massachusetts Institute of Technology. 
During his studies, he interned at Google and Microsoft. 
He also founded the Artificial Intelligence club at his university and served as its president for three years.
"""


question2 = "What did Emily study in university?"
context2 = """
Emily Smith is a data scientist from London, England. 
She attended the University of Cambridge where she studied Statistics and Machine Learning. 
She interned at IBM and Amazon during her summer breaks. 
Emily was also the head of the Women in Tech society at her university.
"""

question3 = "Which companies did Robert intern at?"
context3 = """
Robert Johnson, originally from Sydney, Australia, is a renowned cybersecurity expert. 
He studied Information Systems at the University of Melbourne. 
Robert interned at several cybersecurity firms including NortonLifeLock and McAfee. 
He was also the leader of the Cybersecurity club at his university.
"""


question4 = "What club did Alice start at her university?"
context4 = """
Alice Williams, a native of Dublin, Ireland, is a successful web developer. 
She studied Software Engineering at Trinity College Dublin. 
Alice interned at several tech companies including Shopify and Squarespace. 
She started the Web Development club at her university and was its president for two years.
"""


question5 = "What did Michael study in high school?"
context5 = """
Michael Brown is a game developer from Tokyo, Japan. 
He attended a specialized high school where he studied Game Design. 
He later attended the University of Tokyo where he studied Computer Science. 
Michael interned at Sony and Nintendo during his university years. 
He also started the Game Developers club at his university.
"""

In [21]:
class QuestionAnswer(BaseModel):
    # Flat answer model whose fields carry guidance text for the language model.
    # The guidance is passed as `description=`: given positionally it would be
    # taken as the field's default value, which makes the field optional and
    # gives `citations` a plain string default instead of a list.
    question: str = Field(...)
    answer: str = Field(
        ..., description="Formated answer with citations in proper markdown format."
    )
    citations: List[str] = Field(
        ..., description="List of citations for the answer."
    )


In [22]:

def ask_ai(question: str, context: str) -> QuestionAnswer:
    """Answer ``question`` from ``context`` and return a ``QuestionAnswer``.

    Three messages are sent: a system prompt, the context as a user message,
    and the question as a second user message. ``context`` is additionally
    passed to the call as ``validation_context["text_chunk"]``.
    """
    conversation = [
        {
            "role": "system",
            "content": "You are a world class algorithm to answer questions with correct and exact citations",
        },
        {"role": "user", "content": f"{context}"},
        {"role": "user", "content": f"Question: {question}"},
    ]
    request_options = {
        "model": "adrienbrault/nous-hermes2pro:Q8_0",
        "temperature": 0.1,
        "response_model": QuestionAnswer,
        "messages": conversation,
        "validation_context": {"text_chunk": context},
    }
    return client.chat.completions.create(**request_options)

In [23]:
from rich.pretty import pprint
# Case 1: ask question1 against context1 and pretty-print the response as a dict.
response = ask_ai(question1, context1)

pprint(response.model_dump())

In [24]:
from rich.pretty import pprint
# Case 2: ask question2 against context2 and pretty-print the response as a dict.
response = ask_ai(question2, context2)

pprint(response.model_dump())

In [25]:

# Case 3: ask question3 against context3 and pretty-print the response as a dict.
response = ask_ai(question3, context3)

pprint(response.model_dump())

In [26]:

# Case 4: ask question4 against context4 and pretty-print the response as a dict.
response = ask_ai(question4, context4)

pprint(response.model_dump())

In [27]:

# Case 5: ask question5 against context5 and pretty-print the response as a dict.
response = ask_ai(question5, context5)

pprint(response.model_dump())