lint documentation (#403)
jxnl committed Feb 5, 2024
1 parent a56ae6b commit edc22b8
Showing 44 changed files with 583 additions and 286 deletions.
15 changes: 5 additions & 10 deletions docs/blog/posts/anyscale.md
@@ -50,27 +50,22 @@ class UserDetails(BaseModel):
name: str
age: int


# enables `response_model` in create call
client = instructor.patch(
OpenAI(
base_url="https://api.endpoints.anyscale.com/v1",
api_key="<YOUR_ANYSCALE_API_KEY>"
api_key="<YOUR_ANYSCALE_API_KEY>",
),
# This uses Anyscale's json schema output mode
mode=instructor.Mode.JSON_SCHEMA
mode=instructor.Mode.JSON_SCHEMA,
)

resp = client.chat.completions.create(
model="mistralai/Mixtral-8x7B-Instruct-v0.1",
messages=[
{
"role": "system",
"content": "You are a world class extractor"
},
{
"role": "user",
"content": 'Extract the following entities: "Jason is 20"'
},
{"role": "system", "content": "You are a world class extractor"},
{"role": "user", "content": 'Extract the following entities: "Jason is 20"'},
],
response_model=UserDetails,
)
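
For context, a hedged sketch of inspecting the structured result; the printed repr below is an assumption, not output shown in the post:

```python
# Hypothetical inspection of the extracted object (assumes the call above succeeds)
print(resp)
#> UserDetails(name='Jason', age=20)
```
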
44 changes: 27 additions & 17 deletions docs/blog/posts/caching.md
@@ -30,18 +30,20 @@ from pydantic import BaseModel
# Enables `response_model`
client = instructor.patch(OpenAI())


class UserDetail(BaseModel):
name: str
age: int


def extract(data) -> UserDetail:
return client.chat.completions.create(
model="gpt-3.5-turbo",
response_model=UserDetail,
messages=[
{"role": "user", "content": data},
]
)
model="gpt-3.5-turbo",
response_model=UserDetail,
messages=[
{"role": "user", "content": data},
],
)
```

Now imagine batch processing data, running tests or experiments, or simply calling `extract` multiple times over a workflow. We'll quickly run into performance issues, as the function may be called repeatedly, and the same data will be processed over and over again, costing us time and money.
@@ -53,14 +55,15 @@ Now imagine batch processing data, running tests or experiments, or simply calli
```python
import functools


@functools.cache
def extract(data):
return client.chat.completions.create(
model="gpt-3.5-turbo",
response_model=UserDetail,
messages=[
{"role": "user", "content": data},
]
],
)
```

@@ -128,12 +131,13 @@ print(f"Time taken: {time.perf_counter() - start}")
import inspect
import diskcache

cache = diskcache.Cache('./my_cache_directory') # (1)
cache = diskcache.Cache('./my_cache_directory') # (1)


def instructor_cache(func):
"""Cache a function that returns a Pydantic model"""
return_type = inspect.signature(func).return_annotation
if not issubclass(return_type, BaseModel): # (2)
if not issubclass(return_type, BaseModel): # (2)
raise ValueError("The return type must be a Pydantic model")

@functools.wraps(func)
@@ -176,13 +180,15 @@ cache = diskcache.Cache('./my_cache_directory')

def instructor_cache(func):
"""Cache a function that returns a Pydantic model"""
return_type = inspect.signature(func).return_annotation # (4)
if not issubclass(return_type, BaseModel): # (1)
return_type = inspect.signature(func).return_annotation # (4)
if not issubclass(return_type, BaseModel): # (1)
raise ValueError("The return type must be a Pydantic model")

@functools.wraps(func)
def wrapper(*args, **kwargs):
key = f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}" # (2)
key = (
f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}" # (2)
)
# Check if the result is already cached
if (cached := cache.get(key)) is not None:
# Deserialize from JSON based on the return type (3)
@@ -197,18 +203,20 @@ def instructor_cache(func):

return wrapper


class UserDetail(BaseModel):
name: str
age: int


@instructor_cache
def extract(data) -> UserDetail:
return client.chat.completions.create(
model="gpt-3.5-turbo",
response_model=UserDetail,
messages=[
{"role": "user", "content": data},
]
],
)
```

@@ -232,6 +240,7 @@ def extract(data) -> UserDetail:

cache = redis.Redis("localhost")


def instructor_cache(func):
"""Cache a function that returns a Pydantic model"""
return_type = inspect.signature(func).return_annotation
@@ -264,7 +273,6 @@ def extract(data) -> UserDetail:
import redis
import functools
import inspect
import json
import instructor

from pydantic import BaseModel
@@ -273,15 +281,16 @@ from openai import OpenAI
client = instructor.patch(OpenAI())
cache = redis.Redis("localhost")


def instructor_cache(func):
"""Cache a function that returns a Pydantic model"""
return_type = inspect.signature(func).return_annotation
if not issubclass(return_type, BaseModel): # (1)
if not issubclass(return_type, BaseModel): # (1)
raise ValueError("The return type must be a Pydantic model")

@functools.wraps(func)
def wrapper(*args, **kwargs):
key = f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}" # (2)
key = f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}" # (2)
# Check if the result is already cached
if (cached := cache.get(key)) is not None:
# Deserialize from JSON based on the return type
@@ -301,6 +310,7 @@ class UserDetail(BaseModel):
name: str
age: int


@instructor_cache
def extract(data) -> UserDetail:
# Assuming client.chat.completions.create returns a UserDetail instance
@@ -309,7 +319,7 @@ def extract(data) -> UserDetail:
response_model=UserDetail,
messages=[
{"role": "user", "content": data},
]
],
)
```
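
As a hedged usage sketch (not part of the diff above), the decorated function could be exercised as follows; the input string, a Redis server running locally, and the cache hit on the second call are assumptions:

```python
# Sketch only: assumes redis-server is reachable on localhost with default settings
first = extract("Extract jason is 25 years old")  # cache miss: calls the API, stores JSON in Redis
second = extract("Extract jason is 25 years old")  # same key: deserialized straight from Redis
assert first == second  # Pydantic models with identical field values compare equal
```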

2 changes: 1 addition & 1 deletion docs/blog/posts/chain-of-density.md
@@ -478,7 +478,7 @@ instructor jobs create-from-file generated.jsonl
Once the job is complete, all we need to do is to then change the annotation in the function call to `distil_summarization` in our original file above to start using our new model.

```py
@instructions.distil(model='gpt-3.5-turbo:finetuned-123', mode="dispatch") #(1)!
@instructions.distil(model='gpt-3.5-turbo:finetuned-123', mode="dispatch") # (1)!
def distil_summarization(text: str) -> GeneratedSummary:
summary_chain: List[str] = summarize_article(text)
return GeneratedSummary(summary=summary_chain[-1])
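
# Hedged usage sketch (not from the post): `article_text` is an assumed input string,
# and the call relies on the redecorated function dispatching to the finetuned model.
summary = distil_summarization(article_text)
print(summary.summary)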
7 changes: 4 additions & 3 deletions docs/blog/posts/citations.md
@@ -27,13 +27,14 @@ In this example, we use the `Statements` class to verify if a given substring qu
### Code Example:

```python
from typing import List, Optional
from typing import List
from openai import OpenAI
from pydantic import BaseModel, Field, ValidationError, ValidationInfo, field_validator, model_validator
from pydantic import BaseModel, ValidationInfo, field_validator
import instructor

client = instructor.patch(OpenAI())


class Statements(BaseModel):
body: str
substring_quote: str
@@ -44,7 +45,7 @@ class Statements(BaseModel):
context = info.context.get("text_chunks", None)

for text_chunk in context.values():
if v in text_chunk: # (1)
if v in text_chunk: # (1)
return v
raise ValueError("Could not find substring_quote `{v}` in contexts")
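
# Hedged, module-level sketch (separate from the class above; all values are made up):
# Pydantic v2's `model_validate` accepts a `context` dict, which is what
# `info.context` reads inside the field validator.
statement = Statements.model_validate(
    {"body": "Jason is 20 years old", "substring_quote": "Jason is 20"},
    context={"text_chunks": {1: "Jason is 20 and works as an engineer."}},
)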

40 changes: 28 additions & 12 deletions docs/blog/posts/distilation-part1.md
@@ -32,7 +32,7 @@ Before we dig into the nitty-gritty, let's look at how easy it is to use Instruc
import logging
import random
from pydantic import BaseModel
from instructor import Instructions # pip install instructor
from instructor import Instructions # pip install instructor

# Logging setup
logging.basicConfig(level=logging.INFO)
@@ -43,14 +43,16 @@ instructions = Instructions(
# log handler is used to save the data to a file
# you can imagine saving it to a database or other storage
# based on your needs!
log_handlers=[logging.FileHandler("math_finetunes.jsonl")]
log_handlers=[logging.FileHandler("math_finetunes.jsonl")],
)


class Multiply(BaseModel):
a: int
b: int
result: int


# Define a function with distillation
# The decorator will automatically generate a dataset for fine-tuning
# They must return a pydantic model to leverage function calling
@@ -59,11 +61,22 @@ def fn(a: int, b: int) -> Multiply:
resp = a * b
return Multiply(a=a, b=b, result=resp)


# Generate some data
for _ in range(10):
a = random.randint(100, 999)
b = random.randint(100, 999)
print(fn(a, b))
#> a=873 b=234 result=204282
#> a=902 b=203 result=183106
#> a=962 b=284 result=273208
#> a=491 b=739 result=362849
#> a=193 b=400 result=77200
#> a=300 b=448 result=134400
#> a=952 b=528 result=502656
#> a=574 b=797 result=457478
#> a=482 b=204 result=98328
#> a=781 b=278 result=217118
```

## The Intricacies of Fine-tuning Language Models
@@ -90,17 +103,17 @@ Here's how the logging output would look:
"messages": [
{"role": "system", "content": 'Predict the results of this function: ...'},
{"role": "user", "content": 'Return fn(133, b=539)'},
{"role": "assistant",
"function_call":
{
"name": "Multiply",
"arguments": '{"a":133,"b":539,"result":89509}'
}
}
{
"role": "assistant",
"function_call": {
"name": "Multiply",
"arguments": '{"a":133,"b":539,"result":89509}',
},
},
],
"functions": [
{"name": "Multiply", "description": "Correctly extracted `Multiply`..."}
]
],
}
```

@@ -121,18 +134,21 @@ Here's a sneak peek of what I'm planning:
```python
from instructor import Instructions, patch

patch() #(1)!
patch() # (1)!


class Multiply(BaseModel):
a: int
b: int
result: int


instructions = Instructions(
name="three_digit_multiply",
)

@instructions.distil(model='gpt-3.5-turbo:finetuned-123', mode="dispatch") # (2)!

@instructions.distil(model='gpt-3.5-turbo:finetuned-123', mode="dispatch") # (2)!
def fn(a: int, b: int) -> Multiply:
resp = a + b
return Multiply(a=a, b=b, result=resp)