# set up

In [29]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Read the key from the environment so it is never written into the notebook.
# The value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chat model used by the extraction calls; swap the comment to compare models.
MODEL = "gpt-4o-mini"
# MODEL = "gpt-3.5-turbo"

# read files

In [30]:
import os

# Directory containing the Markdown files
dir_path = os.path.abspath("../data/math_books")
# Upper bound on how many Markdown files are loaded into `texts`
max_files = 5

# os.listdir() returns entries in arbitrary order, so sort for a reproducible
# selection (texts[1], texts[2], ... then always refer to the same files).
# The extension filter is applied *before* the limit so that non-Markdown
# entries in the directory do not use up the quota.
md_filenames = sorted(
    name for name in os.listdir(dir_path) if name.endswith(".md")
)[:max_files]

# Full text of each selected file, in the same order as `md_filenames`
texts = []
for filename in md_filenames:
    with open(os.path.join(dir_path, filename), "r", encoding="utf-8") as file:
        texts.append(file.read())

print(len(texts))


5


# bottom up model and extract function

In [34]:
# pydantic model
from pydantic import BaseModel, Field
from typing import List, Optional

class Tag(BaseModel):
    """Represents a meaningful tag for a product description, useful for search navigation."""

    # Tag text; required (no default).
    name: str = Field(
        default=...,
        description="The name of the tag, representing a key feature or attribute of the product which should help users find it."
    )

    # Disabled field, kept for reference:
    # synonyms: Optional[List[str]] = Field(
    #     default_factory=list,
    #     description="A list of synonyms or related terms that can be used interchangeably with this tag."
    # )

    # Declared without a default, so the key must be supplied; the Optional
    # type means its value may be None.
    relevance_score: Optional[float] = Field(
        default=...,
        description="An optional score representing the importance or relevance of this tag to the product, typically ranging from 0 to 1."
    )
    
class Tags(BaseModel):
    """A collection of tags that can be used to describe a product."""

    # Optional[str] because the default is None: with a plain `str` annotation
    # an explicit null title in the input would fail validation.
    title: Optional[str] = Field(
        None,
        description="Book title if available"
    )
    # Required list of Tag entries.
    tags: List[Tag] = Field(
        ...,
        description="A list of tags that can be used to describe a product."
    )

def get_messages(data):
    """Wrap `data` as the single user turn of a chat message list.

    Args:
        data: Content to place in the user message.

    Returns:
        A new one-element list holding a dict with ``role`` set to ``"user"``
        and ``content`` set to `data`.
    """
    user_turn = dict(role="user", content=data)
    return [user_turn]

import instructor
from openai import OpenAI

# Module-level client used by extract_tags: an OpenAI client wrapped by
# instructor. OpenAI() is constructed without arguments here.
# NOTE(review): presumably the API key is picked up from the environment
# (OPENAI_API_KEY) — confirm.
client = instructor.from_openai(OpenAI())

def extract_tags(
    content: str,
    extr_model: "type[BaseModel]",
    *,
    model: Optional[str] = None,
    temperature: float = 0.0,
    max_retries: int = 2,
):
    """Request structured tags for `content` from the chat model.

    A fixed system prompt describing the tagging task is placed in front of
    the user message built by get_messages(content), and the module-level
    `client` is asked for a completion parsed into `extr_model`.

    Args:
        content: Text to tag; sent as the user message.
        extr_model: The pydantic model *class* (for example ``Tags`` or
            ``TagsV``) passed as ``response_model``.
        model: Model name for this call. When None, the module-level ``MODEL``
            is read at call time, so re-assigning ``MODEL`` in the notebook
            takes effect without re-running this definition.
        temperature: Sampling temperature forwarded to the API call.
        max_retries: ``max_retries`` value forwarded to the API call.

    Returns:
        A ``(tags, usage)`` tuple: the parsed result and ``completion.usage``.

    Side effects:
        Prints the prompt and completion token counts of the call.
    """
    # System prompt that guides the model's behavior; it goes first in the list.
    system_prompt = {
        "role": "system",
        "content": "You are a tagging algorithm designed to analyze product descriptions and generate meaningful, categorized tags that improve search navigation and filtering. Each tag should represent a key feature, attribute, or characteristic of the product. Input: ",
    }
    messages = [system_prompt, *get_messages(content)]

    # Extract structured data from natural language
    tags, completion = client.chat.completions.create_with_completion(
        temperature=temperature,
        model=MODEL if model is None else model,
        response_model=extr_model,
        messages=messages,
        max_retries=max_retries,
    )

    print(f'costs prompt: {completion.usage.prompt_tokens}, completion: {completion.usage.completion_tokens}')
    return tags, completion.usage

# verticals model

In [52]:
from pydantic import BaseModel, Field
from typing import List, Optional


class TagsV(BaseModel):
    """A collection of tags that can be used to describe a product, along with additional metadata."""

    # --- optional: may be omitted, defaults to None ---
    title: Optional[str] = Field(
        default=None,
        description="Book title if available"
    )

    # --- required vertical attributes ---
    product_type: str = Field(
        default=...,
        description="Type of the product (e.g., eBook, course, template)"
    )
    target_audience: str = Field(
        default=...,
        description="Target audience for the product (e.g., beginner, intermediate, advanced)"
    )
    main_content_category: str = Field(
        default=...,
        description="Main category of the content (e.g., education, art, gaming)"
    )

    # --- optional refinements, default to None ---
    content_subcategory: Optional[str] = Field(
        default=None,
        description="Subcategory within the main content category (e.g., digital_art, tabletop_gaming)"
    )
    age_group: Optional[str] = Field(
        default=None,
        description="Age group targeted by the product only if the group is NOT adults (e.g., children, teenagers, 5th_grade)"
    )

    # --- required free-form tags, plain strings ---
    tags: List[str] = Field(
        default=...,
        description="A list of additional tags that can be used to describe a product."
    )




ValidationError: 6 validation errors for TagsV
product_type
  Input should be a valid string [type=string_type, input_value=TagV(name='eBook', relevance_score=0.9), input_type=TagV]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
target_audience
  Input should be a valid string [type=string_type, input_value=TagV(name='beginner', relevance_score=0.9), input_type=TagV]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
content_focus
  Input should be a valid string [type=string_type, input_value=TagV(name='education', relevance_score=0.9), input_type=TagV]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
main_content_category
  Input should be a valid string [type=string_type, input_value=TagV(name='education', relevance_score=0.9), input_type=TagV]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
content_subcategory
  Input should be a valid string [type=string_type, input_value=TagV(name='math_education', relevance_score=0.9), input_type=TagV]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type
age_group
  Input should be a valid string [type=string_type, input_value=TagV(name='teenagers', relevance_score=0.9), input_type=TagV]
    For further information visit https://errors.pydantic.dev/2.7/v/string_type

# process

In [45]:
# Tag the loaded document at index 1 using the `Tags` schema.
text = texts[1]
tags, usage = extract_tags(text, Tags)
# Serialize the returned model to indented JSON for display.
json_data = tags.model_dump_json(indent=4)
print(json_data)
#print(tags)
# Second value returned by extract_tags (completion.usage).
print(usage)

costs prompt: 702, completion: 143
{
    "title": "Essential Math for Data Science | The Book",
    "tags": [
        {
            "name": "Data Science",
            "relevance_score": 1.0
        },
        {
            "name": "Machine Learning",
            "relevance_score": 1.0
        },
        {
            "name": "Mathematics",
            "relevance_score": 1.0
        },
        {
            "name": "Python",
            "relevance_score": 0.8
        },
        {
            "name": "Numpy",
            "relevance_score": 0.7
        },
        {
            "name": "Beginner Friendly",
            "relevance_score": 0.9
        },
        {
            "name": "Hands-on Projects",
            "relevance_score": 0.8
        },
        {
            "name": "No Jargon",
            "relevance_score": 0.6
        },
        {
            "name": "Skill Development",
            "relevance_score": 0.9
        },
        {
            "name": "Visualizations",
            

# process with verticals model

In [53]:
# Hand-written test input for the verticals model: a note about the search
# query "3d printing instruction" returning no results, followed by pasted
# product-page text for an STL file set, and a closing instruction asking for
# tags. The whole string is sent as the user message by the next cell.
ttt = '''
my search 3d printing instruction earns no results
I find this with "3d printing"
 
Search products




Empires of Man - Liongryph Heavy Knights (STL only)
€8
Excellent Minis
5 ratings
You get the stl-files for a complete unit of Human Heavy Cavalry regiment riding on wingless liongryphins (intended for 10-15 mm wargaming). 

This collection contains

-6 unique Miniatures of Knights including a Command Group consisting of an Officer, Musician and Banner (supported and unsupported files included)

The miniatures are supposed to print flat on the print plate.

Happy printing!

Important: By purchasing these files, you agree that you may only use the content provided for personal use, and you are not allowed to distribute or sell the files. You also agree not to sell physical copies.

Physical Copies, printed in ultra detailed and tough resin can only be purchased here:

https://www.excellentminiatures.com/miniatures/epic-scale/warmeister-10mm-miniatures-on-strips/empires-of-man/659/empires-of-man-demigryph-knights



Permission is given to modify the files for your own personal use but you are not permitted to share those derivatives in any way. No permission is given to redistribute parts of the files in any way.

STLs contain parts used under licence by Tartessos Miniatures

Prints best in TGM-7 tough and detailed resin, available here:
https://www.excellentminiatures.com/resin-liquid-for-3d-printers/



Thank you for your support!

Add to cart
Add to wishlist

Ratings
5
(5 ratings)
5 stars

100%
4 stars

0%
3 stars

0%
2 stars

0%
1 star

0%
"Absolutely love them. Gorgeous models! "


Anonymous
2 months ago

provide relevant useful tags and tell if it should be found with the query "3d printing instruction"
'''

In [54]:
# Run the extraction on the hand-written product text `ttt` (instead of a
# loaded file) using the verticals schema `TagsV`.
# NOTE(review): the saved output under this cell shows a `content_focus` field
# and object-valued tags, neither of which the `TagsV` definition in this
# notebook declares — the output looks stale; re-run to confirm.
text = ttt # texts[3]
tags, usage = extract_tags(text, TagsV)
# Serialize the returned model to indented JSON for display.
json_data = tags.model_dump_json(indent=4)
print(json_data)
#print(tags)
# Second value returned by extract_tags (completion.usage).
print(usage)

costs prompt: 819, completion: 187
{
    "title": "Empires of Man - Liongryph Heavy Knights (STL only)",
    "product_type": "STL files",
    "target_audience": "wargamers",
    "content_focus": "3D printing",
    "main_content_category": "gaming",
    "content_subcategory": "miniatures",
    "age_group": null,
    "tags": [
        {
            "name": "3D printing",
            "relevance_score": 1.0
        },
        {
            "name": "STL files",
            "relevance_score": 1.0
        },
        {
            "name": "miniatures",
            "relevance_score": 1.0
        },
        {
            "name": "wargaming",
            "relevance_score": 1.0
        },
        {
            "name": "liongryph knights",
            "relevance_score": 0.8
        },
        {
            "name": "human cavalry",
            "relevance_score": 0.8
        },
        {
            "name": "personal use",
            "relevance_score": 0.5
        },
        {
            "name": "r

In [18]:
# print json
# NOTE(review): `tags` is not assigned in this cell; it relies on whichever
# other cell last assigned it, so the result depends on execution order.
json_data = tags.model_dump_json(indent=4)
print(json_data)

{
    "title": "A Programmer's Introduction to Mathematics: Second Edition (pdf)",
    "tags": [
        {
            "name": "ebook",
            "relevance_score": 0.9
        },
        {
            "name": "mathematics",
            "relevance_score": 0.8
        },
        {
            "name": "programming",
            "relevance_score": 0.7
        },
        {
            "name": "graphs",
            "relevance_score": 0.6
        },
        {
            "name": "calculus",
            "relevance_score": 0.6
        },
        {
            "name": "linear algebra",
            "relevance_score": 0.6
        },
        {
            "name": "eigenvalues",
            "relevance_score": 0.6
        },
        {
            "name": "optimization",
            "relevance_score": 0.6
        },
        {
            "name": "cultural attitudes",
            "relevance_score": 0.5
        },
        {
            "name": "proofs",
            "relevance_score": 0.5
        },
 

In [19]:
# Tag the loaded document at index 1 using the `Tags` schema.
t2 = texts[1]
tags2, usage2 = extract_tags(t2, Tags)
#print(tags2)
# Second value returned by extract_tags (completion.usage).
print(usage2)
# print json
json_data = tags2.model_dump_json(indent=4)
print(json_data)

costs prompt: 725, completion: 108
CompletionUsage(completion_tokens=108, prompt_tokens=725, total_tokens=833)
{
    "title": "Essential Math for Data Science | The Book",
    "tags": [
        {
            "name": "mathematics",
            "relevance_score": 0.9
        },
        {
            "name": "data science",
            "relevance_score": 0.8
        },
        {
            "name": "machine learning",
            "relevance_score": 0.7
        },
        {
            "name": "Python",
            "relevance_score": 0.6
        },
        {
            "name": "beginner-friendly",
            "relevance_score": 0.5
        },
        {
            "name": "high school math",
            "relevance_score": 0.4
        },
        {
            "name": "Numpy",
            "relevance_score": 0.3
        }
    ]
}


In [20]:
# Tag the loaded document at index 2 using the `Tags` schema.
t3 = texts[2]
tags3, usage3 = extract_tags(t3, Tags)
#print(tags3)
# Second value returned by extract_tags (completion.usage).
print(usage3)
# print json
json_data = tags3.model_dump_json(indent=4)
print(json_data)

costs prompt: 2133, completion: 148
CompletionUsage(completion_tokens=148, prompt_tokens=2133, total_tokens=2281)
{
    "title": "Data Analytics Roadmap - Become a data analyst in 2024",
    "tags": [
        {
            "name": "Data Analyst",
            "relevance_score": 0.9
        },
        {
            "name": "Career Guidance",
            "relevance_score": 0.8
        },
        {
            "name": "Data Visualization",
            "relevance_score": 0.7
        },
        {
            "name": "SQL",
            "relevance_score": 0.7
        },
        {
            "name": "Python",
            "relevance_score": 0.7
        },
        {
            "name": "Excel",
            "relevance_score": 0.6
        },
        {
            "name": "A/B Testing",
            "relevance_score": 0.6
        },
        {
            "name": "Learning Resources",
            "relevance_score": 0.5
        },
        {
            "name": "Skill Development",
            "relevan

# test 4o mini

In [25]:
# Same steps as the earlier `t3` cell: tag the document at index 2 with `Tags`.
# NOTE(review): presumably re-run after switching MODEL for the "4o mini"
# comparison — confirm. It overwrites t3, tags3, usage3 and json_data.
t3 = texts[2]
tags3, usage3 = extract_tags(t3, Tags)
#print(tags3)
# Second value returned by extract_tags (completion.usage).
print(usage3)
# print json
json_data = tags3.model_dump_json(indent=4)
print(json_data)

costs prompt: 2112, completion: 130
CompletionUsage(completion_tokens=130, prompt_tokens=2112, total_tokens=2242)
{
    "title": "Data Analytics Roadmap - Become a data analyst in 2024",
    "tags": [
        {
            "name": "Data Analysis",
            "relevance_score": null
        },
        {
            "name": "Career Development",
            "relevance_score": null
        },
        {
            "name": "Data Analyst Roadmap",
            "relevance_score": null
        },
        {
            "name": "Excel",
            "relevance_score": null
        },
        {
            "name": "Data Visualization",
            "relevance_score": null
        },
        {
            "name": "Power BI",
            "relevance_score": null
        },
        {
            "name": "Tableau",
            "relevance_score": null
        },
        {
            "name": "SQL",
            "relevance_score": null
        },
        {
            "name": "Python",
            "releva