## Query Construction

Query construction is the process of taking a natural-language question and converting it into the query language of the appropriate data source.
Ex:
- **Relational databases:** (Text to SQL) Natural language to SQL.
- **GraphDBs:** (Text to Cypher) Natural language to Cypher query language for GraphDBs.
- **VectorDBs:** (Self-query retriever) Automatically generate metadata filters from the query.

#### Example: 
Looking up videos that were published in a certain time period for a particular topic.


In [None]:
! pip -q install langchain_community tiktoken langchain-deepseek langchainhub chromadb langchain dotenv bs4 langchain-text-splitters langchain-ollama youtube_transcript_api pytube

In [None]:
# Setup

import os
from dotenv import load_dotenv
from langchain_ollama import OllamaEmbeddings
from langchain_deepseek import ChatDeepSeek

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()

# Enable LangSmith tracing only when an API key is actually available.
# os.environ accepts only str values, so copying a missing key (None) into it
# would raise TypeError; when the key is present it is already in the
# environment and needs no re-assignment.
if os.getenv('LANGCHAIN_API_KEY') is not None:
    os.environ['LANGCHAIN_TRACING_V2'] = 'true'
    os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
else:
    print('LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.')

# Model configuration

EMBEDDING_MODEL_NAME = "qwen3-embedding:0.6b"
DEEPSEEK_MODEL_NAME = 'deepseek-chat'

# Embedding model served through Ollama.
OLLAMA_EMBEDDING = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)
# Chat model; temperature=0 is passed explicitly and the API key is read from
# the DEEPSEEK_API_KEY environment variable.
DEEPSEEK_LLM = ChatDeepSeek(
    model=DEEPSEEK_MODEL_NAME,
    temperature=0,
    api_key=os.getenv('DEEPSEEK_API_KEY'),
)



In [None]:
from langchain_community.document_loaders import YoutubeLoader

# Build a loader for a single video, then load it into a list of documents.
# add_video_info=True presumably asks the loader to attach video details to
# each document's metadata -- confirm against the loader's documentation.
_youtube_loader = YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=U1XKa8NCAOk",
    add_video_info=True,
)
docs = _youtube_loader.load()

# Inspect the metadata of the first loaded document.
docs[0].metadata

In [None]:
from datetime import date
from typing import Literal, Optional, Tuple
from pydantic import BaseModel, Field

# Schema for a structured search query over tutorial videos.
#
# NOTE(review): this class is handed to `with_structured_output`, so the class
# docstring and the Field descriptions are presumably forwarded to the model as
# part of the schema. They are kept as plain prompt text on purpose; maintainer
# notes live in `#` comments instead.
class TutorialSearch(BaseModel):
    """
    Search over a database of tutorial videos about a software library
    """

    content_search: str = Field(
        ...,
        description="Similarity search query applied to video transcripts.",
    )

    # The two literals are concatenated, so the first must end with a space;
    # otherwise the description reads "...video titles.Should be succinct...".
    title_search: str = Field(
        ...,
        description=(
            "Alternate version of the content search query to apply to video titles. "
            "Should be succinct and only include key words that could be in a video title"
        )
    )

    # Named to match its description and its counterpart `max_view_count`
    # (previously misspelled as `min_video_count`).
    min_view_count: Optional[int] = Field(
        None,
        description="Minimum view count filter, inclusive. Only use if explicitly specified."
    )

    max_view_count: Optional[int] = Field(
        None,
        description="Maximum view count filter, exclusive. Only use if explicitly specified."
    )

    earliest_publish_date: Optional[date] = Field(
        None,
        description="Earliest publish date filter, inclusive. Only use if explicitly specified.",
    )

    latest_publish_date: Optional[date] = Field(
        None,
        description="Latest publish date filter, inclusive. Only use if explicitly specified.",
    )

    max_length_sec: Optional[int] = Field(
        None,
        description="Maximum video length in seconds, inclusive. Only use if explicitly specified.",
    )

    min_length_sec: Optional[int] = Field(
        None,
        description="Minimum video length in seconds, inclusive. Only use if explicitly specified.",
    )

    @property
    def min_video_count(self) -> Optional[int]:
        """Read-only alias for `min_view_count`, kept for the old misspelled name."""
        return self.min_view_count

    def pretty_print(self) -> None:
        """Print each field that is set and differs from its declared default.

        One ``name: value`` line is printed per field; fields whose value is
        ``None`` or equal to the field's default are skipped.
        """
        # Read field definitions from the class via `model_fields`; the
        # `__fields__` accessor is deprecated in Pydantic v2.
        for name, info in type(self).model_fields.items():
            value = getattr(self, name)
            if value is not None and value != info.default:
                print(f'{name}: {value}')

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the query analyzer. A trailing backslash inside the
# triple-quoted string joins that line with the next one, so the first three
# sentences form a single paragraph in the final prompt.
system="""
You are an expert at converting user questions into database queries. \
You have access to a database of tutorial videos about a software library for building LLM-powered applications. \
Given a question, return a database query optimized to retrieve the most relevant results.

If there are any acronyms or words you are not familiar with, do not try to rephrase them.
"""

# Two-message chat prompt: the fixed system instructions followed by the
# user's question, filled in through the `question` template variable.
prompt = ChatPromptTemplate.from_messages([
    ('system', system),
    ('human', '{question}')
])

# Bind the TutorialSearch schema to the chat model's structured output.
structured_llm = DEEPSEEK_LLM.with_structured_output(TutorialSearch)

# Pipe the prompt into the structured model; invoke with {'question': ...}.
query_analyzer = prompt | structured_llm

In [None]:
# Analyze a bare topic query that carries no explicit filters, then print the
# fields of the resulting structured query.
(
    query_analyzer
    .invoke(dict(question='rag from scratch'))
    .pretty_print()
)

In [None]:
# Analyze a question that mentions a year, a view count and a minimum length,
# then print the fields of the resulting structured query.
(
    query_analyzer
    .invoke(dict(
        question='videos on rag from scratch, in 2024, have a view count around 200, that are 30 minutes and above',
    ))
    .pretty_print()
)