# RAG 기초

## Environment (.env)

In [None]:
# Load the .env file.
from dotenv import load_dotenv
load_dotenv()

## Evaluation Dataset Generation

#### 1) (pydantic) Schema

In [None]:
from typing import Optional
from langchain_core.pydantic_v1 import BaseModel, Field

class Car(BaseModel):
    """Information about a car."""

    # Every attribute is optional and defaults to None, so a record may leave
    # any of them unset. The declaration order of the fields is kept as-is.
    make: Optional[str] = Field(
        description="The make of the car",
        default=None,
    )
    model_name: Optional[str] = Field(
        description="The model name of the car",
        default=None,
    )
    model_year: Optional[int] = Field(
        description="The year the car model was manufactured",
        default=None,
    )
    color: Optional[str] = Field(
        description="The color of the car",
        default=None,
    )
    price: Optional[float] = Field(
        description="The price of the car",
        default=None,
    )
    mileage: Optional[float] = Field(
        description="The mileage of the car",
        default=None,
    )


#### 2) Synthetic Data

https://python.langchain.com/v0.2/docs/tutorials/data_generation/

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI


# Two hand-written rows used as few-shot examples; the second one leaves
# `color` as None to show what a missing value looks like.
examples = [
    {"example": row}
    for row in (
        "make: 현대, model_name: 소나타, model_year: 2022, color: 흰색, price: 25000000, mileage: 15000.0",
        "make: 기아, model_name: K5, model_year: 2021, color: None, price: 23000000, mileage: 20000.0",
    )
]

# Each example is rendered through this single-variable template.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Few-shot prompt: the imported prefix/suffix wrapped around the examples,
# with `subject` and `extra` left as inputs to be filled in by generate().
prompt_template = FewShotPromptTemplate(
    example_prompt=OPENAI_TEMPLATE,
    examples=examples,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

# Generator whose output records follow the Car schema.
synthetic_data_generator = create_openai_data_generator(
    llm=ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.7),
    output_schema=Car,
    prompt=prompt_template,
)

# Request 10 runs; `extra` carries the free-form generation instructions.
synthetic_results = synthetic_data_generator.generate(
    runs=10,
    subject="car data",
    extra=(
        "Use Korean language. "
        "Make it something you wouldn't normally choose. "
        "Around 30 percent of the values should be None at random. "
    ),
)

len(synthetic_results)

In [None]:
# Display the prefix text used as `prefix` of the few-shot prompt.
SYNTHETIC_FEW_SHOT_PREFIX

In [None]:
# Display the suffix text used as `suffix` of the few-shot prompt.
SYNTHETIC_FEW_SHOT_SUFFIX

In [None]:
# Display the records returned by synthetic_data_generator.generate().
synthetic_results

In [None]:
import pandas as pd

# Convert every synthetic record into a plain dict (one dict per table row).
car_dicts = [record.dict() for record in synthetic_results]

# Tabulate the rows, write them to car_data.csv without the index column,
# and leave the DataFrame as the cell's displayed value.
df = pd.DataFrame(data=car_dicts)
df.to_csv("car_data.csv", index=False)
df

In [None]:
from langchain_experimental.synthetic_data import DatasetGenerator

# Dataset generator backed by an OpenAI chat model. The dict is passed to
# DatasetGenerator as its second positional argument, unchanged.
model = ChatOpenAI(temperature=0.7, model="gpt-3.5-turbo-0125")
generator = DatasetGenerator(
    model,
    {
        "style": "informal",
        "minimal length": 300,
        "language": "Korean",
    },
)

# Run the generator over all synthetic records.
dataset = generator(synthetic_results)

len(dataset)

In [None]:
# Display the first generated sample (later cells read its 'text' and 'fields' keys).
dataset[0]

In [None]:
from langchain_community.chat_models import ChatOllama

# Same dataset generation as with the OpenAI model, but driven by a local
# Ollama chat model; the preferences dict is identical.
model2 = ChatOllama(temperature=0.7, model='qwen2')
generator2 = DatasetGenerator(
    model2,
    {
        "style": "informal",
        "minimal length": 300,
        "language": "Korean",
    },
)

# Run the generator over all synthetic records.
dataset2 = generator2(synthetic_results)

len(dataset2)

In [None]:
# Display the first sample produced by the Ollama-backed generator.
dataset2[0]

## Extraction 

#### 1) Prompt

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Two-message chat prompt: a fixed system instruction, then the human message
# that is filled in through the {text} placeholder.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert extraction algorithm. Only extract relevant "
            "information from the text. If you do not know the value of an "
            "attribute asked to extract, return null for the attribute's value.",
        ),
        ("human", "{text}"),
    ]
)

#### 2) OpenAI

In [None]:
from langchain_openai import ChatOpenAI

# OpenAI chat model used for extraction, with temperature set to 0.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Feed the prompt into the model, with output structured according to Car.
extract_chain = prompt.pipe(llm.with_structured_output(schema=Car))

# Run the chain on the 'text' of the first generated sample.
extract_result = extract_chain.invoke({"text": dataset[0]["text"]})

extract_result

## Evaluation

In [None]:
# Check whether the extraction equals the 'fields' entry of the first sample
# (presumably the source record the text was generated from -- confirm).
extract_result == dataset[0]['fields']

In [None]:
# Repeat the extraction on the 'text' of the second sample; this overwrites
# the previous extract_result.
extract_result = extract_chain.invoke({"text": dataset[1]['text']})

extract_result

In [None]:
# Check whether the extraction equals the 'fields' entry of the second sample
# (presumably the source record the text was generated from -- confirm).
extract_result == dataset[1]['fields']

## Ollama

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain_experimental.llms.ollama_functions import OllamaFunctions

# NOTE: this cell rebinds `prompt`, `llm` and `extract_chain`, replacing the
# objects of the same names created in the OpenAI section.

# Plain-text prompt with header/end-of-turn markers written out by hand; the
# input text is substituted through the {text} placeholder.
prompt = PromptTemplate.from_template(
    """<|start_header_id|>system<|end_header_id|>
You are an expert extraction algorithm. Only extract relevant information from the text.
If you do not know the value of an attribute asked to extract, return null for the attribute's value.
<|eot_id|><|start_header_id|>user<|end_header_id|>

TEXT: {text}
JSON:
<|eot_id|>
<|start_header_id|>assistant<|end_header_id|>
"""
)

# Ollama-served model configured with format="json".
llm = OllamaFunctions(format="json", model="llama3")

# Feed the prompt into the model, with output structured according to Car.
extract_chain = prompt.pipe(llm.with_structured_output(schema=Car))

# Run the chain on the 'text' of the first generated sample.
extract_result = extract_chain.invoke({"text": dataset[0]["text"]})

extract_result

In [None]:
# Check whether the current extract_result equals the 'fields' entry of the
# first sample (presumably the source record -- confirm).
extract_result == dataset[0]['fields']

In [None]:
# Display the 'fields' entry of the first sample for manual comparison.
dataset[0]['fields']