In [1]:
# Load environment variables (such as API keys) from a local .env file.
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
# NOTE: this `logging` is langchain_teddynote's module, not the standard-library one.
from langchain_teddynote import logging

# Start LangSmith tracing for this notebook under the "CH03-OutputParser" project.
logging.langsmith("CH03-OutputParser")

LangSmith 추적을 시작합니다.
[프로젝트명]
CH03-OutputParser


# PandasDataFrameOutputParser

1. 출력 형식 정의: 
    - AI 모델의 텍스트 응답을 DataFrame 형태로 변환하여 처리  
    - 테이블 형식의 데이터를 다루는 작업에 적합
2. 구조화된 출력: 
    - AI 모델 출력이 행과 열로 구성된 형식일 경우 
    - 결과 출력 형태 기반으로 추가 분석 및 시각화 작업이 바로 가능
3. 사용 편의성:
    - 모델 응답에서 데이터 프레임으로의 변환 과정 최소화 
    - Pandas의 데이터 처리 기능과 결합

(참고) 데이터 구조화를 담당하는 파서이므로, 시각화까지 직접 수행하지는 못함 
    

[Reference] https://python.langchain.com/v0.1/docs/modules/model_io/output_parsers/types/pandas_dataframe/

In [6]:
import pprint
from typing import Any, Dict
import pandas as pd
import seaborn as sns

from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PandasDataFrameOutputParser

In [7]:
# Load the Titanic sample dataset through seaborn and preview the first rows.
df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Define the chat model: gpt-3.5-turbo with temperature set to 0.
model = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")

In [5]:
# Output formatting helper
def format_parser_output(parser_output: Dict[str, Any]) -> None:
    """Pretty-print a parser result, rendering pandas objects as plain dicts.

    Args:
        parser_output: Mapping from an operation or row label to its result,
            e.g. ``{"describe": <Series>}`` or ``{"mean": 25.7}``.

    Returns:
        None. The formatted text is written to standard output.
    """
    # Build a separate dict for printing so the caller's result is not
    # overwritten; the original pandas objects stay usable afterwards.
    # Scalar results (such as the float of a "mean" query) have no to_dict()
    # and are printed unchanged instead of raising AttributeError.
    printable = {
        key: value.to_dict() if hasattr(value, "to_dict") else value
        for key, value in parser_output.items()
    }
    pprint.PrettyPrinter(width=4, compact=True).pprint(printable)

In [8]:
# Build the output parser around the Titanic DataFrame and print the format
# instructions it generates for the model.
output_parser = PandasDataFrameOutputParser(dataframe=df)
print(output_parser.get_format_instructions())

The output should be formatted as a string as the operation, followed by a colon, followed by the column or row to be queried on, followed by optional array parameters.
1. The column names are limited to the possible columns below.
2. Arrays must either be a comma-separated list of numbers formatted as [1,3,5], or it must be in range of numbers formatted as [0..4].
3. Remember that arrays are optional and not necessarily required.
4. If the column is not in the possible columns or the operation is not a valid Pandas DataFrame operation, return why it is invalid as a sentence starting with either "Invalid column" or "Invalid operation".

As an example, for the formats:
1. String "column:num_legs" is a well-formatted instance which gets the column num_legs, where num_legs is a possible column.
2. String "row:1" is a well-formatted instance which gets row 1.
3. String "column:num_legs[1,2]" is a well-formatted instance which gets the column num_legs for rows 1 and 2, where num_legs is a p

In [None]:
# Prompt template: the parser's format instructions are bound up front as a
# partial variable, so only `question` has to be supplied at invoke time.
prompt = PromptTemplate(
    input_variables=["question"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
    template="Answer the user query.\n{format_instructions}\n{question}\n",
)

# Chain: prompt -> model -> output parser.
chain = prompt | model | output_parser

## 기초통계량 확인

In [18]:
# Query (sent to the model in Korean): summary statistics of the Pclass column.
question = "Pclass 컬럼의 기초 통계량을 알려주세요."


# Run the chain with the question and print the parsed result.
answer = chain.invoke({"question": question})
print(answer)

{'describe': count    891.000000
mean       2.308642
std        0.836071
min        1.000000
25%        2.000000
50%        3.000000
75%        3.000000
max        3.000000
Name: pclass, dtype: float64}


## 데이터 조회

In [None]:
# Query (sent to the model in Korean): look up the 10th row.
# NOTE: the recorded output shows the row labelled 10, i.e. the same row that
# the 0-based df.iloc[10] returns in the next cell.
question = "10번째 행(row)를 조회해 주세요"

# Run the chain with the question and print the parsed result.
answer = chain.invoke({"question": question})
print(answer)

{'10': survived                 1
pclass                   3
sex                 female
age                    4.0
sibsp                    1
parch                    1
fare                  16.7
embarked                 S
class                Third
who                  child
adult_male           False
deck                     G
embark_town    Southampton
alive                  yes
alone                False
Name: 10, dtype: object}


In [21]:
# Cross-check: read the row at position 10 directly from the DataFrame to
# compare it with the parser result.
df.iloc[10]

survived                 1
pclass                   3
sex                 female
age                    4.0
sibsp                    1
parch                    1
fare                  16.7
embarked                 S
class                Third
who                  child
adult_male           False
deck                     G
embark_town    Southampton
alive                  yes
alone                False
Name: 10, dtype: object

## 조건 추가

In [25]:
# Query (sent to the model in Korean): mean of the age column over rows 0 through 10.
question = "age 컬럼의 0번째 부터 10번째 행(row)의 평균을 구해 주세요."

# Run the chain with the question and print the parsed result.
answer = chain.invoke({"question": question})
print(answer)

{'mean': 25.7}


In [29]:
# Cross-check with pandas directly: mean of `age` over positions 0-10
# (the slice end is exclusive, so :11 covers eleven rows).
df["age"].iloc[:11].mean()

25.7

-----
** End of Documents **