# csv Loader

In [2]:
!pip install -qU langchain



In [3]:
from langchain_community.document_loaders.csv_loader import CSVLoader

# CSV 로더 생성
loader = CSVLoader(file_path=r"C:\Users\eys63\Desktop\기타활동\2024\겨울방학\24Winter_Vacation\data\titanic.csv")

# 데이터 로드
docs = loader.load()

print(len(docs))
print(docs[0].metadata)


891
{'source': 'C:\\Users\\eys63\\Desktop\\기타활동\\2024\\겨울방학\\24Winter_Vacation\\data\\titanic.csv', 'row': 0}


In [4]:
print(docs[0].page_content)  

PassengerId: 1
Survived: 0
Pclass: 3
Name: Braund, Mr. Owen Harris
Sex: male
Age: 22
SibSp: 1
Parch: 0
Ticket: A/5 21171
Fare: 7.25
Cabin: 
Embarked: S


In [10]:
# 컬럼정보:
# PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked

# CSV 파일 경로
loader = CSVLoader(
    file_path=r"C:\Users\eys63\Desktop\기타활동\2024\겨울방학\24Winter_Vacation\data\titanic.csv",
    csv_args={
        "delimiter": ",",  # 구분자, 탭으로 나누어진 파일의 경우 '\t'
        "quotechar": '"',  # 인용 부호 문자
        "fieldnames": [
            "Passenger ID",
            "Survival (1: Survived, 0: Died)",
            "Passenger Class",
            "Name",
            "Sex",
            "Age",
            "Number of Siblings/Spouses Aboard",
            "Number of Parents/Children Aboard",
            "Ticket Number",
            "Fare",
            "Cabin",
            "Port of Embarkation",
        ],  # 필드 이름
    },
)

# 데이터 로드
docs = loader.load()

# 데이터 출력
print(docs[1].page_content)

Passenger ID: 1
Survival (1: Survived, 0: Died): 0
Passenger Class: 3
Name: Braund, Mr. Owen Harris
Sex: male
Age: 22
Number of Siblings/Spouses Aboard: 1
Number of Parents/Children Aboard: 0
Ticket Number: A/5 21171
Fare: 7.25
Cabin: 
Port of Embarkation: S


## Contents, Metadata

In [9]:
docs[0].metadata

{'source': 'C:\\Users\\eys63\\Desktop\\기타활동\\2024\\겨울방학\\24Winter_Vacation\\data\\titanic.csv',
 'row': 0}

In [8]:
docs[0].page_content

'PassengerId: 1\nSurvived: 0\nPclass: 3\nName: Braund, Mr. Owen Harris\nSex: male\nAge: 22\nSibSp: 1\nParch: 0\nTicket: A/5 21171\nFare: 7.25\nCabin: \nEmbarked: S'

## Export as XML

In [14]:
row = docs[0].page_content.split("\n")
row_str = "<row>"
for element in row:
    splitted_element = element.split(":")
    value = splitted_element[-1]
    col = ":".join(splitted_element[:-1])
    row_str += f"<{col}>{value.strip()}<{col}>"
row_str += "</row>"
print(row_str)

<row><Passenger ID>PassengerId<Passenger ID><Survival (1: Survived, 0: Died)>Survived<Survival (1: Survived, 0: Died)><Passenger Class>Pclass<Passenger Class><Name>Name<Name><Sex>Sex<Sex><Age>Age<Age><Number of Siblings/Spouses Aboard>SibSp<Number of Siblings/Spouses Aboard><Number of Parents/Children Aboard>Parch<Number of Parents/Children Aboard><Ticket Number>Ticket<Ticket Number><Fare>Fare<Fare><Cabin>Cabin<Cabin><Port of Embarkation>Embarked<Port of Embarkation></row>


## UnstructuredCSVLoader

In [16]:
!pip install -qU unstructured

  You can safely remove it manually.


In [20]:
from langchain_community.document_loaders.csv_loader import UnstructuredCSVLoader

# 비구조화 CSV 로더 인스턴스 생성
loader = UnstructuredCSVLoader(file_path=r"C:\Users\eys63\Desktop\기타활동\2024\겨울방학\24Winter_Vacation\data\titanic.csv", mode="elements")

# 문서 로드
docs = loader.load()

# 첫 번째 문서의 HTML 텍스트 메타데이터 출력
print(docs[0].metadata["text_as_html"][:1000])


<table><tr><td>PassengerId</td><td>Survived</td><td>Pclass</td><td>Name</td><td>Sex</td><td>Age</td><td>SibSp</td><td>Parch</td><td>Ticket</td><td>Fare</td><td>Cabin</td><td>Embarked</td></tr><tr><td>1</td><td>0</td><td>3</td><td>Braund, Mr. Owen Harris</td><td>male</td><td>22</td><td>1</td><td>0</td><td>A/5 21171</td><td>7.25</td><td/><td>S</td></tr><tr><td>2</td><td>1</td><td>1</td><td>Cumings, Mrs. John Bradley (Florence Briggs Thayer)</td><td>female</td><td>38</td><td>1</td><td>0</td><td>PC 17599</td><td>71.2833</td><td>C85</td><td>C</td></tr><tr><td>3</td><td>1</td><td>3</td><td>Heikkinen, Miss. Laina</td><td>female</td><td>26</td><td>0</td><td>0</td><td>STON/O2. 3101282</td><td>7.925</td><td/><td>S</td></tr><tr><td>4</td><td>1</td><td>1</td><td>Futrelle, Mrs. Jacques Heath (Lily May Peel)</td><td>female</td><td>35</td><td>1</td><td>0</td><td>113803</td><td>53.1</td><td>C123</td><td>S</td></tr><tr><td>5</td><td>0</td><td>3</td><td>Allen, Mr. William Henry</td><td>male</td><td>35</

## Dataframe Loader

In [22]:
import pandas as pd

# CSV 파일 읽기
df = pd.read_csv(r"C:\Users\eys63\Desktop\기타활동\2024\겨울방학\24Winter_Vacation\data\titanic.csv")

# 데이터프레임의 처음 다섯 행 조회
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
from langchain_community.document_loaders import DataFrameLoader

# 데이터 프레임 로더 설정, 페이지 내용 컬럼 지정
loader = DataFrameLoader(df, page_content_column="Sex") # 칼럼들 수정하면서 확인해보기

# 문서 로드
docs = loader.load()

# 데이터 출력
print(docs[0].page_content)

# 메타데이터 출력
print(docs[0].metadata)


male
{'PassengerId': 1, 'Survived': 0, 'Pclass': 3, 'Name': 'Braund, Mr. Owen Harris', 'Age': 22.0, 'SibSp': 1, 'Parch': 0, 'Ticket': 'A/5 21171', 'Fare': 7.25, 'Cabin': nan, 'Embarked': 'S'}


In [24]:
# 큰 테이블에 대한 지연 로딩, 전체 테이블을 메모리에 로드하지 않음
for row in loader.lazy_load():
    print(row)
    break  # 첫 행만 출력


page_content='Braund, Mr. Owen Harris' metadata={'PassengerId': 1, 'Survived': 0, 'Pclass': 3, 'Sex': 'male', 'Age': 22.0, 'SibSp': 1, 'Parch': 0, 'Ticket': 'A/5 21171', 'Fare': 7.25, 'Cabin': nan, 'Embarked': 'S'}
