- 환경 변수 가져오기

In [52]:
import os

from dotenv import load_dotenv

# Load the variables defined in a .env file into the process environment.
# The boolean returned by load_dotenv() is shown as the cell output.
load_dotenv()

True

In [53]:
# Read the Pinecone API key from the environment (populated by load_dotenv()).
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
# Never echo the raw secret: cell outputs are saved with the notebook and
# would leak the key to anyone who can read the file.
# Show only a short masked prefix, or None when the variable is not set.
f"{PINECONE_API_KEY[:5]}***" if PINECONE_API_KEY else None

'pcsk_***(redacted)***'

- Pinecone 클라이언트 초기화

In [54]:
from pinecone import Pinecone, ServerlessSpec
# Create the Pinecone client, authenticating with the API key read from the environment.
pine = Pinecone(api_key=PINECONE_API_KEY)
# Equivalent: pine = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))

- 인덱스 생성(서버리스)

In [55]:
# Name of the Pinecone index that is created and queried in the cells below.
index_name = 'wiki'

In [8]:
# Create the serverless index only when it does not exist yet, so that
# re-running this cell (e.g. Restart & Run All) does not fail on an
# already-existing index.
if not pine.has_index(index_name):
    pine.create_index(
        name=index_name,
        dimension=1536,  # must equal the output dimension of the embedding model
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )
# Display the index description as the cell output, whether the index was
# just created or already existed.
pine.describe_index(index_name)

{
    "name": "wiki",
    "metric": "cosine",
    "host": "wiki-dksb1qh.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 1536,
    "deletion_protection": "disabled",
    "tags": null
}

- 인덱스 연결 및 임베딩 객체 생성

In [56]:
# Get a handle to the index we are going to use
index = pine.Index(index_name)
# Display the index statistics as the cell output
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [57]:
from langchain_openai import OpenAIEmbeddings
# Embedding client that uses the 'text-embedding-3-small' model.
embedding = OpenAIEmbeddings(model='text-embedding-3-small')
# The embedding model's output dimension must be equal to the `dimension` the index was created with.

- 벡터DB 에 저장할 데이터셋 가져오기
  - 영어 위키백과(20231101 버전)의 약 600만 개 row 중 100개만 사용
  - 데이터의 크기가 크면 임베딩 시간이 오래 걸림.

In [58]:
from datasets import load_dataset
# pip install datasets
# dataset = load_dataset("wikimedia/wikipedia", "20231101.en", split="train[:100]")  -> downloads the entire 7GB
# Load one parquet file from disk instead.
# NOTE(review): the path is relative, so the file must be present in the
# notebook's working directory; the saved output of this cell shows a
# FileNotFoundError for exactly this path.
dataset = load_dataset("parquet", data_files=["train-03.parquet"])

FileNotFoundError: Unable to find 'C:/Class250615/llm_Workspace/D1112\train-03.parquet'

In [59]:
# iterable 한 Dataset 타입 리턴
print(dataset) # num_rows : 156289개

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 156289
    })
})


In [60]:
# Slicing the split gives a plain dict of columns, not a Dataset.
first_rows = dataset['train'][:100]
print(len(first_rows))  # 4 -> number of keys (columns), not rows
print(first_rows['text'][0])
# select() is a Dataset method; the result can be iterated row by row.
data = dataset["train"].select(range(100))
print(len(data))  # 100 -> number of rows

4
Niklas Hogner (born 29 September 1984 in Linköping, Sweden) is a Swedish figure skater. Until 2003, he competed as a singles skater, winning four Swedish junior national titles and competing at the World Junior Figure Skating Championships.

He switched to pair skating, teaming up with partner Angelika Pylkina in 2003. They were the first Swedish pairs team to compete internationally since 1962. They twice placed 5th at the World Junior Championships and won three bronze medals on the Junior Grand Prix circuit. They won the bronze medal at the 2006 Nebelhorn Trophy and won the Nordic Championships. They ended their partnership in 2007.

Programs 
(with Pylkina)

Results

Pair skating with Pylkina

Single skating

References

External links

 
 

1984 births
Living people
Sportspeople from Linköping
Swedish male single skaters
Swedish male pair skaters
100


In [61]:
# Row-first access: take row 0, then read its 'text' field.
print(data[0]['text'])

Niklas Hogner (born 29 September 1984 in Linköping, Sweden) is a Swedish figure skater. Until 2003, he competed as a singles skater, winning four Swedish junior national titles and competing at the World Junior Figure Skating Championships.

He switched to pair skating, teaming up with partner Angelika Pylkina in 2003. They were the first Swedish pairs team to compete internationally since 1962. They twice placed 5th at the World Junior Championships and won three bronze medals on the Junior Grand Prix circuit. They won the bronze medal at the 2006 Nebelhorn Trophy and won the Nordic Championships. They ended their partnership in 2007.

Programs 
(with Pylkina)

Results

Pair skating with Pylkina

Single skating

References

External links

 
 

1984 births
Living people
Sportspeople from Linköping
Swedish male single skaters
Swedish male pair skaters


- 청크
  - splitter 객체를 생성해서 문자열을 나누기

In [62]:
from langchain_text_splitters import  RecursiveCharacterTextSplitter
# Splitter used to break each article into chunks before embedding.
splitter = RecursiveCharacterTextSplitter(
  chunk_size = 400,  # size of each text chunk
  chunk_overlap=20, # overlap between consecutive chunks
  length_function=len,  # sizes are measured with len(), i.e. in characters
  separators = ['\n', ' '] # separators used when splitting the text
)

- 청킹 후 임베딩 -> 업서트(batch_size 크기만큼 수행)

In [63]:
from uuid import uuid4
import time

batch_size = 30  # number of chunks embedded and upserted per request
texts = []       # chunk texts waiting to be embedded
metas = []       # metadata records, aligned one-to-one with `texts`
count = 0        # total number of chunks processed so far


def _upsert_batch(batch_texts, batch_metas, namespace="wiki-ns1"):
    """Embed a batch of chunks and upsert them into the Pinecone index.

    Args:
        batch_texts: chunk strings to embed.
        batch_metas: metadata dicts, one per entry of ``batch_texts``.
        namespace: Pinecone namespace that receives the vectors.

    Does nothing when ``batch_texts`` is empty.
    """
    if not batch_texts:
        return
    # One random UUID per chunk is used as the record id.
    # NOTE(review): because the ids are random, re-running this cell inserts
    # the same chunks again under new ids.
    ids = [str(uuid4()) for _ in batch_texts]
    embeddings = embedding.embed_documents(batch_texts)
    index.upsert(
        vectors=list(zip(ids, embeddings, batch_metas)),
        namespace=namespace)


# Iterate over every sample of the dataset.  features: ['id', 'url', 'title', 'text']
for sample in data:
    full_text = sample["text"]  # Wikipedia article text -> chunked, then embedded
    metadata = {  # keys are chosen freely; they identify which article a chunk belongs to
        'wiki_id': str(sample["id"]),  # Wikipedia article id
        'url': sample['url'],
        'title': sample["title"],  # Wikipedia article title
    }

    chunks = splitter.split_text(full_text)  # split the article into chunks
    print(len(chunks))  # how many chunks this article was split into

    # A dedicated index name avoids shadowing any outer loop variable.
    for chunk_id, chunk in enumerate(chunks):
        # Metadata record that is upserted together with the chunk's vector
        record = {
            'chunk_id': chunk_id,  # position of the chunk inside its article
            'head_chunk': chunk[:200],  # first 200 characters of the chunk (not the full text)
            **metadata,  # unpack the article-level metadata
        }

        texts.append(chunk)
        metas.append(record)

        count += 1

        # Every batch_size chunks: embed -> upsert, then start a new batch.
        if count % batch_size == 0:
            _upsert_batch(texts, metas)
            texts = []
            metas = []
            # Wait one second between batches.
            time.sleep(1)

# Flush the final partial batch; without this the last (count % batch_size)
# chunks would never be embedded or stored.
if texts:
    _upsert_batch(texts, metas)
    texts = []
    metas = []

4
12
8
8
4
1
4
13
1
6
7
14
6
2
3
13
8
1
9
21
22
6
11
9
7
19
4
4
13
12
42
8
14
5
35
5
6
8
7
3
2
3
2
7
2
7
21
9
18
3
15
4
9
3
6
1
11
4
9
2
81
6
2
34
6
6
2
6
18
14
73
14
14
10
2
34
7
17
7
3
7
8
4
7
5
10
23
6
3
40
6
10
10
16
3
12
11
23
9
6
