https://kerneld.tistory.com/32
위 문서를 참고해서 ColBERT 콘다 환경을 셋팅하자.

In [1]:
#!pip install openai

In [1]:
# Toggle: store the trained model on / load it from the Hugging Face Hub.
is_use_hf_store_load_trained_model = True

if is_use_hf_store_load_trained_model:
    # Repository name and its fully qualified "<owner>/<name>" id.
    hf_repo_id = 'sentence-transformer-klue-temp'
    hf_full_repo_id = "Kerneld/" + hf_repo_id

In [None]:
import os
from dotenv import load_dotenv
# Load environment variables via python-dotenv; HF_TOKEN is read later with os.getenv.
load_dotenv()

In [3]:
import json

# The JSON Lines inputs are decoded as UTF-8 explicitly: the documents are
# Korean text and the platform default encoding is not guaranteed to be UTF-8.
# Blank lines (e.g. a trailing newline at end of file) are skipped instead of
# being handed to json.loads, which would raise on them.
with open("../../data/chunked_documents_300.jsonl", encoding="utf-8") as f:
    chunks = [json.loads(line) for line in f if line.strip()]

# questions_from_contents.jsonl is generated by running Generate_questions_from_content.ipynb.
#with open("questions_from_contents.jsonl") as f:       # 10 questions per document
#with open("questions_from_contents_20.jsonl") as f:     # 20 questions per document
# NOTE(review): the original comment here said "20 questions per document", but
# the file name suggests 5 questions per chunk — TODO confirm.
with open("questions_from_chunks_5.jsonl", encoding="utf-8") as f:
    qfcs = [json.loads(line) for line in f if line.strip()]

In [4]:
# Strip line-break characters from every chunk's text (prevents errors during training).
_LINE_BREAKS = str.maketrans("", "", "\n\r")
for chunk in chunks:
    chunk['content'] = chunk['content'].translate(_LINE_BREAKS)

In [None]:
# Sanity check: show the first loaded chunk record and the first question record.
print(chunks[0])
print(qfcs[0])

In [None]:
# Plain list of chunk texts, in the same order as `chunks`.
chunks_only_content = [doc['content'] for doc in chunks]
chunks_only_content[0]  # shown as the cell output

In [None]:
# Plain list of question texts, in the same order as `qfcs`.
qfcs_only_question = [record['question'] for record in qfcs]
qfcs_only_question[0]  # shown as the cell output

In [8]:
# 필요한 라이브러리 import

import colbert
from colbert import Indexer, Searcher
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection

## 1. Indexing

아래 코드는 `chunks_only_content` 전체(모든 청크)를 색인한다.

In [9]:
# Basic constants used by the indexing and search cells.

experiment = 'ir_contest'
doc_maxlen = 300  # truncate passages at 300 tokens
nbits = 2  # encode each dimension with 2 bits

한국어 데이터에 대한 색인/검색

In [None]:
# Checkpoint used to build the baseline index; alternative checkpoints are kept commented out.
#checkpoint = 'colbert-ir/colbertv2.0'
checkpoint = 'hunkim/sentence-transformer-klue'
#checkpoint = "snunlp/KR-SBERT-V40K-klueNLI-augSTS"
#checkpoint = "google-bert/bert-base-multilingual-cased"
index_name = 'science_common_sense'

# nranks specifies the number of GPUs to use.
with Run().context(RunConfig(nranks=1, experiment=experiment)):
    # kmeans_niters specifies the number of iterations of k-means clustering;
    # 4 is a good and fast default. Consider larger numbers for small datasets.
    config = ColBERTConfig(
        doc_maxlen=doc_maxlen,
        nbits=nbits,
        kmeans_niters=4,
    )
    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(name=index_name, collection=chunks_only_content, overwrite=True)

In [11]:
def search_docs(query, experiment, index_name, collection, k):
    """Search a ColBERT index and return the top-k passages for a query.

    Each retrieved passage is also printed (rank, score, passage text).

    Args:
        query: Query text passed to ``Searcher.search``.
        experiment: Experiment name used for the ``RunConfig``.
        index_name: Name of the index the ``Searcher`` loads.
        collection: Passage collection handed to the ``Searcher``.
        k: Number of passages to retrieve.

    Returns:
        A list with one dict per retrieved passage, in the order the searcher
        returned them, with keys ``'score'``, ``'passage_id'`` and ``'content'``.
    """
    with Run().context(RunConfig(experiment=experiment)):
        searcher = Searcher(index=index_name, collection=collection)

    # Retrieve the top-k passages for this query.
    hits = searcher.search(query, k)

    found = []
    for pid, rank, score in zip(*hits):
        text = searcher.collection[pid]
        print(f"\t [{rank}] \t\t {score:.1f} \t\t {text}")
        found.append({'score': score, 'passage_id': pid, 'content': text})

    return found

In [None]:
# Example query (Korean): "What is a method for investigating the classification of trees?"
ret = search_docs(
    query='나무의 분류에 대해 조사해 보기 위한 방법은?',
    experiment=experiment,
    index_name=index_name,
    collection=chunks_only_content,
    k=3,
)
print(ret)

## 3. Training TODO

LLM을 활용한 생성데이터로 ColBERT를 학습한 후 더 나은 성능의 모델을 활용하는 방법 실습

In [None]:
import random

# Build "query idx, positive idx, negative idx" training triples: for every
# question, the chunk referenced by its 'chunkOffset' is the relevant passage
# and a randomly drawn chunk index is the irrelevant one.

# Chunks within this many positions of the relevant chunk are never used as negatives.
NEGATIVE_EXCLUSION_WINDOW = 5

triples_data = []
max_c_idx = len(chunks_only_content) - 1

for q_idx, (question, qfc) in enumerate(zip(qfcs_only_question, qfcs)):
    # Index of the relevant chunk.
    c_idx = qfc['chunkOffset']
    lo = c_idx - NEGATIVE_EXCLUSION_WINDOW
    hi = c_idx + NEGATIVE_EXCLUSION_WINDOW

    # When the exclusion window covers every valid chunk index, the rejection
    # sampling below could never succeed; fail loudly instead of looping forever.
    if lo <= 0 and hi >= max_c_idx:
        raise ValueError(
            f"No negative chunk available for question {q_idx}: the collection has "
            f"{max_c_idx + 1} chunks, all within {NEGATIVE_EXCLUSION_WINDOW} of chunk {c_idx}."
        )

    # Pick the index of an irrelevant chunk: redraw until it falls outside the window.
    mc_idx = random.randint(0, max_c_idx)
    while lo <= mc_idx <= hi:
        mc_idx = random.randint(0, max_c_idx)

    triples_data.append(f'{q_idx}, {c_idx}, {mc_idx}')
    print(f"({q_idx}, {c_idx}, {mc_idx}) question: {question}, relevance doc: {chunks_only_content[c_idx]}, not relevance doc: {chunks_only_content[mc_idx]}")

print(triples_data)

In [14]:
# Save the training data to files for ColBERT training; these paths are passed
# to the Trainer in the training cell.
collection_file = 'collection.tsv'
query_file = 'query.tsv'
triples_file = 'triples'

# Characters that would break the one-record-per-line, tab-separated layout.
_TSV_UNSAFE = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})


def _tsv_field(text):
    """Return *text* as a single TSV field: tabs and line breaks become spaces."""
    return str(text).translate(_TSV_UNSAFE)


# NOTE(review): the files are written with the platform default encoding, as
# before; confirm it matches what the trainer uses when reading them.

# "<passage id>\t<passage text>" per line; the id is the position in chunks_only_content.
with open(collection_file, 'w') as f:
    f.writelines(f'{i}\t{_tsv_field(item)}\n' for i, item in enumerate(chunks_only_content))

# "<query id>\t<question text>" per line; the id is the position in qfcs_only_question.
with open(query_file, 'w') as f:
    f.writelines(f'{i}\t{_tsv_field(item)}\n' for i, item in enumerate(qfcs_only_question))

# One bracketed "[query idx, positive idx, negative idx]" triple per line.
with open(triples_file, 'w') as f:
    f.writelines(f'[{item}]\n' for item in triples_data)

### 3.2 새로 만든 데이터로 모델 학습

In [None]:
from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert import Trainer

# Train ColBERT on the generated triples, queries and collection files.
if __name__=='__main__':
    with Run().context(RunConfig(nranks=1, experiment="training")):

        config = ColBERTConfig(
            bsize=24,
            root="./experiments",
        )

        trainer = Trainer(
            triples=triples_file,
            queries=query_file,
            collection=collection_file,
            config=config,
        )

        # Use a Korean pretrained model as the starting checkpoint.
        checkpoint_path = trainer.train(checkpoint='hunkim/sentence-transformer-klue')
        #checkpoint_path = trainer.train(checkpoint="snunlp/KR-SBERT-V40K-klueNLI-augSTS")
        #checkpoint_path = trainer.train(checkpoint="google-bert/bert-base-multilingual-cased")
        #checkpoint_path = trainer.train()

        # NOTE(review): in upstream ColBERT, Trainer.train() appears to store the
        # checkpoint path on the trainer instead of returning it — confirm for the
        # installed version. Fall back to best_checkpoint_path() so the real path
        # is reported rather than None.
        if checkpoint_path is None:
            checkpoint_path = trainer.best_checkpoint_path()

        print(f"Saved checkpoint to {checkpoint_path}...")

In [None]:
# Location of the trained model: list paths named "colbert" under experiments/training.
!find experiments/training -name colbert

학습된 모델을 허깅페이스에 업로드

In [None]:
from huggingface_hub import HfApi, login

if is_use_hf_store_load_trained_model:
    # Local checkpoint directory of the trained model.
    model_save_path = 'experiments/training/none/2024-10/22/10.44.21/checkpoints/colbert'

    # Authenticate with the token taken from the HF_TOKEN environment variable.
    login(token=os.getenv('HF_TOKEN'))

    api = HfApi()
    #api.create_repo(repo_id=hf_repo_id)

    # Upload the checkpoint folder to the model repository.
    api.upload_folder(repo_id=hf_full_repo_id, repo_type="model", folder_path=model_save_path)

학습된 모델로 다시 색인 및 검색

In [None]:
# Point `checkpoint` at the trained model: the Hugging Face repo when that
# option is enabled, otherwise the local checkpoint directory of the training run.
if is_use_hf_store_load_trained_model:
    checkpoint = hf_full_repo_id
else:
    checkpoint = 'experiments/training/none/2024-10/22/10.44.21/checkpoints/colbert'

experiment = 'after_trained'
index_name = 'science_common_sense'

# nranks specifies the number of GPUs to use.
with Run().context(RunConfig(nranks=1, experiment=experiment)):
    # doc_maxlen is passed explicitly so this index truncates passages at the
    # same length as the index built with the untrained checkpoint; without it
    # the two indexes are not built under the same settings.
    # kmeans_niters specifies the number of iterations of k-means clustering;
    # 4 is a good and fast default. Consider larger numbers for small datasets.
    config = ColBERTConfig(doc_maxlen=doc_maxlen, nbits=nbits, kmeans_niters=4)
    indexer = Indexer(checkpoint=checkpoint, config=config)
    indexer.index(name=index_name, collection=chunks_only_content, overwrite=True)

In [None]:
# Same example query as before training (Korean): "What is a method for investigating the classification of trees?"
ret = search_docs(
    query='나무의 분류에 대해 조사해 보기 위한 방법은?',
    experiment=experiment,
    index_name=index_name,
    collection=chunks_only_content,
    k=3,
)
print(ret)