In [1]:
import argparse
import os

import pandas as pd
import yaml
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from torch.utils.data import Dataset as TorchDataset
from tqdm import tqdm
from transformers import pipeline

from utils.data_utils import load_data, preprocess_data, create_qa_data
from utils.model_utils import initialize_model, create_vector_store, create_qa_chain


# Disable parallel processing: export TOKENIZERS_PARALLELISM="false" for this process
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})



  from .autonotebook import tqdm as notebook_tqdm


In [1]:
import site

# Environment diagnostic: print the site-packages directories reported by the interpreter
site_packages_dirs = site.getsitepackages()
print(site_packages_dirs)

['/usr/local/lib/python3.10/site-packages']


In [2]:
def arg_parser(args=None):
    """Parse the command-line options understood by this script.

    Args:
        args: Optional list of argument strings to parse. When ``None``
            (the default, and the original behaviour) argparse reads
            ``sys.argv[1:]``. Pass an explicit list, e.g.
            ``["--cfg-path", "./configs/jj.yaml"]``, when calling from a
            notebook, where ``sys.argv`` holds the kernel's own launch
            flags rather than options meant for this parser.

    Returns:
        argparse.Namespace with a single attribute ``cfg_path`` (``str``,
        or ``None`` when ``--cfg-path`` is not supplied).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--cfg-path", type=str, default=None)
    return parser.parse_args(args)


def load_config(cfg_path):
    """Read a YAML configuration file and return its parsed content.

    The file is opened explicitly as UTF-8 so that non-ASCII text in the
    config is decoded the same way on every platform, instead of relying
    on the locale-dependent default encoding.

    Args:
        cfg_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(cfg_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

In [10]:
cfg = load_config('./configs/jj.yaml')

# File locations taken from the "paths" section of the config
paths_cfg = cfg["paths"]
train_data_path = paths_cfg["train_data"]
test_data_path = paths_cfg["test_data"]
submission_path = paths_cfg["submission"]
output_path = paths_cfg["output"]

# Model identifiers taken from the "model" section of the config
model_cfg = cfg["model"]
model_name = model_cfg["model_name"]
model_path = model_cfg["model_path"]
embedding_model_name = model_cfg["embedding_model"]

# Remaining top-level settings
prompt_template = cfg["prompt_template"]
batch_size = cfg["settings"]["batch_size"]

# Load both splits, run the shared preprocessing on each, then derive the QA data
train_df, test_df = load_data(train_data_path, test_data_path)
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
train_data = create_qa_data(train_df, is_train=True)
test_data = create_qa_data(test_df, is_train=False)

# Show the configured batch size as a quick sanity check
print(batch_size)

1


In [4]:
# Load the tokenizer and model through the project helper `initialize_model`,
# using the model name and model path read from the config.
tokenizer, model = initialize_model(model_name, model_path)

Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.86s/it]


In [5]:
# Create the vector store from the training QA data, passing the configured
# embedding model name to the project helper `create_vector_store`.
vector_store = create_vector_store(train_data, embedding_model_name)

In [11]:
# Generate the RAG chain: the project helper `create_qa_chain` receives the
# vector store, model, tokenizer, prompt template and batch size.
qa_chain = create_qa_chain(vector_store, model, tokenizer, prompt_template, batch_size)

Device set to use cuda:0


In [12]:
class QADataSet(TorchDataset):
    """Map-style PyTorch dataset that exposes each row's question as a query.

    The base class is the PyTorch ``Dataset`` (imported as ``TorchDataset``),
    not the HuggingFace ``datasets.Dataset`` that the bare name ``Dataset``
    refers to in this notebook. This wrapper is handed to a ``DataLoader`` and
    only needs ``__len__`` / ``__getitem__``; subclassing the HuggingFace class
    without running its initializer would leave its inherited methods operating
    on an uninitialised object.

    Args:
        test_data: Indexable collection whose items support
            ``item['question']`` (here, the HuggingFace dataset built from
            the test data).
    """

    def __init__(self, test_data):
        self.test_data = test_data

    def __len__(self):
        """Return the number of rows in the wrapped data."""
        return len(self.test_data)

    def __getitem__(self, idx):
        """Return ``{"query": <question at idx>}``.

        Only the question is fetched here; no other processing happens.
        """
        return {"query": self.test_data[idx]["question"]}


In [13]:
# Batch processing: convert the test QA data into a HuggingFace dataset first
test_dataset = Dataset.from_pandas(test_data)

# Wrap it so each item is a {"query": ...} dict, then create the DataLoader
qa_dataset = QADataSet(test_dataset)
loader_options = {
    "batch_size": batch_size,
    "shuffle": False,  # keep the original row order
    "num_workers": 4,
}
qa_dataloader = DataLoader(qa_dataset, **loader_options)

In [16]:
print("테스트 실행 시작... 총 테스트 샘플 수:", len(test_data))

# Upper bound on the number of batches to run. 2 keeps the previous smoke-test
# behaviour (stop after the second batch); set to None to run every batch.
# Expected to be a positive integer or None.
MAX_BATCHES = 2

test_results = []
# Process the data batch by batch using the DataLoader
for batch_idx, batch in enumerate(tqdm(qa_dataloader, desc="Processing")):
    # One {"query": ...} input per question in the batch
    queries = [{"query": question} for question in batch["query"]]
    # Keep only the "result" field of each chain output
    test_results.extend(res["result"] for res in qa_chain.batch(queries))
    if MAX_BATCHES is not None and batch_idx + 1 >= MAX_BATCHES:
        break

테스트 실행 시작... 총 테스트 샘플 수: 964


Processing:   0%|          | 1/964 [00:41<10:58:19, 41.02s/it]
