In [None]:
import os
from dotenv import load_dotenv
# Read a local .env file (if present) into the process environment so that
# later os.getenv(...) lookups, e.g. for HF_TOKEN, can find their values.
load_dotenv()

In [2]:
import json

# Document corpus: one JSON object per line.
# The encoding is given explicitly because the files contain Korean text and
# the platform default encoding is not UTF-8 everywhere (e.g. on Windows).
with open("../../data/documents.jsonl", encoding="utf-8") as f:
    docs = [json.loads(line) for line in f]

# questions_from_contents.jsonl is created by running Generate_questions_from_content.ipynb.
with open("questions_from_contents.jsonl", encoding="utf-8") as f:
    qfcs = [json.loads(line) for line in f]

In [None]:
# Display the first document record to inspect its fields.
docs[0]

In [None]:
# Display the first generated-question record to inspect its fields.
qfcs[0]

In [None]:
from sentence_transformers import InputExample
import random

# Fixed seed so the sampled negatives (and therefore the train/eval sets) are
# the same on every "Restart & Run All". A private Random instance is used so
# the global `random` state is not modified.
NEGATIVE_SAMPLING_SEED = 42
rng = random.Random(NEGATIVE_SAMPLING_SEED)

# A negative example needs at least one document other than the positive one;
# without this check the rejection loop below could spin forever.
if qfcs and len(docs) < 2:
    raise ValueError("need at least two documents to sample a negative example")

max_c_idx = len(docs) - 1
train_samples = []
eval_samples = []

for i, qfc in enumerate(qfcs):
    # Hold out part of the data for evaluation: positions 7, 8 and 9 of every
    # run of ten questions go to eval, the rest to train.
    target_samples = eval_samples if i % 10 >= 7 else train_samples

    question = qfc['question']
    # Index into `docs` of the document used as the positive (label=1) match.
    c_idx = qfc['docOffset']

    # Rejection-sample a different document index to serve as the negative.
    mc_idx = rng.randint(0, max_c_idx)
    while mc_idx == c_idx:
        mc_idx = rng.randint(0, max_c_idx)

    # Each question contributes one positive pair and one negative pair.
    target_samples.append(
        InputExample(texts=[question, docs[c_idx]['content']], label=1)
    )
    target_samples.append(
        InputExample(texts=[question, docs[mc_idx]['content']], label=0)
    )

In [None]:
# Sanity check of the split sizes; every question adds two pairs
# (one positive, one negative) to exactly one of the two lists.
print(len(train_samples), len(eval_samples))

In [None]:
from sentence_transformers.cross_encoder import CrossEncoder

# Cross-encoder initialised from the 'klue/roberta-small' checkpoint.
# num_labels=1 requests a single output score per (question, content) pair.
cross_model = CrossEncoder('klue/roberta-small', num_labels=1)

In [None]:
from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

# Build an evaluator from the held-out pairs and run it once before training,
# giving a baseline score for the not-yet-fine-tuned model.
# NOTE(review): the labels are binary (0/1); a classification-style evaluator
# may be more informative than a correlation metric — confirm.
ce_evaluator = CECorrelationEvaluator.from_input_examples(eval_samples)
ce_evaluator(cross_model)

In [None]:
import math

from torch.utils.data import DataLoader

train_batch_size = 16
num_epochs = 2
model_save_path = 'output/training_qfc'

# Shuffle so each batch mixes positive and negative pairs from different questions.
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)

# Warm up over the first 10% of all training steps instead of a fixed 100
# steps, so the warm-up length scales with the dataset size and can never
# cover (or exceed) the whole schedule on a small training set.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)

cross_model.fit(
    train_dataloader=train_dataloader,
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

In [13]:
# Write the fine-tuned model to model_save_path, the folder that is uploaded
# to the Hugging Face Hub later in this notebook.
cross_model.save(model_save_path)

In [None]:
# Run the same evaluator again after training to compare against the
# pre-training baseline score.
ce_evaluator(cross_model)

In [12]:
from huggingface_hub import HfApi, login

# Authenticate with the token taken from the environment (HF_TOKEN).
login(token=os.getenv('HF_TOKEN'))
api = HfApi()
repo_id = 'klue-roberta-small-cross-encoder-temp'
# Use one fully qualified id for both creating and uploading, so the two calls
# always refer to the same repository regardless of the token's default namespace.
full_repo_id = f"Kerneld/{repo_id}"

# exist_ok=True keeps this cell re-runnable: without it create_repo raises
# when the repository was already created by an earlier run.
api.create_repo(repo_id=full_repo_id, repo_type="model", exist_ok=True)

# Upload the saved model folder to the repository.
api.upload_folder(
    folder_path=model_save_path,
    repo_id=full_repo_id,
    repo_type="model",
)