In [4]:
import os
import json
from tqdm import tqdm
from tqdm import tqdm

import torch
from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
from transformers import LongformerTokenizer, LongformerModel

In [2]:
# GPU 설정
device = torch.device('cuda:7' if torch.cuda.is_available() else 'cpu')

In [3]:

# DPR 모델과 토크나이저 로드
dpr_context_tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
dpr_context_model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
dpr_context_model.to(device)

# Longformer 모델과 토크나이저 로드
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
longformer_model.to(device)
longformer_model.eval()

# 규칙 문장 벡터화 (DPR로 인코딩)
def encode_contexts_dpr(rules):
    inputs = dpr_context_tokenizer(rules, padding=True, truncation=True, max_length=512, return_tensors='pt')
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = dpr_context_model(**inputs)
    return outputs.pooler_output  # [CLS] 토큰의 벡터 반환

# 규칙 문장 벡터화 (Longformer로 인코딩)
def encode_contexts_longformer(rules, batch_size=4):
    encoded_embeddings = []
    for i in range(0, len(rules), batch_size):
        batch_rules = rules[i:i + batch_size]
        inputs = longformer_tokenizer(batch_rules, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = longformer_model(**inputs)
        encoded_embeddings.append(outputs.last_hidden_state[:, 0, :])  # [CLS] 토큰의 벡터 반환
    return torch.cat(encoded_embeddings, dim=0)

# rule 데이터 가져오기
data = 'data/aihub_rules_prev.json'
with open(data, 'r', encoding="UTF-8") as j:
    aihub_rule = json.load(j)
    
normal_rule = aihub_rule['normal']
abnormal_rule = aihub_rule['abnormal']
combined_rules = normal_rule + abnormal_rule

# DPR 임베딩 생성 및 저장
context_embeddings_dpr = encode_contexts_dpr(combined_rules)
torch.save(context_embeddings_dpr, 'context_embeddings_dpr.pt')

# Longformer 임베딩 생성 및 저장
context_embeddings_longformer = encode_contexts_longformer(combined_rules)
torch.save(context_embeddings_longformer, 'context_embeddings_longformer.pt')

print("임베딩 저장 완료")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

임베딩 저장 완료


In [5]:
# DPR 모델과 토크나이저 로드
dpr_question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
dpr_question_model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
dpr_question_model.to(device)

# Longformer 모델과 토크나이저 로드
longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
longformer_model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
longformer_model.to(device)
longformer_model.eval()

# 쿼리 문장 벡터화 (DPR로 인코딩)
def encode_query_dpr(query):
    inputs = dpr_question_tokenizer(query, return_tensors='pt', truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = dpr_question_model(**inputs)
    return outputs.pooler_output  # [CLS] 토큰의 벡터 반환

# 쿼리 문장 벡터화 (Longformer로 인코딩)
def encode_query_longformer(query):
    inputs = longformer_tokenizer(query, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = longformer_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # [CLS] 토큰의 벡터 반환

Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def compute_similarity(query_embedding, doc_embedding):
    # 쿼리와 문서 간 유사도 계산 (예시)
    return torch.dot(query_embedding.flatten(), doc_embedding.flatten()).item()

In [7]:
def find_rank_of_answer_in_dpr_results(final_indices_mapped, answer_index):
    try:
        return final_indices_mapped.index(answer_index) + 1
    except ValueError:
        return -1

In [8]:
# 저장된 임베딩 불러오기
context_embeddings_dpr = torch.load('context_embeddings_dpr.pt')
context_embeddings_longformer = torch.load('context_embeddings_longformer.pt')

In [9]:
# discreption 가져오기
root_path = '/data1/sliver/jwsuh/construction_dataset/aihub/llava/llava_image_result_with_obj'
files = sorted([i for i in os.listdir(root_path) if i.endswith('.json')])

In [10]:
# top-K 리스트
list_k = [1, 5, 10, 15, 20, 25, 30, 50]
dpr_save_right_index = {}
dpr_save_wrong_index = {}

In [11]:
# 최종 top-K 평가
for final_top_k in list_k:
    correct = 0
    dpr_save_right_index[final_top_k] = {}
    dpr_save_wrong_index[final_top_k] = {}
    ind = 0
    
    for file in tqdm(files):
        with open(os.path.join(root_path, file), 'r', encoding="UTF-8") as j:
            caption = json.load(j)
        query_embedding_dpr = encode_query_dpr(caption['outputs'])

        # 초기 유사성 계산 (상위 50개 추출)
        initial_top_k = 50
        initial_similarities = torch.matmul(query_embedding_dpr, context_embeddings_dpr.T).squeeze(0)
        initial_top_k_indices = torch.topk(initial_similarities, k=initial_top_k).indices

        # 초기 상위 50개 문서에 대해 Longformer로 re-ranking 수행
        re_ranked_embeddings = context_embeddings_longformer[initial_top_k_indices]
        
        # GPU 메모리 해제
        torch.cuda.empty_cache()

        # Re-ranked 유사성 계산 (상위 final_top_k 추출)
        query_embedding_longformer = encode_query_longformer(caption['outputs'])
        re_ranked_similarities = torch.matmul(query_embedding_longformer, re_ranked_embeddings.T).squeeze(0)
        final_top_k_indices = torch.topk(re_ranked_similarities, k=final_top_k).indices

        # 최종 상위 final_top_k개의 인덱스를 초기 상위 50개의 인덱스로 매핑
        final_indices_mapped = [initial_top_k_indices[i].item() for i in final_top_k_indices]
        saved_final_indices_mapped = [(initial_top_k_indices[i].item(), re_ranked_similarities[i].item()) for i in final_top_k_indices]
        
        answer = file.split('_')[2]
        if answer[0] == 'Y':
            answer_index = int(answer[2:]) - 1
        elif answer[0] == 'N':
            answer_index = int(answer[2:]) + 49

        rank = find_rank_of_answer_in_dpr_results(final_indices_mapped, answer_index)

        if rank <= final_top_k and rank != -1:
            correct += 1
            dpr_save_right_index[final_top_k][ind] = saved_final_indices_mapped
        else:
            dpr_save_wrong_index[final_top_k][ind] = saved_final_indices_mapped

        ind += 1

    print(f"Top-{final_top_k} accuracy:", correct / len(files))

100%|██████████| 943/943 [01:45<00:00,  8.93it/s]


Top-1 accuracy: 0.019088016967126194


100%|██████████| 943/943 [01:39<00:00,  9.52it/s]


Top-5 accuracy: 0.07529162248144221


100%|██████████| 943/943 [01:40<00:00,  9.41it/s]


Top-10 accuracy: 0.1474019088016967


100%|██████████| 943/943 [01:39<00:00,  9.44it/s]


Top-15 accuracy: 0.22905620360551432


100%|██████████| 943/943 [01:40<00:00,  9.42it/s]


Top-20 accuracy: 0.3329798515376458


100%|██████████| 943/943 [01:41<00:00,  9.33it/s]


Top-25 accuracy: 0.3944856839872747


100%|██████████| 943/943 [01:41<00:00,  9.29it/s]


Top-30 accuracy: 0.4750795334040297


100%|██████████| 943/943 [01:43<00:00,  9.14it/s]

Top-50 accuracy: 0.8197242841993637





In [14]:
from transformers import RobertaTokenizer, RobertaModel

# RoBERTa 모델과 토크나이저 로드
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaModel.from_pretrained('roberta-base')
roberta_model.to(device)
roberta_model.eval()

def encode_query_roberta(query):
    inputs = roberta_tokenizer(query, return_tensors='pt', truncation=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    return outputs.last_hidden_state[:, 0, :]  # [CLS] 토큰의 벡터 반환

def encode_contexts_roberta(rules, batch_size=4):
    encoded_embeddings = []
    for i in range(0, len(rules), batch_size):
        batch_rules = rules[i:i + batch_size]
        inputs = roberta_tokenizer(batch_rules, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = roberta_model(**inputs)
        encoded_embeddings.append(outputs.last_hidden_state[:, 0, :])  # [CLS] 토큰의 벡터 반환
    return torch.cat(encoded_embeddings, dim=0)

# RoBERTa 임베딩 생성 및 저장
context_embeddings_roberta = encode_contexts_roberta(combined_rules)
torch.save(context_embeddings_roberta, 'context_embeddings_roberta.pt')

print("임베딩 저장 완료")

context_embeddings_roberta = torch.load('context_embeddings_roberta.pt')

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


임베딩 저장 완료


In [16]:
# top-K 리스트
list_k = [1, 5, 10, 15, 20, 25, 30, 50]
dpr_roberta_save_right_index = {}
dpr_roberta_save_wrong_index = {}

# 최종 top-K 평가
for final_top_k in list_k:
    correct = 0
    dpr_roberta_save_right_index[final_top_k] = {}
    dpr_roberta_save_wrong_index[final_top_k] = {}
    ind = 0
    
    for file in tqdm(files):
        with open(os.path.join(root_path, file), 'r', encoding="UTF-8") as j:
            caption = json.load(j)
        query_embedding_dpr = encode_query_dpr(caption['outputs'])
        query_embedding_roberta = encode_query_roberta(caption['outputs'])

        # 초기 유사성 계산 (상위 50개 추출)
        initial_top_k = 50
        initial_similarities = torch.matmul(query_embedding_dpr, context_embeddings_dpr.T).squeeze(0)
        initial_top_k_indices = torch.topk(initial_similarities, k=initial_top_k).indices

        # 초기 상위 50개 문서에 대해 RoBERTa로 re-ranking 수행
        re_ranked_embeddings_roberta = context_embeddings_roberta[initial_top_k_indices]
        
        # GPU 메모리 해제
        torch.cuda.empty_cache()

        # Re-ranked 유사성 계산 (상위 final_top_k 추출)
        re_ranked_similarities_roberta = torch.matmul(query_embedding_roberta, re_ranked_embeddings_roberta.T).squeeze(0)
        final_top_k_indices = torch.topk(re_ranked_similarities_roberta, k=final_top_k).indices

        # 최종 상위 final_top_k개의 인덱스를 초기 상위 50개의 인덱스로 매핑
        final_indices_mapped = [initial_top_k_indices[i].item() for i in final_top_k_indices]
        saved_final_indices_mapped = [(initial_top_k_indices[i].item(), re_ranked_similarities_roberta[i].item()) for i in final_top_k_indices]
        
        answer = file.split('_')[2]
        if answer[0] == 'Y':
            answer_index = int(answer[2:]) - 1
        elif answer[0] == 'N':
            answer_index = int(answer[2:]) + 49

        rank = find_rank_of_answer_in_dpr_results(final_indices_mapped, answer_index)

        if rank <= final_top_k and rank != -1:
            correct += 1
            dpr_roberta_save_right_index[final_top_k][ind] = saved_final_indices_mapped
        else:
            dpr_roberta_save_wrong_index[final_top_k][ind] = saved_final_indices_mapped

        ind += 1

    print(f"Top-{final_top_k} accuracy:", correct / len(files))

  0%|          | 0/943 [00:00<?, ?it/s]

100%|██████████| 943/943 [01:00<00:00, 15.58it/s]


Top-1 accuracy: 0.018027571580063628


100%|██████████| 943/943 [01:00<00:00, 15.71it/s]


Top-5 accuracy: 0.08483563096500531


100%|██████████| 943/943 [01:00<00:00, 15.59it/s]


Top-10 accuracy: 0.1728525980911983


100%|██████████| 943/943 [01:00<00:00, 15.50it/s]


Top-15 accuracy: 0.256627783669141


100%|██████████| 943/943 [01:01<00:00, 15.40it/s]


Top-20 accuracy: 0.3329798515376458


100%|██████████| 943/943 [01:02<00:00, 15.19it/s]


Top-25 accuracy: 0.3997879109225875


100%|██████████| 943/943 [01:02<00:00, 15.16it/s]


Top-30 accuracy: 0.49522799575821846


100%|██████████| 943/943 [01:03<00:00, 14.80it/s]

Top-50 accuracy: 0.8197242841993637



