In [1]:
import json
import os
from tqdm import tqdm
import numpy as np
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

import torch

[nltk_data] Downloading package punkt to /home/sliver/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# GPU 설정
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
# 데이터 로드
data = 'data/aihub_rules_prev.json'
with open(data, 'r', encoding="UTF-8") as j:
    aihub_rule = json.load(j)

normal_rule = aihub_rule['normal']
abnormal_rule = aihub_rule['abnormal']
combined_rules = normal_rule + abnormal_rule

In [4]:
# 데이터 로드 및 평가
root_path = '/data1/sliver/jwsuh/construction_dataset/aihub/llava/llava_image_result_with_obj'
files = sorted([i for i in os.listdir(root_path) if i.endswith('.json')])

# top-K 리스트
list_k = [1, 5, 10, 15, 20, 25, 30, 50]
save_right_index = {}
save_wrong_index = {}

def find_rank_of_answer_in_results(final_indices, answer_index):
    try:
        return final_indices.index(answer_index) + 1
    except ValueError:
        return -1

In [5]:
def tokenize(text):
    return word_tokenize(text.lower())

tokenized_rules = [tokenize(rule) for rule in combined_rules]

# BM25 모델 생성
bm25 = BM25Okapi(tokenized_rules)

In [6]:
from transformers import BertTokenizer, BertForSequenceClassification
from tqdm import tqdm


# BERT Cross-encoder 모델과 토크나이저 로드
cross_encoder_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
cross_encoder_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=1
)
cross_encoder_model.to(device)
cross_encoder_model.eval()


# 유사도 계산 함수 (Cross-encoder 사용)
def calculate_similarity_cross_encoder(query, document):
    inputs = cross_encoder_tokenizer(query, document, return_tensors='pt', truncation=True, padding=True, max_length=512)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    with torch.no_grad():
        outputs = cross_encoder_model(**inputs)
    return outputs.logits.item()

def bm25_initial_search_and_rerank(query, top_k=50):
    tokenized_query = tokenize(query)
    initial_top_k_scores = bm25.get_scores(tokenized_query)
    initial_top_k_indices =  np.argsort(initial_top_k_scores)[::-1][:top_k]
    
    reranked_scores = [calculate_similarity_cross_encoder(query, combined_rules[i]) for i in initial_top_k_indices]
    reranked_indices = np.argsort(reranked_scores)[::-1]
    final_top_k_indices = [initial_top_k_indices[i] for i in reranked_indices]
    
    return final_top_k_indices

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 9.78 GiB total capacity; 416.48 MiB already allocated; 2.44 MiB free; 452.00 MiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
calculate_similarity_cross_encoder

In [7]:
for final_top_k in list_k:
    correct = 0
    save_right_index[final_top_k] = {}
    save_wrong_index[final_top_k] = {}
    
    for idx, file in enumerate(tqdm(files)):
        with open(os.path.join(root_path, file), 'r', encoding="UTF-8") as j:
            caption = json.load(j)
        
        final_top_k_indices = bm25_initial_search_and_rerank(caption['outputs'], top_k=50)[:final_top_k]
        
        answer = file.split('_')[2]
        if answer[0] == 'Y':
            answer_index = int(answer[2:]) - 1
        elif answer[0] == 'N':
            answer_index = int(answer[2:]) + 49
            
        rank = find_rank_of_answer_in_results(final_top_k_indices, answer_index)

        if rank <= final_top_k and rank != -1:
            correct += 1
            save_right_index[final_top_k][idx] = final_top_k_indices
        else:
            save_wrong_index[final_top_k][idx] = final_top_k_indices

    print(f"Top-{final_top_k} accuracy:", correct / len(files))

  0%|          | 0/943 [00:00<?, ?it/s]

  2%|▏         | 23/943 [00:50<29:17,  1.91s/it] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned f

Top-1 accuracy: 0.13149522799575822


  2%|▏         | 23/943 [00:43<28:41,  1.87s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-5 accuracy: 0.6394485683987274


  2%|▏         | 23/943 [00:44<29:35,  1.93s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-10 accuracy: 0.83881230116649


  2%|▏         | 23/943 [00:44<28:19,  1.85s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-15 accuracy: 0.887592788971368


  2%|▏         | 23/943 [00:45<31:13,  2.04s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-20 accuracy: 0.9034994697773064


  2%|▏         | 23/943 [00:47<31:00,  2.02s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-25 accuracy: 0.9162248144220573


  2%|▏         | 23/943 [00:40<26:22,  1.72s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-30 accuracy: 0.9257688229056203


  2%|▏         | 23/943 [00:42<29:04,  1.90s/it]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned fo

Top-50 accuracy: 0.9406150583244963





In [8]:
save_right_index[10]

{0: [63, 13, 55, 5, 92, 32, 42, 53, 46, 82],
 2: [63, 13, 5, 32, 55, 11, 82, 79, 29, 75],
 3: [63, 13, 5, 55, 58, 32, 56, 96, 66, 94],
 4: [63, 13, 5, 55, 56, 6, 53, 3, 75, 25],
 5: [13, 63, 5, 32, 55, 14, 82, 3, 53, 60],
 6: [63, 13, 5, 55, 32, 56, 6, 75, 60, 79],
 7: [63, 13, 5, 55, 75, 32, 56, 25, 66, 94],
 8: [64, 14, 41, 65, 91, 15, 50, 0, 94, 44],
 9: [14, 64, 41, 91, 15, 65, 12, 79, 32, 40],
 10: [41, 14, 91, 64, 65, 15, 78, 44, 28, 94],
 11: [14, 64, 41, 91, 15, 65, 29, 79, 94, 44],
 12: [14, 64, 41, 15, 65, 91, 28, 78, 79, 29],
 13: [14, 64, 91, 65, 41, 15, 78, 28, 29, 44],
 14: [14, 64, 15, 41, 91, 65, 90, 44, 40, 29],
 15: [14, 64, 41, 91, 65, 15, 60, 92, 78, 94],
 16: [14, 64, 91, 65, 41, 15, 78, 28, 44, 94],
 17: [14, 64, 41, 91, 65, 15, 44, 78, 28, 79],
 18: [14, 64, 41, 91, 65, 15, 78, 28, 79, 92],
 19: [14, 64, 65, 15, 41, 91, 78, 44, 79, 29],
 20: [64, 14, 41, 65, 91, 15, 79, 78, 42, 29],
 21: [64, 14, 65, 41, 91, 15, 78, 28, 79, 29],
 22: [14, 64, 91, 41, 65, 15, 28, 

In [13]:
len(combined_rules)

100

In [15]:
for key in save_wrong_index[10].keys():
    print("file name:", files[key])
    answer = files[key].split('_')[2]
    
    if answer[0] == 'Y':
        answer_index = int(answer[2:]) - 1
    elif answer[0] == 'N':
        answer_index = int(answer[2:]) + 49
    print(answer_index)
    for idx, value in enumerate(save_wrong_index[10][key]):
        print(f"rank-{idx+1:<2}", f"{combined_rules[value]:>4} {value} ")
    print("\n")

file name: H-220607_B16_Y-14_002_0029.json
13
rank-1  Working only on the top (or bottom) of the system scaffold 11 
rank-2  Simultaneous top and bottom work on system scaffold (two-person operation) 61 
rank-3  No fire extinguisher placement next to welding equipment 82 
rank-4  Material placement at the edge of a horse scaffold 92 
rank-5  Improper overloading of a Ladder Truck 66 
rank-6  Proper placement of materials and tools on a horse scaffold 42 
rank-7  Fire extinguisher placement next to welding equipment 32 
rank-8  rolling tower safety railing installation 6 
rank-9  No materials placed on the end of the formwork 14 
rank-10 Faulty concrete pump truck safety device installation 94 


file name: H-220609_A18_Y-04_001_0021.json
3
rank-1  Fire extinguisher placement next to welding equipment 32 
rank-2  No fire extinguisher placement next to welding equipment 82 
rank-3  Worker operating within the hazard radius of a dump truck 75 
rank-4  Improper overloading of a Ladder Truc