In [None]:
#----------------------------------------------------------------------
# post로 search 하는 예시
#
# http://10.10.4.10:9000/search/?esindex=test3
#
# curl -X 'POST' \
#   'http://10.10.4.10:9000/search/?esindex=test3' \
#   -H 'accept: application/json' \
#   -H 'Content-Type: application/json' \
#   -d '{
#   "query": "사업계획서",
#   "search_size": 2
# }'
#----------------------------------------------------------------------
import requests
import json

# URL of the FastAPI search endpoint that the POST example sends requests to.
FASTAPI_URL = "http://10.10.4.10:9000/search/"
# Elasticsearch index name; appended to the URL as the `esindex` query parameter.
ES_INDEX = 'test3'
# Value sent as `search_size` in the JSON request body.
SEARCH_SIZE = 5

def run_embedding_query_loop():
    """Call handle_query_embedding() over and over until interrupted.

    A KeyboardInterrupt (Ctrl+C) stops the loop; the function then returns
    None.
    """
    try:
        while True:
            handle_query_embedding()
    except KeyboardInterrupt:
        pass
        
def handle_query_embedding(timeout=60):
    """Prompt for one search sentence and POST it to the search endpoint.

    The request goes to FASTAPI_URL with ES_INDEX as the ``esindex`` query
    parameter; the JSON body carries the typed query and SEARCH_SIZE. The
    type and content of the response text are printed. Nothing is returned.

    Args:
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely, which also leaves the
            Timeout handler below effectively unreachable.
    """
    # Target URL: endpoint plus the index name as a query parameter.
    url = FASTAPI_URL + "?esindex=" + ES_INDEX
    print(f'url:{url}')

    headers = {
        "Content-Type": "application/json"
    }

    # Read the search sentence from the user.
    query = input("검색 문장 입력: ")

    # Request payload, serialized from a dict to a JSON string.
    data = json.dumps({
        "query": query,
        "search_size": SEARCH_SIZE,
    })

    try:
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
        return
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
        return
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
        return
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
        return

    print(type(response.text))
    print("response.text: ", response.text)
    print()
    
    
# Start the interactive POST search loop; it returns on KeyboardInterrupt (Ctrl+C).
run_embedding_query_loop()

In [None]:
#----------------------------------------------------------------------
# get으로 search 하는 예시
#
# http://10.10.4.10:9000/search/test3?query=%EC%82%AC%EC%97%85%EA%B3%84%ED%9A%8D%EC%84%9C&search_size=3
#
# curl -X 'GET' \
#   'http://10.10.4.10:9000/search/test3?query=%EC%82%AC%EC%97%85%EA%B3%84%ED%9A%8D%EC%84%9C&search_size=3' \
#   -H 'accept: application/json'
#----------------------------------------------------------------------
import requests
import json

# Elasticsearch index name placed in the request path.
ES_INDEX = 'test3'
# Base URL of the FastAPI server; the request path is appended when the URL is
# built. (The closing quote was missing, which made the whole cell a SyntaxError.)
FASTAPI_URL = "http://10.10.4.10:9000/"
# Value sent as the `search_size` query parameter.
SEARCH_SIZE = 5

def run_embedding_query_loop():
    """Call handle_query_embedding() over and over until interrupted.

    A KeyboardInterrupt (Ctrl+C) stops the loop; the function then returns
    None.
    """
    try:
        while True:
            handle_query_embedding()
    except KeyboardInterrupt:
        pass
        
def handle_query_embedding(timeout=60):
    """Prompt for one search sentence and send it to the search API via GET.

    The URL is FASTAPI_URL + "es/<ES_INDEX>/docs" with ``query`` and
    ``search_size`` as query parameters. The type and content of the response
    text are printed. Nothing is returned.

    Args:
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely, which also leaves the
            Timeout handler below effectively unreachable.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote, urlencode

    headers = {
        "Content-Type": "application/json"
    }

    # Read the search sentence from the user.
    query = input("검색 문장 입력: ")

    # Percent-encode the parameters: raw user text containing '&', '#', '+'
    # or spaces would otherwise truncate or alter the query string.
    query_string = urlencode(
        {"query": query, "search_size": SEARCH_SIZE},
        quote_via=quote,
    )
    url = FASTAPI_URL + "es/" + ES_INDEX + "/docs?" + query_string
    print(f'url:{url}')

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
        return
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
        return
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
        return
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
        return

    print(type(response.text))
    print("response.text: ", response.text)
    print()
    
    
# Start the interactive GET search loop; it returns on KeyboardInterrupt (Ctrl+C).
run_embedding_query_loop()

In [None]:
#----------------------------------------------------------------------
# post로 문서 임베딩 예시
#
# http://10.10.4.10:9000/embed/es/?esindex=test&createindex=false
#
# curl -X 'POST' \
#   'http://10.10.4.10:9000/embed/es/?esindex=test&createindex=false' \
#   -H 'accept: application/json' \
#   -H 'Content-Type: application/json' \
#   -d '{
#   "uids": [
#     "5"
#   ],
#   "titles": [
#     "경찰청장 한상균 소요죄적용 입장 변함없어"
#   ],
#   "documents": [
#     "강신명 경찰청장은 5일 한상균 민주노총 위원장에게 소요죄를 적용하지 않고 기소할 것으로 알려진 검찰의 방침과 관련..."
#   ]
# }'
#----------------------------------------------------------------------
import numpy as np
import pandas as pd
import time
import os

from os import sys
sys.path.append('../')
from myutils import remove_reverse, getListOfFiles, clean_text
from tqdm.notebook import tqdm


OUT_FOLDER = '../../data11/mpower_doc/out/'  # root folder holding the extracted TEXT files

# Collect the path of every file under OUT_FOLDER.
file_paths = getListOfFiles(OUT_FOLDER)
if len(file_paths) == 0:
    # Explicit check instead of `assert`, which is removed under `python -O`.
    raise FileNotFoundError(f'no files found under {OUT_FOLDER}')

print('*file_count: {}, file_list:{}'.format(len(file_paths), file_paths[0:5]))

contexts = []
titles = []
contextids = []

# Regex for the "..PAGE:<number>" markers that are stripped from each text.
# It does not change between files, so it is defined once outside the loop.
pattern = r"\.\.PAGE:\d+\s?"

# Read each extracted TEXT file and keep its title, content and a running id.
contextid = 1000
for file_path in tqdm(file_paths):
    # Skip Jupyter checkpoint copies.
    if '.ipynb_checkpoints' in file_path:
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    # The file is already closed here; only the in-memory text is processed.
    data = clean_text(text=data, pattern=pattern)

    # Keep only the file name, then strip the trailing extensions. Per the
    # original author's note, '5.보안사업부 사업계획.hwp.txt' becomes
    # '5.보안사업부 사업계획.hwp' and then '5.보안사업부 사업계획'.
    file_name = os.path.basename(file_path)
    file_name = remove_reverse(file_name, '.')
    file_name = remove_reverse(file_name, '.')

    contextid += 1
    contexts.append(data)         # file content
    titles.append(file_name)      # file name used as the title (later the query sentence)
    contextids.append(contextid)  # running id; the first stored id is 1001

# Build the DataFrame; the title is stored under the 'question' column.
df_contexts = pd.DataFrame({
    'context': contexts,
    'question': titles,
    'contextid': contextids,
})

print(f'*len(contexts): {len(contexts)}')

In [None]:
def index_batch(url:str, headers, docs, timeout=300):
    """POST one batch of documents to the given endpoint as JSON.

    The batch is sent as three parallel lists under the keys "uids",
    "titles" and "documents".

    Args:
        url: Endpoint URL that receives the batch.
        headers: HTTP headers passed to ``requests.post``.
        docs: Iterable of dicts, each with 'uid', 'title' and 'document' keys.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` can block indefinitely on an unresponsive server.

    Returns:
        The ``requests.Response`` when the request completed, or None when
        the batch was empty or a ``requests`` exception was raised (the
        exception is printed).
    """
    # Materialize once so the input can be traversed several times.
    docs = list(docs)
    if not docs:
        # Nothing to send; skip the network round trip.
        return None

    uids = [doc['uid'] for doc in docs]
    titles = [doc['title'] for doc in docs]
    documents = [doc['document'] for doc in docs]

    print(f'uids:{uids}')

    data = json.dumps({
        "uids": uids,
        "titles": titles,
        "documents": documents,
    })

    try:
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
        return None
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
        return None
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
        return None
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
        return None

    print("responset: ", response)
    print()
    return response

In [None]:
import requests
import json
from tqdm.notebook import tqdm

# Endpoint that receives the document batches for embedding/indexing.
FASTAPI_URL = "http://10.10.4.10:9000/es/mpower-kpf-128d-f16-avg/docs"
url = FASTAPI_URL
print(url)

# HTTP headers sent with every batch.
headers = {
        "Content-Type": "application/json"
    }

# Column values of the DataFrame as plain Python lists.
uids = df_contexts['contextid'].values.tolist()
titles = df_contexts['question'].values.tolist()
documents = df_contexts['context'].values.tolist()

BATCH_SIZE = 10  # number of documents sent per index_batch() call

docs = []
batch_count = 0

start = time.time()
# zip() has no length, so the total is passed explicitly; otherwise tqdm
# cannot display a percentage or an ETA.
for uid, title, document in tqdm(zip(uids, titles, documents), total=len(uids)):
    docs.append({'uid': uid, 'title': title, 'document': document})
    batch_count += 1

    # Flush a full batch.
    if batch_count % BATCH_SIZE == 0:
        index_batch(url=url, headers=headers, docs=docs)
        docs = []

# Flush the remaining partial batch, if any.
if docs:
    index_batch(url=url, headers=headers, docs=docs)
    docs = []

print(f'*임베딩 시간 : {time.time()-start:.4f}\n')

In [1]:
#------------------------------------------------------------
# FastAPI 임베딩 서버를 이용한 MRR 계산 예시
#------------------------------------------------------------

from elasticsearch import Elasticsearch
from elasticsearch import helpers
import pandas as pd

# Name of the Elasticsearch index queried by this cell.
INDEX_NAME = 'mpower-kpf-128d-f16-variable'  # ES index name (*must be lowercase only)

# Connect to the Elasticsearch server.
es = Elasticsearch("http://10.10.4.10:9200/")
# Request cluster info; the return value is discarded here.
es.info()

#------------------------------------------------------------
# 쿼리 df 만듬.
# => 인덱스내 데이터 조회 => query 이용해서 데이터 조회 후 쿼리 df 만듬
# 
# GET /index명/_search
#{
#  "_source": ["rfile_name","rfile_text"], 
#  "query": {
#    "match_all": {}
#  }
# }	
#-----------------------------------------------------------

def search(index_name, data=None, source:list=None, size:int=None):
    """Run a ``match`` (or ``match_all``) query against an index.

    Uses the module-level ``es`` client.

    Args:
        index_name: Name of the index to search.
        data: Field/value mapping used as the body of a ``match`` query.
            When None, a ``match_all`` query is sent instead.
        source: Optional list of field names placed under ``_source`` in
            the request body.
        size: Optional maximum number of hits to request. When None the key
            is left out and the server default applies (this file notes that
            match_all returned only 10 hits).

    Returns:
        Whatever ``es.search`` returns for the built request body.
    """
    # No criteria means "fetch everything"; otherwise do a match query.
    query = {"match_all": {}} if data is None else {"match": data}

    body = {"query": query}
    if source is not None:
        body["_source"] = source
    if size is not None:
        body["size"] = size

    res = es.search(index=index_name, body=body)
    return res

rfile_list = []

# A single match_all query returned only 10 hits (per the original author's
# note), so each document is fetched with its own query instead.
# Probe rfile_name values 1000..1252 and keep the top hit of each query.
# NOTE(review): the original comment says the ids run from 1001 to 1252, so
# the probe for 1000 presumably finds nothing - confirm.
for i in range(253):
    contextid = 1000+i
    data = {'rfile_name': contextid}
    res = search(index_name=INDEX_NAME, data=data, source=["rfile_name","rfile_text"])

    hit_list = res['hits']['hits']
    if not hit_list:
        continue

    # Only the first hit is used.
    first_source = hit_list[0]['_source']
    rfile_name = first_source['rfile_name']
    rfile_text = first_source['rfile_text']

    # Append only complete records. The append used to sit outside this check,
    # so a hit with an empty field re-added whatever `docs` held before.
    if rfile_name and rfile_text:
        rfile_list.append({'rfile_name': rfile_name, 'rfile_text': rfile_text})


# Build the query DataFrame from the collected records.
contextids = [rfile['rfile_name'] for rfile in rfile_list]
questions = [rfile['rfile_text'] for rfile in rfile_list]

# Show the first 10 records as a sanity check.
for rfile in rfile_list[:10]:
    print(f"{rfile['rfile_name']} : {rfile['rfile_text']}")

df_questions = pd.DataFrame({'question': questions, 'contextid': contextids})

  res = es.search(index=index_name, body=body)


1001 : Autotools를 이용한 리눅스 응용프로그램 빌드
1002 : Apache에서 Tomcat 사용하기
1003 : BERT 모델 STS 성능 비교분석
1004 : 클라우드서비스 구축 신청서
1005 : 클라우드서비스 보안인증제 운영 변경관리서
1006 : 회사 직원 급여관련 규정
1007 : BERT 검색 , 분류 시스템 성능 측정
1008 : 테이블 데이터 조회하는 방법
1009 : 클라우드_취약점_점검_가이드(up)
1010 : 회사 채용,승진,퇴사관련 인사규정


In [2]:
#-------------------------------------------------------------------------------------
# 쿼리 실행
#user_querys = ["독도에서 사고가 나서 실종자가 발생했다.", "오늘 날씨가 흐리고 비가 오겠다."]
#-------------------------------------------------------------------------------------
import time
import requests
import json
from tqdm.notebook import tqdm
from os import sys
sys.path.append('../')
from myutils import df_sampling

#--------------------------------------
# param
#--------------------------------------
FASTAPI_URL = "http://10.10.4.10:9000/"
SEARCH_SIZE = 5
QMETHOD = 1 # qmethod is 0 or 1 (0 = use the max vector, 1 = use the mean vector (default=0))
REQUEST_TIMEOUT = 60 # seconds to wait for each search request before giving up
#--------------------------------------


user_querys = df_questions['question'].values.tolist()

headers = {"Content-Type": "application/json"}

bi_predictions_list=[]

# Path only: the query string goes through `params` so that requests
# percent-encodes the text ('&', '#', '+', spaces, ...).
url = FASTAPI_URL + "es/"+INDEX_NAME+"/docs"

for user_query in tqdm(user_querys):
    params = {
        "query": user_query,
        "search_size": SEARCH_SIZE,
        "qmethod": QMETHOD,
    }

    # Default to "nothing retrieved". Every query then contributes exactly one
    # entry, which keeps the predictions aligned with the ground-truth list
    # used for MRR. NOTE(review): assumes mean_reciprocal_rank accepts an
    # empty prediction list - confirm.
    rfilename = []

    try:
        response = requests.get(url, params=params, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
    else:
        # Read the response only when the request succeeded. Previously a
        # failed request fell through and used an undefined or stale response.
        if response.status_code == 200:
            rfilename = [doc['rfile_name'] for doc in response.json()['docs']]

    # Store the retrieved ids for the MRR computation.
    bi_predictions_list.append(rfilename[0:SEARCH_SIZE])

print(f'*len:{len(bi_predictions_list)}')
print(bi_predictions_list[0:10])




  0%|          | 0/252 [00:00<?, ?it/s]

*len:252
[[1001, 1083, 1087, 1121, 1250], [1002, 1059, 1250, 1220, 1048], [1064, 1075, 1043, 1024, 1007], [1186, 1180, 1122, 1004, 1103], [1005, 1234, 1233, 1186, 1151], [1006, 1163, 1164, 1136, 1010], [1233, 1237, 1228, 1158, 1041], [1207, 1209, 1189, 1021, 1233], [1250, 1185, 1228, 1009, 1175], [1010, 1132, 1136, 1187, 1006]]


In [3]:
#--------------------------------------------------------------------------------------------------
# MRR 계산
# => 정답 리스트[2,3,1,4] 과 예측검색리스트[[1,2,5,1],[3,4,2,1],[6,5,4,1], [2,3,4,1]]를 입력하여 MRR 스코어 구함
##--------------------------------------------------------------------------------------------------
from myutils import mean_reciprocal_rank

# 정답, 여기서는 contextid를 리스트로 만듬.
# Ground truth: the contextid of every question, as a list.
ground_truths_list = df_questions['contextid'].values.tolist()

# Compute MRR from the ground truths and the predicted id lists.
bi_ranks, bi_score = mean_reciprocal_rank(ground_truths_list, bi_predictions_list)

# Print BI-MRR.
print(f'----------------------------------------------------------------------------')
print('*BI-MRR:{:.4f}'.format(bi_score))
print(f'*Ranks({len(bi_ranks)}):{bi_ranks[0:10]}')

# Print the ranks in rows of 10.
if len(bi_ranks) > 10:
    print()
    print(f'BI_RANKS 10개씩 출력')
    print('------------------------------------------------------------------------------')
    subarrays = [bi_ranks[i:i+10] for i in range(0, len(bi_ranks), 10)]
    for i, subarray in enumerate(subarrays):
        print(f"{i}: {subarray}")

# Count hits and misses; this script treats a rank of 0 as a failed search.
nosearch_list = [i for i, item in enumerate(bi_ranks) if item == 0]
nosearch_count = len(nosearch_list)
search_count = len(bi_ranks) - nosearch_count

# Guard the percentage against an empty rank list (ZeroDivisionError).
search_rate = (search_count/len(bi_ranks))*100 if len(bi_ranks) else 0.0
print('*검색률: {}/{}({:.2f}%)'.format(search_count, len(bi_ranks), search_rate))
print(f'---------------------------------------------------------------------------')

# List the questions whose search failed.
print()
print('*검색실패 : {}'.format(nosearch_count))
for nosearch in nosearch_list:
    print(f'[{nosearch}] : {df_questions["question"][nosearch]}')

----------------------------------------------------------------------------
*BI-MRR:0.3616
*Ranks(252):[1.0, 1.0, 0, 0.25, 1.0, 1.0, 0, 0, 0.25, 1.0]

BI_RANKS 10개씩 출력
------------------------------------------------------------------------------
0: [1.0, 1.0, 0, 0.25, 1.0, 1.0, 0, 0, 0.25, 1.0]
1: [1.0, 0, 0, 0.5, 1.0, 1.0, 0, 0, 1.0, 0]
2: [0, 0.25, 0.2, 1.0, 0.25, 1.0, 0, 0, 0, 0.5]
3: [0, 1.0, 0, 0.2, 0, 1.0, 0, 0, 0, 1.0]
4: [0, 0.5, 1.0, 0.3333333333333333, 0, 0.25, 0, 1.0, 0, 0.5]
5: [0, 0, 0, 0, 0, 0, 0, 0, 0.5, 0.2]
6: [0, 0, 0.2, 0.5, 0, 0, 0, 0, 0, 1.0]
7: [0, 1.0, 0, 0, 1.0, 0, 0, 0, 1.0, 0]
8: [1.0, 0.3333333333333333, 0.5, 0, 0, 0.5, 0, 1.0, 1.0, 0]
9: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
10: [0, 0, 0, 0, 0, 0, 0, 1.0, 0, 0]
11: [0.5, 0, 0.5, 0, 0.3333333333333333, 1.0, 0, 0.25, 0.3333333333333333, 0]
12: [1.0, 0.5, 0, 0, 0.3333333333333333, 0.3333333333333333, 1.0, 1.0, 0, 1.0]
13: [1.0, 0.25, 1.0, 0.25, 0.5, 1.0, 0, 1.0, 0, 0]
14: [0, 0, 0, 0.5, 0, 0, 0, 1.0, 0.5, 0.33333333