In [None]:
#----------------------------------------------------------------------
# post로 search 하는 예시
#
# http://10.10.4.10:9000/search/?esindex=test3
#
# curl -X 'POST' \
#   'http://10.10.4.10:9000/search/?esindex=test3' \
#   -H 'accept: application/json' \
#   -H 'Content-Type: application/json' \
#   -d '{
#   "query": "사업계획서",
#   "search_size": 2
# }'
#----------------------------------------------------------------------
import requests
import json

FASTAPI_URL = "http://10.10.4.10:9000/search/"
ES_INDEX = 'test3'
SEARCH_SIZE = 5

def run_embedding_query_loop():
    """Keep calling handle_query_embedding() until a KeyboardInterrupt occurs.

    A KeyboardInterrupt raised while a query is being handled ends the loop
    quietly; any other exception propagates to the caller.
    """
    keep_going = True
    while keep_going:
        try:
            handle_query_embedding()
        except KeyboardInterrupt:
            # Interrupt means "stop prompting", not an error.
            keep_going = False
        
def handle_query_embedding(timeout=None):
    """Prompt for one search sentence and POST it to the search endpoint.

    The request goes to ``FASTAPI_URL`` with ``ES_INDEX`` as the ``esindex``
    query parameter and a JSON body containing the query text and
    ``SEARCH_SIZE``. The response body is printed on success.

    Args:
        timeout: Seconds passed to ``requests.post``. ``None`` (the default)
            waits indefinitely, which matches the previous behaviour.

    Returns:
        None. Results and errors are reported with ``print``.
    """
    # url
    url = FASTAPI_URL + "?esindex=" + ES_INDEX
    print(f'url:{url}')

    # headers
    headers = {
        "Content-Type": "application/json"
    }

    # data
    query = input("검색 문장 입력: ")

    # Request body, serialized to JSON text.
    payload = {
        "query": query,
        "search_size": SEARCH_SIZE,
    }
    data = json.dumps(payload)

    try:
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTPError handler below could never run.
        response.raise_for_status()
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
        return
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
        return
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
        # Show the body of the failed response too; it may explain the error.
        if errb.response is not None:
            print("response.text: ", errb.response.text)
        return
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
        return

    print("response.text: ", response.text)
    print()
    
    
run_embedding_query_loop()

In [None]:
#----------------------------------------------------------------------
# get으로 search 하는 예시
#
# http://10.10.4.10:9000/search/test3?query=%EC%82%AC%EC%97%85%EA%B3%84%ED%9A%8D%EC%84%9C&search_size=3
#
# curl -X 'GET' \
#   'http://10.10.4.10:9000/search/test3?query=%EC%82%AC%EC%97%85%EA%B3%84%ED%9A%8D%EC%84%9C&search_size=3' \
#   -H 'accept: application/json'
#----------------------------------------------------------------------
import requests
import json

FASTAPI_URL = "http://10.10.4.10:9000/search/"
ES_INDEX = 'test3'
SEARCH_SIZE = 5

def run_embedding_query_loop():
    """Keep calling handle_query_embedding() until a KeyboardInterrupt occurs.

    A KeyboardInterrupt raised while a query is being handled ends the loop
    quietly; any other exception propagates to the caller.
    """
    keep_going = True
    while keep_going:
        try:
            handle_query_embedding()
        except KeyboardInterrupt:
            # Interrupt means "stop prompting", not an error.
            keep_going = False
        
def handle_query_embedding(timeout=None):
    """Prompt for one search sentence and send it as a GET search request.

    The request URL is ``FASTAPI_URL`` + ``ES_INDEX`` with ``query`` and
    ``search_size`` as query-string parameters. The response body is printed
    on success.

    Args:
        timeout: Seconds passed to ``requests.get``. ``None`` (the default)
            waits indefinitely, which matches the previous behaviour.

    Returns:
        None. Results and errors are reported with ``print``.
    """
    # Local import so this cell does not depend on another cell's imports.
    from urllib.parse import quote, urlencode

    # headers: a GET has no body, so advertise the accepted response type
    # (as the documented curl call does) instead of a Content-Type.
    headers = {
        "Accept": "application/json"
    }

    # data
    query = input("검색 문장 입력: ")

    # Percent-encode the user text. Concatenating it raw lets characters such
    # as '&', '#', '+' or '=' truncate the query or inject extra parameters.
    query_string = urlencode({"query": query, "search_size": SEARCH_SIZE})
    url = FASTAPI_URL + quote(ES_INDEX, safe="") + "?" + query_string
    print(f'url:{url}')

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # requests does not raise on 4xx/5xx by itself; without this call the
        # HTTPError handler below could never run.
        response.raise_for_status()
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
        return
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
        return
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
        # Show the body of the failed response too; it may explain the error.
        if errb.response is not None:
            print("response.text: ", errb.response.text)
        return
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
        return

    print("response.text: ", response.text)
    print()
    
    
run_embedding_query_loop()

In [1]:
#----------------------------------------------------------------------
# post로 문서 임베딩 예시
#
# http://10.10.4.10:9000/embed/es/?esindex=test&createindex=false
#
# curl -X 'POST' \
#   'http://10.10.4.10:9000/embed/es/?esindex=test&createindex=false' \
#   -H 'accept: application/json' \
#   -H 'Content-Type: application/json' \
#   -d '{
#   "uids": [
#     "5"
#   ],
#   "titles": [
#     "경찰청장 한상균 소요죄적용 입장 변함없어"
#   ],
#   "documents": [
#     "강신명 경찰청장은 5일 한상균 민주노총 위원장에게 소요죄를 적용하지 않고 기소할 것으로 알려진 검찰의 방침과 관련..."
#   ]
# }'
#----------------------------------------------------------------------
import numpy as np
import pandas as pd
import time
import os

from os import sys
sys.path.append('../')
from myutils import remove_reverse, getListOfFiles, clean_text
from tqdm.notebook import tqdm


OUT_FOLDER = '../../data11/mpower_doc/out/' # root folder that holds the extracted TEXT files

# Collect the path of every file under OUT_FOLDER.
file_paths = getListOfFiles(OUT_FOLDER)
assert len(file_paths) > 0 # stop here when no files were found

print(f'*file_count: {len(file_paths)}, file_list:{file_paths[0:5]}')

contexts = []
titles = []
contextids = []

# Read each extracted TEXT file and remember its title, body and id.
contextid = 1000
for idx, file_path in enumerate(tqdm(file_paths)):
    # Skip Jupyter checkpoint copies.
    if '.ipynb_checkpoints' in file_path:
        continue

    sentences = []
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    # Clean the text with the "..PAGE:<number>" marker pattern.
    pattern = r"\.\.PAGE:\d+\s?"
    data = clean_text(text=data, pattern=pattern)

    # Keep only the file name, then apply remove_reverse(…, '.') twice.
    # Per the original note this turns e.g. "5.보안사업부 사업계획.hwp.txt"
    # into "5.보안사업부 사업계획" (both trailing extensions dropped).
    file_name = remove_reverse(
        remove_reverse(os.path.basename(file_path), '.'),
        '.',
    )

    contextid += 1
    contexts.append(data)         # file body
    titles.append(file_name)      # file name used as the title (later used as the query sentence)
    contextids.append(contextid)  # running id, first value is 1001

# Assemble the collected columns into a data frame.
df_contexts = pd.DataFrame(
    list(zip(contexts, titles, contextids)),
    columns=['context', 'question', 'contextid'],
)

print(f'*len(contexts): {len(contexts)}')



*file_count: 302, file_list:['../../data11/mpower_doc/out/9/Autotools를 이용한 리눅스 응용프로그램 빌드.pptx.txt', '../../data11/mpower_doc/out/9/Apache에서 Tomcat 사용하기.htm.txt', '../../data11/mpower_doc/out/9/BERT 모델 STS 성능 비교분석.pptx.txt', '../../data11/mpower_doc/out/9/클라우드서비스 구축 신청서.hwp.txt', '../../data11/mpower_doc/out/9/클라우드서비스 보안인증제 운영 변경관리서.hwp.txt']


  0%|          | 0/302 [00:00<?, ?it/s]

*len(contexts): 253


In [2]:
def index_batch(url: str, headers, docs, timeout=None):
    """POST one batch of documents to the embedding endpoint.

    Args:
        url: Full endpoint URL, including any query string.
        headers: HTTP headers passed to ``requests.post``.
        docs: Iterable of dicts, each with the keys ``'uid'``, ``'title'``
            and ``'document'``.
        timeout: Seconds passed to ``requests.post``. ``None`` (the default)
            waits indefinitely, which matches the previous behaviour.

    Returns:
        The ``requests.Response`` when the server answered with a success
        status; ``None`` when ``docs`` is empty or the request failed
        (failures are reported with ``print``).
    """
    # Nothing to index: do not send an empty batch to the server.
    if not docs:
        return None

    uids = [doc['uid'] for doc in docs]
    titles = [doc['title'] for doc in docs]
    documents = [doc['document'] for doc in docs]

    print(f'uids:{uids}')

    payload = {
        "uids": uids,
        "titles": titles,
        "documents": documents,
    }
    data = json.dumps(payload)

    try:
        response = requests.post(url, headers=headers, data=data, timeout=timeout)
        # requests does not raise on 4xx/5xx by itself; without this call a
        # rejected batch would be indistinguishable from an accepted one.
        response.raise_for_status()
    except requests.exceptions.Timeout as errd:
        print("Timeout Error:", errd)
        return None
    except requests.exceptions.ConnectionError as errc:
        print("Connection Error:", errc)
        return None
    except requests.exceptions.HTTPError as errb:
        print("Http Error:", errb)
        return None
    except requests.exceptions.RequestException as erra:  # any other requests error
        print("AnyException Error:", erra)
        return None

    print("responset: ", response)
    print()
    return response

In [3]:
import requests
import json
from tqdm.notebook import tqdm

FASTAPI_URL = "http://10.10.4.10:9000/embed/es/"
ES_INDEX = 'mpower-kpf-128d-f16-1'
SEARCH_SIZE = 5
BATCH_SIZE = 10  # number of documents sent per embed request

url = FASTAPI_URL + "?esindex=" + ES_INDEX
print(url)

# headers
headers = {
        "Content-Type": "application/json"
    }

# Columns of the data frame built in the loading cell.
uids = df_contexts['contextid'].values.tolist()
titles = df_contexts['question'].values.tolist()
documents = df_contexts['context'].values.tolist()

docs = []
batch_count = 0

start = time.time()
# zip() has no length, so the total is passed explicitly; without it the
# progress bar cannot show progress (it printed "0it").
for uid, title, document in tqdm(zip(uids, titles, documents), total=len(uids)):

    doc = {'uid': uid, 'title': title, 'document': document}
    docs.append(doc)

    batch_count += 1

    # Send a full batch and start collecting the next one.
    if batch_count % BATCH_SIZE == 0:
        index_batch(url=url, headers=headers, docs=docs)
        docs = []

# Send whatever is left over after the last full batch.
if docs:
    index_batch(url=url, headers=headers, docs=docs)
    docs = []

print(f'*임베딩 시간 : {time.time()-start:.4f}\n')

http://10.10.4.10:9000/embed/es/?esindex=mpower-kpf-128d-f16-1


0it [00:00, ?it/s]

uids:[1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010]
responset:  <Response [200]>

uids:[1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020]
responset:  <Response [200]>

uids:[1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 1030]
responset:  <Response [200]>

uids:[1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 1040]
responset:  <Response [200]>

uids:[1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 1050]
responset:  <Response [200]>

uids:[1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1060]
responset:  <Response [200]>

uids:[1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 1070]
responset:  <Response [200]>

uids:[1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1080]
responset:  <Response [200]>

uids:[1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090]
responset:  <Response [200]>

uids:[1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 1100]
responset:  <Response [200]>

uids:[1101, 1102, 1103, 1104, 1105, 1106