### 1. Setup

In [10]:
import json
import getpass
import os
from tqdm import tqdm
import requests
import time
import uuid
from uuid import uuid4
from pathlib import Path

In [65]:
!pip install beautifulsoup4 lxml
!pip install unstructured
!pip install -qU "langchain-chroma>=0.1.2"



In [23]:
import ssl

# WARNING: this swaps the stdlib's default HTTPS context factory for the
# unverified one, so HTTPS clients that rely on that default stop validating
# server certificates. Only use it behind a trusted proxy / self-signed chain.
# Interpreters that lack ssl._create_unverified_context are left untouched.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

In [101]:
import requests

# Silence urllib3's InsecureRequestWarning. This only hides the warning that
# is shown for unverified HTTPS requests; it does not itself turn certificate
# verification off.
_urllib3 = requests.packages.urllib3
_urllib3.disable_warnings(_urllib3.exceptions.InsecureRequestWarning)

In [None]:
# Ask for both API keys with getpass (input is not echoed) and expose them to
# the rest of the notebook through environment variables.
for _env_name, _prompt in (
    ("NCP_CLOVASTUDIO_API_KEY", "NCP CLOVA Studio API Key: "),
    ("NCP_APIGW_API_KEY", "NCP API Gateway API Key: "),
):
    os.environ[_env_name] = getpass.getpass(_prompt)

In [6]:
# App ID read interactively (echoed, unlike the API keys); later passed to
# EmbeddingExecutor when it is constructed.
os.environ["NCP_CLOVASTUDIO_APP_ID"] = input("NCP CLOVA Studio App ID: ")

In [21]:
# App ID that SegmentationExecutor appends to the segmentation endpoint path.
os.environ["NCP_CLOVASTUDIO_APP_ID_SEGMENTATION"] = input("NCP CLOVA Studio Segmentation App ID: ")

In [7]:
from langchain_community.embeddings import ClovaXEmbeddings
 
clovax_embeddings = ClovaXEmbeddings(model='bge-m3') # Choose the embedding model to use


### 2. Load

In [11]:
import subprocess

# Maps each downloaded URL to the local file name it was saved under, so a
# later cell can restore the original URL as each document's "source".
url_to_filename_map = {}

# One URL per line. Blank lines (e.g. a trailing newline at EOF) are skipped so
# wget is never invoked with an empty URL, and repeated URLs are collapsed
# (order preserved) so the same page is not downloaded and indexed twice.
with open("clovastudiourl.txt", "r") as file:
    urls = list(dict.fromkeys(line.strip() for line in file if line.strip()))

folder_path = "clovastudioguide"
os.makedirs(folder_path, exist_ok=True)

used_filenames = set()
for url in urls:
    # The last non-empty path segment becomes the file stem; rstrip("/") keeps
    # URLs that end in "/" from all collapsing to the bare name ".html".
    stem = url.rstrip("/").split("/")[-1]
    filename = stem + ".html"

    # Different URLs can share a last segment; add a numeric suffix instead of
    # silently overwriting the earlier download (and its map entry).
    duplicate_index = 1
    while filename in used_filenames:
        filename = f"{stem}-{duplicate_index}.html"
        duplicate_index += 1
    used_filenames.add(filename)

    file_path = os.path.join(folder_path, filename)
    # check=True aborts the cell if wget exits with a non-zero status.
    subprocess.run(["wget", "--user-agent=RAGCookbook-Crawler/1.0", "-O", file_path, url], check=True)
    url_to_filename_map[url] = filename

# Persist the mapping so the source-restoring cell works after a kernel restart.
with open("url_to_filename_map.json", "w") as map_file:
    json.dump(url_to_filename_map, map_file)

--2024-11-13 09:14:46--  https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1
Resolving deviparikh.medium.com (deviparikh.medium.com)... 162.159.153.4, 162.159.152.4, 2606:4700:7::a29f:9904, ...
Connecting to deviparikh.medium.com (deviparikh.medium.com)|162.159.153.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘clovastudioguide/how-we-write-rebuttals-dc84742fece1.html’

     0K .......... .......... .......... .......... .......... 3.17M
    50K .......... .......... .......... .......... .......... 3.75M
   100K .......... .......... .......... .......... .......... 2.90M
   150K .......... .......... .......... .......... .......    12.2M=0.05s

2024-11-13 09:14:46 (3.93 MB/s) - ‘clovastudioguide/how-we-write-rebuttals-dc84742fece1.html’ saved [201960]



In [15]:
# Requires beautifulsoup4 and lxml (install them first if they are missing).
from langchain_community.document_loaders import BSHTMLLoader

# Folder containing the downloaded pages; adjust to match your folder name.
html_files_dir = Path('clovastudioguide')
html_files = [html_path for html_path in html_files_dir.glob("*.html")]

# One entry per HTML file; each entry is whatever the loader's load() returned.
clovastudiodatas = []
for html_file in html_files:
    document_data = BSHTMLLoader(str(html_file)).load()
    clovastudiodatas.append(document_data)
    print(f"Processed {html_file}")

Processed clovastudioguide/how-we-write-rebuttals-dc84742fece1.html


In [17]:
# Requires the unstructured package (install it first if it is missing).
from langchain_community.document_loaders import UnstructuredHTMLLoader

# Folder containing the downloaded pages; adjust to match your folder name.
html_files_dir = Path('./clovastudioguide')
html_files = [html_path for html_path in html_files_dir.glob("*.html")]

# Rebuilds clovastudiodatas from scratch: one entry per HTML file, each entry
# being whatever the loader's load() returned.
clovastudiodatas = []
for html_file in html_files:
    document_data = UnstructuredHTMLLoader(str(html_file)).load()
    clovastudiodatas.append(document_data)
    print(f"Processed {html_file}")

Processed clovastudioguide/how-we-write-rebuttals-dc84742fece1.html


In [18]:
# Load the URL -> file name mapping written by the download cell and invert it.
with open("url_to_filename_map.json", "r") as map_file:
    url_to_filename_map = json.load(map_file)

filename_to_url_map = {v: k for k, v in url_to_filename_map.items()}

# Replace each Document's local-path 'source' with the URL it was fetched from.
for doc_list in clovastudiodatas:
    for doc in doc_list:
        source = doc.metadata["source"]
        if source in url_to_filename_map:
            # Already a known URL (this cell was run before) - nothing to do,
            # and no spurious "not found" warning.
            continue
        # Path(...).name takes the final path component using the platform's
        # own separator rules rather than assuming "/".
        extracted_filename = Path(source).name
        if extracted_filename in filename_to_url_map:
            doc.metadata["source"] = filename_to_url_map[extracted_filename]
        else:
            print(f"Warning: {extracted_filename}에 해당하는 URL을 찾을 수 없습니다.")

# Show every distinct source now present. Unlike printing the leaked loop
# variable, this also works when clovastudiodatas is empty.
for rewritten_source in sorted({d.metadata["source"] for docs in clovastudiodatas for d in docs}):
    print(rewritten_source)


https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1


In [19]:
# Flatten the list of per-file document lists into a single flat list.
clovastudiodatas_flattened = []
for sublist in clovastudiodatas:
    clovastudiodatas_flattened.extend(sublist)

In [20]:
# Print the URL and the first 100 characters of content for the first 2 documents.
# NOTE(review): the original comment said "first 3", but the slice below takes
# only 2 - confirm which is intended.
for doc in clovastudiodatas_flattened[:2]:
    print(f"URL: {doc.metadata['source']}")
    print(f"{doc.page_content[:100]}...")
    print("-" * 50)

URL: https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1
Open in app

Sign in

Write

Sign in

How we write rebuttals

Devi Parikh

Follow

10 min read

May ...
--------------------------------------------------


### 3. Chunking

In [24]:
import http.client
from http import HTTPStatus
  
class CLOVAStudioExecutor:
    """Minimal HTTPS client that POSTs JSON requests to a CLOVA Studio host.

    Both API keys are read from the ``NCP_CLOVASTUDIO_API_KEY`` and
    ``NCP_APIGW_API_KEY`` environment variables when the executor is created.
    """

    def __init__(self, host, request_id=None):
        """Store the target host and credentials.

        Args:
            host: Host name the HTTPS connection is opened to.
            request_id: Value sent in the ``X-NCP-CLOVASTUDIO-REQUEST-ID``
                header. A random UUID4 string is generated when omitted/empty.

        Raises:
            KeyError: If either API-key environment variable is not set.
        """
        self._host = host
        self._api_key = os.environ["NCP_CLOVASTUDIO_API_KEY"]
        self._api_key_primary_val = os.environ["NCP_APIGW_API_KEY"]
        self._request_id = request_id or str(uuid.uuid4())

    def _send_request(self, completion_request, endpoint):
        """POST ``completion_request`` as JSON to ``endpoint``.

        Returns:
            A ``(result, status)`` tuple: the decoded JSON body and the HTTP
            status code.

        Raises:
            ValueError: If the response body is not valid UTF-8 JSON. The
                message includes the HTTP status and the start of the body.
        """
        headers = {
            'Content-Type': 'application/json; charset=utf-8',
            'X-NCP-CLOVASTUDIO-API-KEY': self._api_key,
            'X-NCP-APIGW-API-KEY': self._api_key_primary_val,
            'X-NCP-CLOVASTUDIO-REQUEST-ID': self._request_id
        }

        conn = http.client.HTTPSConnection(self._host)
        try:
            conn.request('POST', endpoint, json.dumps(completion_request), headers)
            response = conn.getresponse()
            status = response.status
            raw_body = response.read()
        finally:
            # Always release the socket, including when the request or the
            # read raises.
            conn.close()

        try:
            result = json.loads(raw_body.decode(encoding='utf-8'))
        except ValueError as exc:
            # UnicodeDecodeError and JSONDecodeError are both ValueErrors.
            # Re-raise with the HTTP status so a non-JSON error page is
            # diagnosable instead of a bare "Expecting value" message.
            raise ValueError(
                f"오류 발생: HTTP {status}, 메시지: non-JSON response body {raw_body[:200]!r}"
            ) from exc
        return result, status

    def execute(self, completion_request, endpoint):
        """Send the request and return ``(result, status)`` on HTTP 200.

        Raises:
            ValueError: For any non-200 status. The message carries the status
                code and, when the body is a dict, its ``status.message``.
        """
        res, status = self._send_request(completion_request, endpoint)
        if status == HTTPStatus.OK:
            return res, status

        if isinstance(res, dict):
            error_message = res.get("status", {}).get("message", "Unknown error")
        else:
            error_message = "Unknown error"
        raise ValueError(f"오류 발생: HTTP {status}, 메시지: {error_message}")
  
class SegmentationExecutor(CLOVAStudioExecutor):
    """Executor bound to the segmentation endpoint of the configured app."""

    def execute(self, completion_request):
        """Run segmentation and return the ``topicSeg`` value of the result.

        The app ID is read from ``NCP_CLOVASTUDIO_APP_ID_SEGMENTATION`` on
        every call.

        Raises:
            ValueError: If the parent executor rejects the response, or the
                body has no "result" key.
        """
        segmentation_app_id = os.environ["NCP_CLOVASTUDIO_APP_ID_SEGMENTATION"]
        res, status = super().execute(
            completion_request,
            f'/testapp/v1/api-tools/segmentation/{segmentation_app_id}',
        )

        # Guard clause: anything that is not a 200 carrying a "result" is an error.
        if status != HTTPStatus.OK or "result" not in res:
            if isinstance(res, dict):
                error_message = res.get("status", {}).get("message", "Unknown error")
            else:
                error_message = "Unknown error"
            raise ValueError(f"오류 발생: HTTP {status}, 메시지: {error_message}")

        return res["result"]["topicSeg"]
         
if __name__ == "__main__":
    # Fail fast when any required environment variable is unset or empty.
    required_env_vars = [
        "NCP_CLOVASTUDIO_API_KEY",
        "NCP_APIGW_API_KEY",
        "NCP_CLOVASTUDIO_APP_ID_SEGMENTATION"
    ]
    missing_vars = []
    for var in required_env_vars:
        if not os.environ.get(var):
            missing_vars.append(var)
    if missing_vars:
        raise ValueError(f"Missing required environment variables: {', '.join(missing_vars)}")

    segmentation_executor = SegmentationExecutor(host="clovastudio.apigw.ntruss.com")

    # Flat list of {"metadata": <source URL>, "page_content": <chunk text>} dicts.
    chunked_html = []

    for htmldata in tqdm(clovastudiodatas_flattened):
        try:
            # Segmentation parameters are sent as-is; key order matches the
            # original request payload.
            request_data = dict(
                postProcessMaxSize=100,
                alpha=-100,
                segCnt=-1,
                postProcessMinSize=-1,
                text=htmldata.page_content,
                postProcess=True,
            )

            response_data = segmentation_executor.execute(request_data)
            # Each segment is a sequence of strings; join it into one paragraph.
            result_data = list(map(' '.join, response_data))

            chunked_html.extend(
                {"metadata": htmldata.metadata["source"], "page_content": paragraph}
                for paragraph in result_data
            )

        except Exception as e:
            # Best effort: report the failure and move on to the next document.
            print(f"Error processing data from {htmldata.metadata['source']}: {e}")
            continue

    print(len(chunked_html))

100%|██████████| 1/1 [00:03<00:00,  3.77s/it]

113





In [27]:
# Spot-check a single chunk; index 40 requires at least 41 chunks to exist.
chunked_html[40]

{'metadata': 'https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1',
 'page_content': 'And then give details, describe context, or explain your position.'}

In [28]:
# Report how many chunks were produced.
print(f"\n총 청크 수: {len(chunked_html)}")

# Show the first three chunks: source, text and character count.
print("\n샘플 청크 (처음 3개):")
for position, sample in enumerate(chunked_html[:3], start=1):
    print(f"\n청크 {position}:")
    print(f"메타데이터: {sample['metadata']}")
    chunk_text = sample['page_content']
    print(f"내용: {chunk_text}")
    print(f"길이: {len(chunk_text)} 문자")



총 청크 수: 113

샘플 청크 (처음 3개):

청크 1:
메타데이터: https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1
내용: Open in app Sign in Write Sign in How we write rebuttals Devi Parikh Follow 10 min read May 27, 2020 --
길이: 103 문자

청크 2:
메타데이터: https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1
내용: By Devi Parikh, Dhruv Batra, Stefan Lee We frequently find ourselves giving the same advice to different students on how to write rebuttals.
길이: 140 문자

청크 3:
메타데이터: https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1
내용: So we thought we’d write it up. Our experience is with AI conferences (e.g., CVPR, ECCV, ICCV, NeurIPS, ICLR, EMNLP).The core guiding principle is that the rebuttal should be thorough, direct, and easy for the Reviewers and Area Chair (RACs) to follow.
길이: 252 문자


### 4. Embedding

In [36]:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence only the InsecureRequestWarning that urllib3 emits for unverified
# HTTPS requests.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# `verify` is a per-Session setting. The previous code assigned it on a
# throwaway `requests.Session()` that was discarded immediately, so nothing
# changed. Keep the configured session in a name so callers that really need
# unverified HTTPS can opt in explicitly via `insecure_session.get(...)` etc.
# WARNING: requests made through this session do not validate certificates.
insecure_session = requests.Session()
insecure_session.verify = False

In [None]:
# -*- coding: utf-8 -*-

class EmbeddingExecutor:
    """Minimal HTTPS client for the CLOVA Studio embedding endpoint."""

    # Path used when no ``endpoint`` is passed to the constructor.
    # NOTE(review): the last path segment looks like an app/deployment-specific
    # identifier baked into the code - confirm, and pass ``endpoint=`` when
    # targeting a different app.
    DEFAULT_ENDPOINT = '/testapp/v1/api-tools/embedding/v2/2c9ce8f0ca6843e49d98ff0ea46efdd2'

    def __init__(self, host, api_key, api_key_primary_val, request_id, endpoint=None):
        """Store connection settings.

        Args:
            host: Host name the HTTPS connection is opened to.
            api_key: Sent as the ``X-NCP-CLOVASTUDIO-API-KEY`` header.
            api_key_primary_val: Sent as the ``X-NCP-APIGW-API-KEY`` header.
            request_id: Sent as the ``X-NCP-CLOVASTUDIO-REQUEST-ID`` header.
            endpoint: Optional request path; defaults to ``DEFAULT_ENDPOINT``.
        """
        self._host = host
        self._api_key = api_key
        self._api_key_primary_val = api_key_primary_val
        self._request_id = request_id
        self._endpoint = endpoint or self.DEFAULT_ENDPOINT

    def _send_request(self, completion_request):
        """POST ``completion_request`` as JSON and return the decoded body.

        Raises:
            ValueError: If the response body is not valid UTF-8 JSON
                (``UnicodeDecodeError`` / ``json.JSONDecodeError``).
        """
        headers = {
            'Content-Type': 'application/json; charset=utf-8',
            'X-NCP-CLOVASTUDIO-API-KEY': self._api_key,
            'X-NCP-APIGW-API-KEY': self._api_key_primary_val,
            'X-NCP-CLOVASTUDIO-REQUEST-ID': self._request_id
        }

        conn = http.client.HTTPSConnection(self._host)
        try:
            conn.request('POST', self._endpoint, json.dumps(completion_request), headers)
            response = conn.getresponse()
            result = json.loads(response.read().decode(encoding='utf-8'))
        finally:
            # Release the socket even when the request, read or decode raises.
            conn.close()
        return result

    def execute(self, completion_request):
        """Return the embedding for the request, or the string ``'Error'``.

        The embedding is returned only when the response's ``status.code`` is
        ``'20000'``. Any other code yields the sentinel string ``'Error'``, so
        callers must check the return type before using the value as a vector.
        """
        res = self._send_request(completion_request)
        if res['status']['code'] == '20000':
            return res['result']['embedding']
        return 'Error'


In [60]:
# Build the executor from the credentials collected in the setup cells.
# NOTE(review): request_id is populated from the *App ID* environment
# variable - confirm this is intended rather than a per-request identifier.
embedding_executor = EmbeddingExecutor(
    host='clovastudio.apigw.ntruss.com',
    api_key=os.environ["NCP_CLOVASTUDIO_API_KEY"],
    api_key_primary_val=os.environ["NCP_APIGW_API_KEY"],
    request_id=os.environ["NCP_CLOVASTUDIO_APP_ID"],
)

# Smoke test: embed a fixed string, then show the request, the response and
# the response length.
request_data = {"text": "input text"}

response_text = embedding_executor.execute(request_data)
print(request_data, response_text, len(response_text), sep="\n")

{'text': 'input text'}
[-0.9326172, 0.69628906, -0.515625, 0.12683105, -0.29736328, -0.55615234, 0.2388916, 0.15234375, 0.40039062, -0.68359375, 0.29541016, -0.047424316, 0.36669922, -0.81689453, -0.2619629, -0.34375, -0.16601562, 0.36743164, 0.17993164, -0.2705078, -0.80078125, -0.24499512, -0.08093262, 0.34033203, 0.47387695, 0.9682617, 0.2866211, -0.42407227, -0.58154297, -0.20178223, 0.87646484, 0.0181427, -0.7939453, -2.0625, 0.31201172, -0.11657715, -0.12573242, -0.3149414, -1.5751953, 0.27807617, 0.087524414, -0.59716797, 0.55615234, -0.022842407, -0.27807617, -0.081970215, 1.0087891, -0.39575195, -0.7050781, -0.5683594, 0.054107666, 0.52685547, 1.8037109, -0.1784668, -0.31225586, -0.5307617, -0.2388916, 0.16149902, -1.6572266, -0.07635498, -0.2788086, -0.54296875, -0.48706055, -0.27929688, -0.21374512, 1.7119141, 0.91308594, 0.13464355, -0.703125, -0.67089844, -0.33496094, 0.7866211, -0.3413086, -0.08496094, -1.2548828, 1.1201172, 0.6401367, -0.40893555, -0.41479492, 0.6435547,

In [None]:
# "ssl" variant (author's label): embed a sample query through the LangChain
# ClovaXEmbeddings client.
from langchain_community.embeddings import ClovaXEmbeddings
clovax_embeddings = ClovaXEmbeddings(model='bge-m3') # Choose the embedding model to use

text = "클로바 스튜디오"
clovax_embeddings.embed_query(text)

In [59]:
# "non-ssl" variant (author's label): embed the same sample text through the
# hand-written EmbeddingExecutor instead of the LangChain client.
request_data = text_json = {"text": text}

response_text = embedding_executor.execute(request_data)
print(request_data, response_text, sep="\n")

{'text': '클로바 스튜디오'}
[-0.6166992, -0.4165039, -2.1074219, 0.8017578, -0.49853516, 1.3476562, -1.3261719, 0.06732178, 0.0234375, -0.50341797, 0.55322266, -0.27612305, 0.5678711, -0.78564453, 0.6796875, 0.038146973, -0.8486328, -0.6035156, 0.51123047, 0.07965088, -0.6401367, -0.63671875, 0.11090088, -0.5493164, 1.6416016, -0.26098633, 1.0253906, -0.40893555, 0.33154297, -0.8769531, -0.6777344, -0.57373047, -0.020431519, -1.1015625, 0.5830078, -1.9179688, 0.5029297, 0.86621094, -2.4472656, 0.23706055, 0.22705078, 2.2851562, 0.6088867, -0.3737793, 0.19958496, -0.0044174194, -0.36547852, -0.44262695, -0.78222656, -0.40551758, 0.5419922, 0.56884766, 0.5102539, 0.5102539, -0.2800293, -0.22912598, 0.0703125, -1.3759766, -1.7197266, 0.5546875, -0.99902344, 0.6660156, -0.031021118, 0.21166992, 0.7011719, 0.91503906, -0.5629883, -0.14025879, -0.24206543, 0.4267578, -0.77441406, -0.3701172, 0.1204834, -0.17248535, -1.1767578, 0.73876953, -0.5522461, -1.0126953, -0.12890625, 1.8720703, 1.1689453, -

In [61]:
from langchain_core.documents import Document

# Wrap every chunk dict in a Document: the chunk's source URL goes under
# metadata["source"] and each Document gets a fresh UUID4 string as its id.
documents = [
    Document(
        page_content=str(chunk['page_content']),
        metadata={"source": chunk['metadata']},
        id=str(uuid4()),
    )
    for chunk in chunked_html
]

In [63]:
# Sanity-check the structure of every Document (content preview is capped at
# the first 100 characters).
for position, document in enumerate(documents):
    report_lines = [
        f"Document {position}:",
        f"  Page Content: {document.page_content[:100]}...",
        f"  Metadata: {document.metadata}",
        f"  Page Content Type: {type(document.page_content)}",
        f"  Metadata Type: {type(document.metadata)}",
        "-" * 40,
    ]
    print("\n".join(report_lines))

Document 0:
  Page Content: Open in app Sign in Write Sign in How we write rebuttals Devi Parikh Follow 10 min read May 27, 2020...
  Metadata: {'source': 'https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1'}
  Page Content Type: <class 'str'>
  Metadata Type: <class 'dict'>
----------------------------------------
Document 1:
  Page Content: By Devi Parikh, Dhruv Batra, Stefan Lee We frequently find ourselves giving the same advice to diffe...
  Metadata: {'source': 'https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1'}
  Page Content Type: <class 'str'>
  Metadata Type: <class 'dict'>
----------------------------------------
Document 2:
  Page Content: So we thought we’d write it up. Our experience is with AI conferences (e.g., CVPR, ECCV, ICCV, NeurI...
  Metadata: {'source': 'https://deviparikh.medium.com/how-we-write-rebuttals-dc84742fece1'}
  Page Content Type: <class 'str'>
  Metadata Type: <class 'dict'>
----------------------------------------
Docu

### 5. VectorStore

In [None]:
import chromadb
from langchain_chroma import Chroma

# Persistent local Chroma client; data is stored under this path.
client = chromadb.PersistentClient(path="./chroma_langchain_db")

# Create (or reopen) the collection. Change the name whenever the document set
# changes. "hnsw:space" picks the distance metric - 'l2', 'ip' or 'cosine' -
# and should suit the embedding model in use.
chroma_collection = client.get_or_create_collection(
    name="clovastudiodatas_docs",
    metadata={"hnsw:space": "cosine"}
)

# LangChain wrapper over the same collection.
# NOTE(review): vectors below are produced by embedding_executor, while this
# wrapper is given clovax_embeddings - confirm both use the same model.
vectorstore = Chroma(
    client=client,
    collection_name="clovastudiodatas_docs",
    embedding_function=clovax_embeddings
)

failed_documents = []
for doc in tqdm(documents, desc="Adding documents", total=len(documents)):
    embeddings = embedding_executor.execute(dict(text=doc.page_content))

    if isinstance(embeddings, list):
        # Reuse the Document's own id and upsert, so re-running this cell
        # overwrites existing entries instead of adding duplicates under
        # fresh random ids.
        chroma_collection.upsert(
            ids=[getattr(doc, "id", None) or str(uuid.uuid4())],
            documents=[doc.page_content],
            embeddings=[embeddings],
            metadatas=[doc.metadata]
        )
    else:
        # execute() returns the string 'Error' on failure; never store that
        # sentinel as if it were a vector.
        failed_documents.append(doc)

    time.sleep(1.1)  # Delay of just over 1 second for rate limiting; adjust as needed

if failed_documents:
    print(f"{len(failed_documents)} of {len(documents)} documents could not be embedded and were skipped.")
else:
    print("All documents have been added to the vectorstore.")

In [98]:
def embedding_wrapper(doc):
    """Embed the text of the first item in ``doc`` via ``embedding_executor``.

    ``doc`` is indexed with ``[0]`` and that item's ``page_content`` is sent
    as ``{"text": ...}``; whatever ``execute`` returns is passed through.
    """
    first_document = doc[0]
    return embedding_executor.execute({"text": first_document.page_content})

In [None]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

# Embedding model used both for ingestion here and for later queries.
clovax_embeddings = ClovaXEmbeddings(model='bge-m3')

# In-memory FAISS store. The index dimension must equal the embedding size.
# NOTE(review): this is an inner-product index, whereas the Chroma collection
# is configured for cosine - confirm the vectors are normalized if the two
# stores are expected to rank results alike.
vectorstore_FAISS = FAISS(
    embedding_function=clovax_embeddings,
    index=faiss.IndexFlatIP(1024),
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

for doc in tqdm(documents, desc="Adding documents", total=len(documents)):
    embeddings = clovax_embeddings.embed_documents([doc.page_content])[0]
    # add_embeddings stores the precomputed vector as-is. add_documents would
    # ignore an `embeddings=` keyword and embed the text again, doubling the
    # number of rate-limited API calls.
    vectorstore_FAISS.add_embeddings(
        text_embeddings=[(doc.page_content, embeddings)],
        metadatas=[doc.metadata],
        ids=[getattr(doc, "id", None) or str(uuid.uuid4())],
    )
    time.sleep(1.4)  # Delay of over 1 second for rate limiting; adjust as needed

print("All documents have been added to the vectorstore.")

Adding documents:   0%|          | 0/113 [00:00<?, ?it/s]

Adding documents:   0%|          | 0/113 [00:00<?, ?it/s]


ConnectError: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self-signed certificate in certificate chain (_ssl.c:1129)