## Colab Imports

In [None]:
# Colab helpers: `drive` mounts Google Drive, `userdata` reads the secrets
# that later cells use for API keys.
from google.colab import drive, userdata

# Mount Google Drive at /content/drive; later cells read and write paths under it.
drive.mount('/content/drive')

## Installation
- FAISS
- Pinecone

In [None]:
# FAISS
!pip install -qU langchain-community faiss-cpu langchain-openai

In [None]:
# Pinecone
!pip install langchain langchain-core langchain-community langchain-openai langchain-pinecone

## Documents Preparation
- Same for both FAISS and Pinecone
  - Except `uuids` generation

In [None]:
import os
import json

# Root folder on the mounted Drive and the sub-folder holding the source JSON.
JSON_DIR_PATH = '/content/drive/MyDrive/RAG_JSON_EMBEDDINGS_INDEX'
HA_RAG_DATA_PATH = os.path.join(JSON_DIR_PATH, "JSON/HA_RAG_DATA")
# exist_ok=True makes this cell safe to re-run.
os.makedirs(HA_RAG_DATA_PATH, exist_ok=True)

json_path = os.path.join(HA_RAG_DATA_PATH, "combined_common.json")

# `json_path` already includes HA_RAG_DATA_PATH, so it is opened directly
# rather than being joined with that folder a second time.
# The encoding is explicit because the records contain Korean text.
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# `Document` and `uuid4` are used below; import them here so the cell runs
# on a fresh kernel.
from uuid import uuid4

from langchain_core.documents import Document

# Build one Document per record in `data`: the fields are concatenated in a
# fixed order, separated by " [SEP] ", and newlines are flattened to spaces.
documents = []

for row in data:
    text = (
        f"{row['ad_gu']} [SEP] {row['ad_dong']} [SEP] {row['address']} [SEP] {row['location']} [SEP] "
        f"{row['description']} [SEP] {row['rating']} [SEP] {row['share_link']} [SEP] "
        # `reviews` is joined with spaces, so it must be an iterable of strings.
        f"{' '.join(row['reviews'])} [SEP] {row['info']}"
    )
    clean_text = text.replace("\n", " ")
    documents.append(Document(page_content=clean_text))

# Only for FAISS -> comment it out for Pinecone.
# One random UUID string per document, in the same order as `documents`.
uuids = [str(uuid4()) for _ in documents]

## Index Initialization and Saving
- FAISS
- Pinecone

In [None]:
# Embedding model shared by the FAISS and Pinecone sections.
from langchain_openai import OpenAIEmbeddings

EMBEDDING_MODEL = "text-embedding-3-large"

# Copy the Colab secret named 'openAI' into the OPENAI_API_KEY environment
# variable (presumably where OpenAIEmbeddings looks for the key - confirm).
os.environ["OPENAI_API_KEY"] = userdata.get('openAI')

embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

### FAISS Index Initialization and Saving

In [None]:
# Index path (directory in which the initialized index is saved).
# NOTE(review): `DRIVE_PATH` was never defined in this notebook, so this cell
# raised NameError. It is assumed to be the same Drive root as JSON_DIR_PATH
# (the folder that holds the JSON data) - confirm.
DRIVE_PATH = JSON_DIR_PATH
INDEX_DIR_PATH = os.path.join(DRIVE_PATH, "INDEX")
HA_INDEX_PATH = os.path.join(INDEX_DIR_PATH, "HA_INDEX")
# Create the full HA_INDEX folder (and its parents); exist_ok=True makes the
# cell safe to re-run.
os.makedirs(HA_INDEX_PATH, exist_ok=True)

faiss_index_path = os.path.join(HA_INDEX_PATH, "combined_faiss_index")

In [None]:
# FAISS init
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

# Embed the first document once; the length of the returned vector is the
# dimension passed to the flat L2 index.
sample_vector = embeddings.embed_query(documents[0].page_content)
index_cpu = faiss.IndexFlatL2(len(sample_vector))

# Empty vector store: no documents yet, so both the docstore and the
# index -> docstore-id mapping start empty.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index_cpu,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [None]:
# Add every prepared document to the FAISS store under its pre-generated UUID,
# then save the store to the Drive folder chosen above.
vector_store.add_documents(documents, ids=uuids)
vector_store.save_local(folder_path=faiss_index_path)

In [None]:
# Testing
# Smoke test: fetch the top 5 documents for a sample Korean query using
# MMR (maximal marginal relevance) search.
# The keyword was previously misspelled as `saerch_type`, so the intended
# "mmr" setting was never applied to the retriever.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 5})
retriever.invoke("성북구에서 맞선")

[Document(page_content='성북구 [SEP] 삼선동 [SEP] 서울 성북구 삼선교로10길 14 [SEP] 우리게임장2 [SEP] None [SEP] None [SEP] https://naver.me/GVAlH8bX [SEP] None [SEP] None'),
 Document(page_content='성북구 [SEP] 삼선동 [SEP] 서울 성북구 동소문로6길 22 [SEP] 기적을 만나는 방 [SEP] None [SEP] None [SEP] https://naver.me/GJTvCWfF [SEP]  [SEP] None'),
 Document(page_content='성북구 [SEP] 종암동 [SEP] 서울 성북구 종암동 2-1 [SEP] 우연찻집 [SEP] None [SEP] None [SEP] https://naver.me/xJia0I8a [SEP]  [SEP] None'),
 Document(page_content='성북구 [SEP] 성북동 [SEP] 서울 성북구 성북로23길 81 [SEP] 북정마을팔각정 [SEP] None [SEP] None [SEP] https://naver.me/5hgoVGNH [SEP] 좋아요 [SEP] None'),
 Document(page_content='성북구 [SEP] 장위1동 [SEP] 서울 성북구 장월로8가길 13 [SEP] 작은공간 [SEP] None [SEP] None [SEP] https://naver.me/x4FioYKA [SEP] None [SEP] None')]

### Pinecone Index Initialization

In [None]:
# Pinecone settings: API key, target index name, and client object.
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

# Name of the Pinecone index used by the upload cell.
index_name = 'ha-rag-index-test'

# Copy the Colab secret named 'pinecone-KEY' into the environment, then read
# it back for the explicit client constructor below.
os.environ["PINECONE_API_KEY"] = userdata.get('pinecone-KEY')
pinecone_api_key = os.environ["PINECONE_API_KEY"]

pinecone_object = Pinecone(api_key=pinecone_api_key)

In [None]:
# Embed the prepared documents with `embeddings` and store them in the
# Pinecone index named by `index_name`.
# NOTE(review): nothing in this notebook creates the index, so this appears
# to assume it already exists - confirm.
database = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=index_name,
)