## Colab Imports

In [2]:
# Colab-only helpers: `drive` mounts Google Drive, `userdata` is used further
# down to read the OpenAI API key.
from google.colab import drive, userdata

# Make Drive contents available under /content/drive for the path cells below.
drive.mount('/content/drive')

Mounted at /content/drive


## Installations

In [1]:
!pip install -qU langchain-community faiss-cpu langchain-openai tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m694.1 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m399.9/399.9 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.2/292.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## JSON loading and directory path setup

In [3]:
import os
import json

# Root folder on the mounted Drive, and the sub-folder holding the JSON data.
JSON_DIR_PATH = '/content/drive/MyDrive/RAG_JSON_EMBEDDINGS_INDEX'
HA_RAG_DATA_PATH = os.path.join(JSON_DIR_PATH, "JSON/HA_RAG_DATA")
# exist_ok=True replaces the separate existence check (no check-then-create
# race) and keeps the cell safe to re-run.
os.makedirs(HA_RAG_DATA_PATH, exist_ok=True)

json_path = os.path.join(HA_RAG_DATA_PATH, "combined_common.json")

# json_path is already a full path, so it is opened directly instead of being
# joined onto HA_RAG_DATA_PATH a second time. The encoding is stated explicitly
# so decoding does not depend on the platform default.
with open(json_path, "r", encoding="utf-8") as f:
    # Later cells iterate over `data` and index each item by field name.
    data = json.load(f)

In [4]:
# Index path (directory where the initialized index is saved)
DRIVE_PATH = '/content/drive/MyDrive/RAG_JSON_EMBEDDINGS_INDEX'
INDEX_DIR_PATH = os.path.join(DRIVE_PATH, "INDEX")
HA_INDEX_PATH = os.path.join(INDEX_DIR_PATH, "HA_TEST_INDEX")

# HA_INDEX_PATH is nested inside INDEX_DIR_PATH, so a single makedirs call
# creates both levels; exist_ok=True makes the cell safe to re-run.
os.makedirs(HA_INDEX_PATH, exist_ok=True)

faiss_index_path = os.path.join(HA_INDEX_PATH, "test_faiss_index")

## Fewer Tokens: Moving Fields into Metadata

In [28]:
import uuid
from uuid import uuid4
from langchain.schema import Document

# Variant with metadata: ad_gu / ad_dong are attached as Document metadata
# and left out of the text itself.
test_documents_meta = []

for row in data:
  # Text: the remaining fields, in a fixed order, separated by " [SEP] "
  text = " [SEP] ".join([
      f"{row['address']}",
      f"{row['location']}",
      f"{row['description']}",
      f"{row['rating']}",
      f"{row['share_link']}",
      ' '.join(row['reviews']),  # reviews are flattened into one string
      f"{row['info']}",
  ])
  # Metadata
  metadata = {"ad_gu": row['ad_gu'], "ad_dong": row['ad_dong']}
  # Newlines are replaced with spaces so each document is a single line.
  clean_text = text.replace("\n", " ")
  document = Document(page_content=clean_text, metadata=metadata)
  test_documents_meta.append(document)

# Only for Faiss -> comment it out for Pinecone
# One random UUID string per document.
uuids = [str(uuid4()) for _ in test_documents_meta]

In [29]:
import uuid
from uuid import uuid4
from langchain.schema import Document

# Variant without metadata: every field, including ad_gu / ad_dong, is packed
# into the document text and no metadata is attached.
test_documents_no_meta = []
# Backward-compatible alias: the original, misspelled name is still referenced
# by the token-count cell, so it is bound to the same list object.
test_docuemtns_no_meta = test_documents_no_meta

for row in data:
  # Fields are joined in a fixed order with " [SEP] " between them.
  text = (
      f"{row['ad_gu']} [SEP] {row['ad_dong']} [SEP] {row['address']} [SEP] {row['location']} [SEP] "
      f"{row['description']} [SEP] {row['rating']} [SEP] {row['share_link']} [SEP] "
      f"{' '.join(row['reviews'])} [SEP] {row['info']}"
  )
  # Newlines are replaced with spaces so each document is a single line.
  clean_text = text.replace("\n", " ")
  test_documents_no_meta.append(Document(page_content=clean_text))

# Only for Faiss -> comment it out for Pinecone
# NOTE: this rebinds `uuids`, which the metadata-variant cell also assigns.
uuids = [str(uuid4()) for _ in range(len(test_documents_no_meta))]

In [30]:
import tiktoken
tokenizer = tiktoken.encoding_for_model("text-embedding-3-large")

# Upper bound on how many documents (taken from the start of each list) are
# tokenized for the comparison.
TOKEN_SAMPLE_SIZE = 100
# Clamp to what is actually available so a dataset with fewer documents does
# not raise IndexError, and so the average divides by the number really counted.
sample_size = min(
    TOKEN_SAMPLE_SIZE,
    len(test_docuemtns_no_meta),
    len(test_documents_meta),
)

no_meta_avg_token_num = 0
meta_avg_token_num = 0

# Accumulate token counts for the same positions in both document variants.
for i in range(sample_size):
  no_meta_avg_token_num += len(tokenizer.encode(test_docuemtns_no_meta[i].page_content))
  meta_avg_token_num += len(tokenizer.encode(test_documents_meta[i].page_content))

# Skip the division when nothing was sampled; the averages then stay at 0.
if sample_size:
  no_meta_avg_token_num /= sample_size
  meta_avg_token_num /= sample_size
print(f"No Meta Avg Token Num: {no_meta_avg_token_num}")
print(f"Meta Avg Token Num: {meta_avg_token_num}")

No Meta Avg Token Num: 1057.13
Meta Avg Token Num: 1042.13


## Vector Store Init

In [31]:
# FAISS init
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Embedding Model Selection
# The API key is read from the Colab secret named 'openAI' and exported through
# the environment before the embeddings client is created.
os.environ["OPENAI_API_KEY"] = userdata.get('openAI')
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Index Dimension
# The length of the returned vector does not depend on the input text, so a
# short probe string is embedded instead of a full document. This keeps the
# extra API call small and lets the cell run even when no documents are built.
embedding_dim = len(embeddings.embed_query("hello world"))
index_cpu = faiss.IndexFlatL2(embedding_dim)

# Empty store: a flat L2 index, an in-memory docstore, and no id mappings yet.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index_cpu,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)