## Colab Imports

In [1]:
# Colab-only helpers: `drive` mounts Google Drive; `userdata` is used later in
# this notebook to fetch the OpenAI API key.
from google.colab import drive
from google.colab import userdata
# Mount Google Drive so the /content/drive/MyDrive/... paths used below resolve.
drive.mount('/content/drive')

Mounted at /content/drive


## Installations

In [2]:
!pip install -qU langchain-community faiss-cpu langchain-openai tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.0/27.0 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m40.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.2/292.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## JSON Loading and Directory Path Setup

In [3]:
import os
import json

# Root folder on the mounted Google Drive that holds the JSON data.
JSON_DIR_PATH = '/content/drive/MyDrive/RAG_JSON_EMBEDDINGS_INDEX'
HA_RAG_DATA_PATH = os.path.join(JSON_DIR_PATH, "JSON/HA_RAG_DATA")
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race.
os.makedirs(HA_RAG_DATA_PATH, exist_ok=True)

# Full path of the input file; it already includes HA_RAG_DATA_PATH, so it is
# opened directly instead of being joined onto the directory a second time.
json_path = os.path.join(HA_RAG_DATA_PATH, "combined_common.json")

# Explicit UTF-8 so decoding does not depend on the runtime's locale default.
with open(json_path, "r", encoding="utf-8") as f:
    # `data` is iterated row by row in later cells.
    data = json.load(f)

In [4]:
# Index path (directory where the initialized index is saved)
DRIVE_PATH = '/content/drive/MyDrive/RAG_JSON_EMBEDDINGS_INDEX'
INDEX_DIR_PATH = os.path.join(DRIVE_PATH, "INDEX")
HA_INDEX_PATH = os.path.join(INDEX_DIR_PATH, "HA_TEST_INDEX")
# makedirs creates missing parents too (INDEX_DIR_PATH included), and
# exist_ok=True makes re-running the cell a no-op instead of an error.
os.makedirs(HA_INDEX_PATH, exist_ok=True)

# Target passed to vector_store.save_local() later in the notebook.
faiss_index_path = os.path.join(HA_INDEX_PATH, "test_faiss_index")

## Fewer Tokens: Moving Fields into Metadata

In [5]:
test_documents_meta = []
import uuid
from uuid import uuid4
from langchain.schema import Document

# Variant with metadata: ad_gu / ad_dong are kept out of the embedded text and
# attached to each Document as metadata instead.
for row in data:
  # Text: the remaining fields, in a fixed order, separated by " [SEP] "
  fields = (
      row['address'],
      row['location'],
      row['description'],
      row['rating'],
      row['share_link'],
      ' '.join(row['reviews']),
      row['info'],
  )
  text = " [SEP] ".join(str(field) for field in fields)
  # Metadata
  metadata = {"ad_gu": row['ad_gu'], "ad_dong": row['ad_dong']}
  # Flatten newlines so each document is a single line of text
  clean_text = text.replace("\n", " ")
  test_documents_meta.append(Document(page_content=clean_text, metadata=metadata))

# Only for Faiss -> comment it out for Pinecone
# One random UUID string per document, in document order.
uuids = [str(uuid4()) for _ in test_documents_meta]

In [6]:
# Variant without metadata: ad_gu and ad_dong are embedded in the text itself.
# (The misspelled variable name is kept because the token-count cell references it.)
test_docuemtns_no_meta = []
import uuid
from uuid import uuid4
from langchain.schema import Document

for row in data:
  # All fields, in a fixed order, separated by " [SEP] "
  parts = [
      row['ad_gu'],
      row['ad_dong'],
      row['address'],
      row['location'],
      row['description'],
      row['rating'],
      row['share_link'],
      ' '.join(row['reviews']),
      row['info'],
  ]
  text = " [SEP] ".join(map(str, parts))
  # Flatten newlines so each document is a single line of text
  clean_text = text.replace("\n", " ")
  test_docuemtns_no_meta.append(Document(page_content=clean_text))

# Only for Faiss -> comment it out for Pinecone
# Note: this rebinds `uuids` with a fresh list of the same length as the document list.
uuids = [str(uuid4()) for _ in test_docuemtns_no_meta]

In [8]:
import tiktoken
tokenizer = tiktoken.encoding_for_model("text-embedding-3-small")

# Average the token count over the first 100 documents of each variant.
# The sample is clamped to the number of available documents so a smaller
# dataset no longer raises IndexError, and the divisor always matches the
# number of documents actually counted.
sample_size = min(100, len(test_docuemtns_no_meta), len(test_documents_meta))
if sample_size == 0:
  raise ValueError("No documents available to estimate the average token count")

no_meta_avg_token_num = sum(
    len(tokenizer.encode(doc.page_content))
    for doc in test_docuemtns_no_meta[:sample_size]
) / sample_size
# meta_avg_token_num is reused by the batching cell to estimate tokens per batch.
meta_avg_token_num = sum(
    len(tokenizer.encode(doc.page_content))
    for doc in test_documents_meta[:sample_size]
) / sample_size

print(f"No Meta Avg Token Num: {no_meta_avg_token_num}")
print(f"Meta Avg Token Num: {meta_avg_token_num}")

No Meta Avg Token Num: 1057.13
Meta Avg Token Num: 1042.13


## Vector Store Init

In [22]:
# FAISS init
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# Embedding Model Selection
# The API key is read from the Colab secret named 'openAI' rather than hardcoded.
os.environ["OPENAI_API_KEY"] = userdata.get('openAI')
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Index Dimension
# Only the length of the returned vector is needed, so a short constant string
# is embedded instead of a full document: cheaper, and it no longer requires
# test_documents_meta to be non-empty.
index_cpu = faiss.IndexFlatL2(len(embeddings.embed_query("hello world")))

# Empty store: vectors go into index_cpu, documents into the in-memory docstore.
vector_store = FAISS(
    embedding_function=embeddings,
    index=index_cpu,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

In [24]:
# Index Dimension
# Reuse the dimension of the existing store's index instead of paying for
# another embedding request; this also guarantees both stores have matching
# dimensions for the later merge.
index_cpu = faiss.IndexFlatL2(vector_store.index.d)

# Second, independent empty store used to demonstrate merging.
new_vector_store = FAISS(
    embedding_function=embeddings,
    index=index_cpu,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

## Batch
- Splitting Documents into multiple batches
- Append new Documents into existing vector store

### Splitting Documents

In [25]:
# Number of documents sent to the embedding API per add_documents call.
batch_size = 700

# Start offsets of each slice; documents and ids are sliced with the same step
# so batches[k] and batches_id[k] line up position by position.
doc_offsets = range(0, len(test_documents_meta), batch_size)
id_offsets = range(0, len(uuids), batch_size)
batches = [test_documents_meta[start:start + batch_size] for start in doc_offsets]
batches_id = [uuids[start:start + batch_size] for start in id_offsets]

# Estimated token volume of one full batch, from the sampled per-document average.
expected_tokens_per_batch = batch_size * meta_avg_token_num

print(f"Number of documents: {len(test_documents_meta)}")
print(f"Batch Size(Number of documents in one batch): {batch_size}")
print("Expected TPM: %.2f" % expected_tokens_per_batch)
print(f"Number of batches: {len(batches)}")

Number of documents: 21164
Batch Size(Number of documents in one batch): 700
Expected TPM: 729491.00
Number of batches: 31


## Vector Store Merging

In [26]:
# Sanity check: print how many vectors each FAISS index holds before anything is added.
print(vector_store.index.ntotal)
print(new_vector_store.index.ntotal)

0
0


In [27]:
# Add the first batch to the main store under its pre-generated ids,
# then print the resulting index size.
vector_store.add_documents(documents=batches[0], ids=batches_id[0])
print(vector_store.index.ntotal)

700


In [28]:
# Append the second batch to the same store; the index size should grow accordingly.
vector_store.add_documents(documents=batches[1], ids=batches_id[1])
print(vector_store.index.ntotal)

1400


In [29]:
# Put the third batch into the separate store so there is something to merge
# into the main store in the next step.
new_vector_store.add_documents(documents=batches[2], ids=batches_id[2])
print(new_vector_store.index.ntotal)

700


In [30]:
# Merge new_vector_store into vector_store (vector_store is modified in place),
# then print the combined index size.
vector_store.merge_from(new_vector_store)
print(vector_store.index.ntotal)

2100


## Duplicate Check & Append

In [35]:
from time import sleep

# Texts already present in the store, used to skip documents embedded earlier.
# NOTE(review): `_dict` is a private attribute of the in-memory docstore; this
# may need revisiting if the docstore implementation changes.
existing_texts = {doc.page_content for doc in vector_store.docstore._dict.values()}

# Iterate over the batches
for i in range(len(batches)):
  print(f"Batch {i+1}/{len(batches)}")
  checked_batch = []
  checked_batch_id = []
  new_texts = set()  # texts accepted from this batch so far
  dup_count = 0
  # Batch duplication check with original dataset or saved data
  for j, doc in enumerate(batches[i]):
    content = doc.page_content
    if content in existing_texts or content in new_texts:
      dup_count += 1
      continue
    checked_batch.append(doc)
    checked_batch_id.append(batches_id[i][j])
    new_texts.add(content)
  print("Duplicate Found: {}".format(dup_count))
  print("New Embeddings: {}".format(len(checked_batch)))
  if not checked_batch:
    continue
  vector_store.add_documents(documents=checked_batch, ids=checked_batch_id)
  # Keep the lookup set in sync with the store so a text repeated in a later
  # batch is recognised as a duplicate instead of being embedded again.
  existing_texts.update(new_texts)
  print("Current Vector Store Size: ", vector_store.index.ntotal)
  # Pause between embedding calls to stay under the API rate limit.
  sleep(5)

Batch 1/31
Duplicate Found: 700
New Embeddings: 0
Batch 2/31
Duplicate Found: 700
New Embeddings: 0
Batch 3/31
Duplicate Found: 700
New Embeddings: 0
Batch 4/31
Duplicate Found: 0
New Embeddings: 700
Batch 5/31
Duplicate Found: 0
New Embeddings: 700
Batch 6/31
Duplicate Found: 0
New Embeddings: 700
Batch 7/31
Duplicate Found: 0
New Embeddings: 700
Batch 8/31
Duplicate Found: 0
New Embeddings: 700
Batch 9/31
Duplicate Found: 0
New Embeddings: 700
Batch 10/31
Duplicate Found: 0
New Embeddings: 700
Batch 11/31
Duplicate Found: 0
New Embeddings: 700
Batch 12/31
Duplicate Found: 0
New Embeddings: 700
Batch 13/31
Duplicate Found: 0
New Embeddings: 700
Batch 14/31
Duplicate Found: 0
New Embeddings: 700
Batch 15/31
Duplicate Found: 0
New Embeddings: 700
Batch 16/31
Duplicate Found: 0
New Embeddings: 700
Batch 17/31
Duplicate Found: 0
New Embeddings: 700
Batch 18/31
Duplicate Found: 0
New Embeddings: 700
Batch 19/31
Duplicate Found: 0
New Embeddings: 700
Batch 20/31
Duplicate Found: 0
New Embed

In [37]:
# Print the final index size, then persist the store to faiss_index_path on Drive.
print(vector_store.index.ntotal)
vector_store.save_local(faiss_index_path)

21164


## Final Optimized Vector Store Data Ingestion

In [None]:
from time import sleep

# Texts already present in the store, used to skip documents embedded earlier
# (this is what lets a re-run of the cell resume after a failure).
# NOTE(review): `_dict` is a private attribute of the in-memory docstore; this
# may need revisiting if the docstore implementation changes.
existing_texts = {doc.page_content for doc in vector_store.docstore._dict.values()}
failed_batches = []  # 1-based numbers of batches whose add_documents call raised

# Iterate over the batches
for i in range(len(batches)):
  print(f"Batch {i+1}/{len(batches)}")
  checked_batch = []
  checked_batch_id = []
  new_texts = set()  # texts accepted from this batch so far
  dup_count = 0
  # Batch duplication check with original dataset or saved data
  for j, doc in enumerate(batches[i]):
    content = doc.page_content
    if content in existing_texts or content in new_texts:
      dup_count += 1
      continue
    checked_batch.append(doc)
    checked_batch_id.append(batches_id[i][j])
    new_texts.add(content)
  print("Duplicate Found: {}".format(dup_count))
  print("New Embeddings: {}".format(len(checked_batch)))

  if not checked_batch:
    continue

  try:
    vector_store.add_documents(documents=checked_batch, ids=checked_batch_id)
  except Exception as e:
    # Best effort: checkpoint what has been embedded so far, record the failed
    # batch so it is not lost silently, and carry on with the next batch.
    vector_store.save_local(faiss_index_path)
    failed_batches.append(i + 1)
    print(e)
  else:
    # Only mark texts as stored once the add succeeded, so a failed batch is
    # picked up again on the next run.
    existing_texts.update(new_texts)
    print("Current Vector Store Size: ", vector_store.index.ntotal)
  # Pause after every attempt (including failures) so the next request does
  # not fire immediately after an error from the API.
  sleep(5)

# Persist the final state so a fully successful run is saved as well.
vector_store.save_local(faiss_index_path)
if failed_batches:
  print(f"Failed batches (re-run this cell to retry): {failed_batches}")