# Environment Setup
- Python virtual environment
  - `.mvp_rag_db_env`

In [None]:
# Mount Google Drive at /content/drive for this session
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [7]:
# Creating a virtual env just once
# One-time setup, left commented out: uncomment and run the three lines below
# only when the env directory `.mvp_rag_db_env` has to be (re)created under
# /content/drive/MyDrive.
# %cd /content/drive/MyDrive
# !apt install python3.10-venv
# !python3 -m venv .mvp_rag_db_env

In [44]:
# Activate the env
# NOTE(review): a `!` line is executed in its own temporary subshell, so the
# effect of `source .../activate` ends when that subshell exits; it does not
# switch the notebook kernel's interpreter or the environment that later
# imports use. TODO confirm whether this env is actually needed in this notebook.
!source /content/drive/MyDrive/.mvp_rag_db_env/bin/activate

# Package Installation
- torch
- transformers
- faiss-cpu
- numpy
- pandas
- langchain
- sqlalchemy

In [None]:
!pip install torch transformers faiss-cpu numpy pandas langchain sqlalchemy

# JSON Preparation
1. Load JSON

In [46]:
# Load Json
# Picks the first JSON file (by sorted name) in HA_RAG_DATA and parses it into
# `current_data`. `current_data` stays None when no JSON file is available.
import json
import os

JSON_DIR_PATH = '/content/drive/MyDrive/JSON'
HA_RAG_DATA_DIR_PATH = os.path.join(JSON_DIR_PATH, 'HA_RAG_DATA')

# os.listdir returns entries in arbitrary order and also lists sub-directories
# and non-JSON files, so keep only regular *.json files and sort them to make
# "the first file" deterministic between runs.
json_files = sorted(
    name
    for name in os.listdir(HA_RAG_DATA_DIR_PATH)
    if name.lower().endswith('.json')
    and os.path.isfile(os.path.join(HA_RAG_DATA_DIR_PATH, name))
)

current_json_file = None
current_data = None
if json_files:
    current_json_file = os.path.join(HA_RAG_DATA_DIR_PATH, json_files[0])
    # Explicit UTF-8 so decoding does not depend on the runtime's locale default.
    with open(current_json_file, 'r', encoding='utf-8') as f:
        current_data = json.load(f)
else:
    # An empty directory is reported here instead of failing with IndexError.
    print(f"Can't find any .json file in {HA_RAG_DATA_DIR_PATH}")

# Tokenizing & Embedding: KoBERT
1. Load KoBERT: embedding model and tokenizer
2. JSON to vectors
  - Tokenization: JSON record to tokenized text
  - Embedding generation: tokenized text to vector

In [None]:
# KoBert Tokenization Installed
# The command below installs the `kobert_tokenizer` package from the SKTBrain
# KoBERT repository (subdirectory `kobert_hf`). It is left commented out.
# NOTE(review): the next cell imports `kobert_tokenizer`, so on a runtime where
# the package is not yet installed this line has to be uncommented and run first.
# !pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

In [None]:
# Load KoBert: Embedding Model and Tokenizer
import numpy as np
import torch
from transformers import BertModel

from kobert_tokenizer import KoBERTTokenizer

# The tokenizer and the encoder are loaded from the same checkpoint name.
KOBERT_CHECKPOINT = 'skt/kobert-base-v1'

tokenizer = KoBERTTokenizer.from_pretrained(KOBERT_CHECKPOINT)
model = BertModel.from_pretrained(KOBERT_CHECKPOINT)

In [63]:
# Json to Vectors
# Tokenization: json to tokenized text
# Record keys that go into the embedded text, in concatenation order.
fields = [
    "ad_gu",
    "ad_dong",
    "address",
    "location",
    "description",
    "rating",
    "share_link",
    "reviews",
    "info"
]
# Only the first record of the loaded JSON is embedded in this cell.
current_data_meta = current_data[0]

# Build a single " [SEP] "-delimited string by walking `fields`, so the list
# above is the one place that controls which keys are used. `reviews` is joined
# with spaces first (it is handled as a sequence of strings); every other value
# is rendered with str().
text = ' [SEP] '.join(
    ' '.join(current_data_meta[field]) if field == 'reviews' else str(current_data_meta[field])
    for field in fields
)

# Replace newlines with spaces so the record is one line of text.
text_clean = text.replace('\n', ' ')

# Tokenize the newline-free text; the result is padded/truncated to 512 tokens
# and returned as PyTorch tensors.
inputs = tokenizer(text_clean, return_tensors='pt', padding='max_length', truncation=True, max_length=512)


# Embeddings Generation: tokenized json to vector
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
  outputs = model(**inputs)
  # The hidden state at sequence position 0 is used as the vector for the text.
  embedding = outputs.last_hidden_state[:, 0, :]
  # Drop size-1 dimensions and convert the tensor to a NumPy array.
  embedding = embedding.squeeze().numpy()


# FAISS for Vector Search
- Build a FAISS index over the embedding vectors
- Storing metadata in an RDBMS is deferred (not implemented yet)

In [69]:
import faiss
import numpy as np

# Cast to float32 and make sure the array is 2-D (one vector per row) before
# it is handed to the index.
flatten_embedding = np.array(embedding, dtype='float32')
if flatten_embedding.ndim == 1:
    # A single vector becomes a one-row matrix.
    flatten_embedding = flatten_embedding[np.newaxis, :]

# L2 flat index sized to the vector dimensionality, then add the vector(s).
embedding_dim = flatten_embedding.shape[1]
embedding_index = faiss.IndexFlatL2(embedding_dim)
embedding_index.add(flatten_embedding)





# Testing with LangChain