# Env Setting
- Python Virtual Env
  - .mvp_rag_db_env

In [None]:
# Mount Google Drive at /content/drive for this session; the paths used
# further down in this notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Creating a virtual env just once
# %cd /content/drive/MyDrive
# !apt install python3.10-venv
# !python3 -m venv .mvp_rag_db_env

In [None]:
# Activate the env
# NOTE(review): a `!` line runs its command in a separate shell process, so this
# activation presumably does not carry over to later cells or to the kernel —
# confirm before relying on the venv being active.
!source /content/drive/MyDrive/.mvp_rag_db_env/bin/activate

# Package Installation
- torch
- transformers
- faiss-cpu
- numpy
- pandas
- langchain
- sqlalchemy

In [None]:
!pip install torch transformers faiss-cpu numpy pandas langchain sqlalchemy



# JSON Prep
1. Load Json
  - `json_dir(dir_path)`: Creates a list of file paths to the JSON files in a given directory
  - `json_load(file_path)`: Loads and returns the JSON content of the given file path

In [None]:
# Load Json
import json
import os

def json_dir(dir_path):
  """Return the paths of the JSON files located directly inside ``dir_path``.

  Only regular files whose name ends in ``.json`` (case-insensitive) are
  returned, so sub-directories and unrelated files in the folder are never
  handed to the JSON loader. The result is sorted by file name so that the
  processing order is the same on every run.

  Args:
    dir_path: directory to scan (not recursive).

  Returns:
    A list of ``os.path.join(dir_path, name)`` strings; empty when the
    directory holds no JSON files.

  Raises:
    OSError: propagated from ``os.listdir`` when ``dir_path`` cannot be listed.
  """
  json_files_path = []
  for name in sorted(os.listdir(dir_path)):
    path = os.path.join(dir_path, name)
    # isfile() rejects directories that merely happen to be named "*.json".
    if name.lower().endswith('.json') and os.path.isfile(path):
      json_files_path.append(path)
  return json_files_path


def json_load(file_path):
  """Load and return the parsed JSON content of ``file_path``.

  The file is always decoded as UTF-8 instead of the platform default
  encoding, so non-ASCII text is read the same way on every machine.

  Args:
    file_path: path of the JSON file to read.

  Returns:
    The object produced by ``json.load``, or ``None`` (after printing a
    message) when ``file_path`` is not an existing regular file.

  Raises:
    json.JSONDecodeError: when the file exists but is not valid JSON.
  """
  # isfile() also covers the case where the path exists but is a directory,
  # which open() could not read anyway.
  if not os.path.isfile(file_path):
    print(f"Can't find {file_path}")
    return None
  with open(file_path, 'r', encoding='utf-8') as f:
    return json.load(f)

# Tokenizing & Embedding: KoBert
1. Load KoBert: Embedding Model and Tokenizer
2. Json to Vectors
  - Tokenization: json to tokenized text
    - `json_tokenization(data, tokenizer)`: Creates a list of tokenized inputs from a list of JSON records
  - Embeddings Generation: tokenized json to vector
    - `tokens_to_vectors(inputs, model)`: Creates a list of embeddings from a list of tokenized inputs

In [None]:
# Install the KoBERT tokenizer package (kobert_tokenizer) from the kobert_hf
# subdirectory of the SKTBrain/KoBERT GitHub repository.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-fyz5e92s/kobert-tokenizer_798163efbc9f410c8e0d237e3108dda3
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-fyz5e92s/kobert-tokenizer_798163efbc9f410c8e0d237e3108dda3
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [None]:
# Load KoBert: Embedding Model and Tokenizer
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

import torch
import numpy as np


# Tokenizer plus two separately loaded copies of the 'skt/kobert-base-v1'
# checkpoint: `model` is moved to the GPU ('cuda'), while `model_cpu` is left
# wherever from_pretrained() places it (no .to() call).
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
model = BertModel.from_pretrained('skt/kobert-base-v1').to('cuda')
model_cpu = BertModel.from_pretrained('skt/kobert-base-v1')


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [None]:
# Json to Vectors
# Tokenization: json to tokenized text
def json_tokenization(data, tokenizer, device='cuda'):
  """Tokenize every record of ``data`` and place the tensors on ``device``.

  Each record is flattened into a single string: the fields ``ad_gu``,
  ``ad_dong``, ``address``, ``location``, ``description``, ``rating``,
  ``share_link``, the space-joined ``reviews`` and ``info`` are concatenated
  with `` [SEP] `` between them, and newlines are replaced by spaces.

  Args:
    data: iterable of dict-like records containing the keys listed above;
      ``reviews`` must be an iterable of strings (it is joined with spaces).
    tokenizer: callable invoked as ``tokenizer(text, return_tensors='pt',
      padding='max_length', truncation=True, max_length=512)`` that returns a
      mapping of tensors.
    device: device the tokenized tensors are moved to. Defaults to ``'cuda'``
      (the previous hard-coded behavior); pass ``'cpu'`` on a machine
      without a GPU.

  Returns:
    A list with one ``{name: tensor}`` dict per record, in input order.

  Raises:
    KeyError: when a record lacks one of the expected fields.
  """
  inputs = []
  for row in data:
    text = f"{row['ad_gu']} [SEP] {row['ad_dong']} [SEP] {row['address']} [SEP] {row['location']} [SEP] " \
           f"{row['description']} [SEP] {row['rating']} [SEP] {row['share_link']} [SEP] " \
           f"{' '.join(row['reviews'])} [SEP] {row['info']}"
    text_clean = text.replace('\n', ' ')
    tokenized_input = tokenizer(text_clean, return_tensors='pt', padding='max_length', truncation=True, max_length=512)
    # Move every tensor of this record to the requested device.
    inputs.append({key: value.to(device) for key, value in tokenized_input.items()})
  return inputs

def json_tokenization_cpu(data, tokenizer):
  """Tokenize every record of ``data`` without moving anything to a GPU.

  Each record is flattened into one `` [SEP] ``-separated string built from
  its ``ad_gu``, ``ad_dong``, ``address``, ``location``, ``description``,
  ``rating``, ``share_link``, space-joined ``reviews`` and ``info`` fields;
  newlines are replaced by spaces before the text is handed to ``tokenizer``.

  Returns:
    A list holding the raw tokenizer output for each record, in input order.
  """
  def _encode(record):
    # Flatten the record, then tokenize it padded/truncated to 512 tokens.
    flattened = (
        f"{record['ad_gu']} [SEP] {record['ad_dong']} [SEP] {record['address']} [SEP] {record['location']} [SEP] "
        f"{record['description']} [SEP] {record['rating']} [SEP] {record['share_link']} [SEP] "
        f"{' '.join(record['reviews'])} [SEP] {record['info']}"
    )
    single_line = flattened.replace('\n', ' ')
    return tokenizer(single_line, return_tensors='pt', padding='max_length', truncation=True, max_length=512)

  return [_encode(record) for record in data]

# Embeddings Generation: tokenized json to vector
def tokens_to_vectors(inputs, model, device='cuda'):
  """Run ``model`` over each tokenized input and return one vector per input.

  The vector is the model's ``last_hidden_state`` at sequence position 0
  (the first token of the input), converted to a NumPy array.

  Args:
    inputs: iterable of mappings ``{name: tensor}`` that are passed to the
      model as keyword arguments.
    model: model exposing ``.to(device)`` and returning an object with a
      ``last_hidden_state`` tensor when called. It is moved to ``device``
      as a side effect.
    device: device used for the forward pass. Defaults to ``'cuda'`` (the
      previous hard-coded behavior); pass ``'cpu'`` to run without a GPU.

  Returns:
    A list of NumPy arrays, one per input, in input order.
  """
  model.to(device)
  embeddings = []
  # Inference only: a single no_grad context covers the whole loop.
  with torch.no_grad():
    for tokenized in inputs:
      # Put the tensors on the same device as the model so that inputs
      # tokenized on another device do not cause a device-mismatch error.
      tokenized = {key: value.to(device) for key, value in tokenized.items()}
      outputs = model(**tokenized)
      first_token_state = outputs.last_hidden_state[:, 0, :]
      # squeeze(0) drops only the leading batch axis; a bare squeeze() would
      # also collapse any other axis that happens to have size 1.
      embeddings.append(first_token_state.squeeze(0).cpu().numpy())
  return embeddings

def tokens_to_vectors_cpu(inputs, model):
  """Embed each tokenized input with ``model`` without any device transfer.

  For every input the model is called with the input's entries as keyword
  arguments; the ``last_hidden_state`` at sequence position 0 is squeezed and
  converted to a NumPy array. The model is used on whatever device it is
  already on.

  Returns:
    A list of NumPy arrays, one per input, in input order.
  """
  # Gradients are never needed here, so the whole pass runs under no_grad.
  with torch.no_grad():
    return [
        model(**tokenized).last_hidden_state[:, 0, :].squeeze().numpy()
        for tokenized in inputs
    ]

# FAISS for Vector Search
- FAISS creating an index for vectors
  - `faiss_index_gen(embeddings, index_dir_path)`: Creates an index file from the given embeddings
- RDBMS metadata -> not now

In [None]:
import faiss

def faiss_index_gen(embeddings, index_dir_path):
  """Build a flat L2 FAISS index from ``embeddings`` and write it to disk.

  Args:
    embeddings: non-empty sequence of equal-length 1-D vectors (for example
      the list returned by the embedding step). It is converted to a single
      2-D float32 array before being added to the index.
    index_dir_path: path handed unchanged to ``faiss.write_index``, i.e. the
      path of the index file to create (despite the ``dir`` in the name).

  Raises:
    ValueError: when ``embeddings`` is empty or does not form a 2-D array,
      instead of failing later with an opaque IndexError/TypeError.
  """
  embeddings_array = np.asarray(embeddings, dtype='float32')
  # Validate before touching FAISS so that bad input produces a clear message.
  if embeddings_array.ndim != 2 or embeddings_array.shape[0] == 0:
    raise ValueError(
        "embeddings must be a non-empty sequence of equal-length vectors, "
        f"got array of shape {embeddings_array.shape}"
    )
  embedding_dim = embeddings_array.shape[1]
  index = faiss.IndexFlatL2(embedding_dim)
  index.add(embeddings_array)
  faiss.write_index(index, index_dir_path)




# Json to Vector DB
- `json_dir(dir_path)`: Creates a list of file paths to the JSON files in a given directory
- `json_load(file_path)`: Loads and returns the JSON content of the given file path
- `json_tokenization(data, tokenizer)`: Creates a list of tokenized inputs from a list of JSON records
- `tokens_to_vectors(inputs, model)`: Creates a list of embeddings from a list of tokenized inputs
- `faiss_index_gen(embeddings, index_dir_path)`

In [None]:
# Variables
## json_dir, json_load
# Input JSON files are read from HA_RAG_DATA; one FAISS index file per input
# file is written to HA_RAG_INDEX.
JSON_DIR_PATH = '/content/drive/MyDrive/JSON'
HA_RAG_DATA_DIR_PATH = os.path.join(JSON_DIR_PATH, 'HA_RAG_DATA')
HA_RAG_INDEX_DIR_PATH = os.path.join(JSON_DIR_PATH, 'HA_RAG_INDEX')

# exist_ok=True creates the directory when it is missing and is a no-op when
# it already exists, without a separate check-then-create step.
os.makedirs(HA_RAG_DATA_DIR_PATH, exist_ok=True)
os.makedirs(HA_RAG_INDEX_DIR_PATH, exist_ok=True)

json_files_path = json_dir(HA_RAG_DATA_DIR_PATH)

## json_tokenizations
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

import torch
import numpy as np


tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# No .to() here: tokens_to_vectors() moves the model to its target device
# before running it.
model = BertModel.from_pretrained('skt/kobert-base-v1')

# Kept defined up front so both names exist even when there is nothing to process.
inputs = None
embeddings = None
for json_file_path in json_files_path:
  print(f"Processing {json_file_path}")
  current_data = json_load(json_file_path)
  # json_load() returns None when the file cannot be found, and an empty
  # record list would leave nothing to index; skip both instead of crashing.
  if not current_data:
    print(f"Skipping {json_file_path}: no records loaded")
    continue
  print(f"Data length: {len(current_data)}")
  print(f"Tokenizing.....")
  inputs = json_tokenization(current_data, tokenizer)
  print(f"Embedding.....")
  embeddings = tokens_to_vectors(inputs, model)
  print("Saving......")
  # splitext() strips only the final extension, so "a.v1.json" and "a.v2.json"
  # get distinct index names instead of both collapsing to "a.index".
  index_file_name = os.path.splitext(os.path.basename(json_file_path))[0] + ".index"
  faiss_index_gen(embeddings, os.path.join(HA_RAG_INDEX_DIR_PATH, index_file_name))
  print("Done")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


Processing /content/drive/MyDrive/JSON/HA_RAG_DATA/SeongBuk_common.json
Data length: 990
Tokenizing.....
Embedding.....
Saving......
Done


# Testing with LangChain

TypeError: FAISS.__init__() missing 2 required positional arguments: 'docstore' and 'index_to_docstore_id'