## Candidate Embedding 구하고 VectorDB에 인덱싱하기

### 1. candidate_model 가지고 item feature 를 candidate embedding으로 변환하기

#### 1-1. candidate_model 로드

In [29]:
import boto3

AWS_PROFILE = 'default'
BUCKET_NAME = 'jimin-model'
session = boto3.Session(profile_name=AWS_PROFILE)
s3_client = session.client('s3')

In [30]:
from utils.load_model import LoadModel
import tensorflow as tf

load_model = LoadModel(s3_client, BUCKET_NAME)
candidate_model = load_model.call("candidate_model/1")

#### 1-2. feature 테이블에서 가져온 데이터 가공해서 item dateset 준비

In [31]:
import mysql.connector as sql

import pandas as pd

host = 'ssm-develop.db.sinsang.market'
port = 3306
username = 'dealicious'
password = 'tlstkd12!@'
database_name = 'dealicious'

db_connection = sql.connect(host=host, database=database_name, user=username, password=password)

retrievals_df = pd.read_sql('SELECT * FROM rec_retrievals', con=db_connection)

retrievals_df.head(3)

  retrievals_df = pd.read_sql('SELECT * FROM rec_retrievals', con=db_connection)


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,month_sin,month_cos,age,garment_group_name,index_group_name
0,1537401600000000000,0095c9b47fc950788bb709201f024c5338838a27c59c02...,710390001,0.019051,1,-0.866025,-0.5,20.0,Skirts,Divided
1,1537401600000000000,0095c9b47fc950788bb709201f024c5338838a27c59c02...,633130019,0.016932,1,-0.866025,-0.5,20.0,Jersey Fancy,Divided
2,1537401600000000000,0095c9b47fc950788bb709201f024c5338838a27c59c02...,671057002,0.008458,1,-0.866025,-0.5,20.0,Jersey Fancy,Divided


In [32]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(retrievals_df, test_size=0.2, random_state=42)

In [33]:
train_df["article_id"] = train_df["article_id"].astype(str)
test_df["article_id"] = test_df["article_id"].astype(str)

In [34]:
input_signature = candidate_model.signatures["serving_default"].structured_input_signature[-1]
candidate_features = list(input_signature.keys())

# Get list of unique candidate items.
item_df = train_df[candidate_features]
item_df.drop_duplicates(subset="article_id", inplace=True)

item_ds = tf.data.Dataset.from_tensor_slices(
    {col: item_df[col] for col in item_df})


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  item_df.drop_duplicates(subset="article_id", inplace=True)


#### 1-3. candidate embedding 계산하기

In [35]:
# Compute embeddings for all candidate items.
candidate_embeddings = item_ds.batch(448).map(
    lambda x: (x["article_id"], candidate_model(x)))

### 2. vector db에 candidate embedding 인덱싱하기

In [52]:
from pymilvus import MilvusClient, DataType
client = MilvusClient(uri='http://milvus.dev.sinsang.market:19530')

schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=True,
)
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=16)
index_params = client.prepare_index_params()
index_params.add_index(
    field_name="vector",
    index_name="candidate_index", 
    index_type="HNSW",
    metric_type="IP",
    params={"M":48, "efConstruction": 256}
)
client.create_collection(collection_name="rec_candidate", schema=schema, index_params=index_params)

In [53]:
from pymilvus import Collection

data = []
for batch in candidate_embeddings:
    item_id_list, embedding_list = batch
    item_id_list = item_id_list.numpy().astype(int)
    embedding_list = embedding_list.numpy()

    for item_id, embedding in zip(item_id_list, embedding_list):
        data.append({
            "id": item_id,
            "vector": embedding
        })
    
    client.insert(collection_name="rec_candidate", data=data)

In [None]:
# import requests
# import json
# import weaviate

# # weaviate schdema name
# INDEX_NAME = 'ItemCandidate'
# weaviate_client = weaviate.Client("https://weaviate.dev.sinsang.market")

In [25]:
# # 스키마 목록
# response = weaviate_client.schema.get()
# classes = [c['class'] for c in response['classes']]
# print(classes)

# weaviate_client.query.aggregate('ItemCandidate').with_meta_count().do()

['Article', 'Dealidocsall5', 'ItemCandidate']


{'data': {'Aggregate': {'ItemCandidate': [{'meta': {'count': 6487}}]}}}

In [10]:
# # batch import embeddings
# def bulk_insert(item_id_list, embedding_list):
#     weaviate_client.batch.configure(batch_size=100)
#     with weaviate_client.batch as batch:
#         for item_id, embedding in zip(item_id_list, embedding_list):
#             properties = {
#                 "item_id": int(item_id)
#             }
#             batch.add_data_object(properties, INDEX_NAME, vector=embedding.tolist())
  

In [24]:
# for batch in candidate_embeddings:
#     item_id_list, embedding_list = batch
#     item_id_list = item_id_list.numpy().astype(int)
#     embedding_list = embedding_list.numpy()

#     bulk_insert(item_id_list, embedding_list)

### 3. 샘플 쿼리

In [2]:
import numpy as np

emb_dim = 16
embedding = np.random.rand(emb_dim)

In [69]:
res = client.search(
  collection_name="rec_candidate", 
  data=[embedding], 
  ann_fields="vector",
  limit=20,
  output_fields=["id"]
  )
print(json.dumps(res, indent=2))
print([item['id'] for item in res[0]])

[
  [
    {
      "id": 636809002,
      "distance": 22.911287307739258,
      "entity": {
        "id": 636809002
      }
    },
    {
      "id": 501619009,
      "distance": 22.480873107910156,
      "entity": {
        "id": 501619009
      }
    },
    {
      "id": 634037001,
      "distance": 22.37549591064453,
      "entity": {
        "id": 634037001
      }
    }
  ]
]
[636809002, 501619009, 634037001]


In [None]:
# k = 10
# nearVector = {
#     "vector": embedding
# }
# result = weaviate_client.query.get(
#         INDEX_NAME, ["item_id"]
#     ).with_near_vector(
#         nearVector
#     ).with_limit(k).with_additional("vector").do()


# print(json.dumps(result, indent=4))

In [22]:
# 스키마 삭제
# weaviate_client.schema.delete_class(INDEX_NAME)