# Test 1

In this script, we calculate the similarity between the 'question' and 'context' embeddings returned by the Vertex AI Gecko Embedding API.

At first, I will use 'textembedding-gecko@001'.

In [None]:
# ! pip install google-cloud-aiplatform
# ! pip install torch
# ! pip install transformers

In [None]:
import json
import datasets
from datasets import list_datasets

# Ask the `datasets` library for the names of the datasets it can list.
# NOTE(review): `list_datasets` appears to be deprecated in recent `datasets`
# releases — confirm before upgrading the dependency.
all_datasets = list_datasets()


In [None]:
# Keep only the dataset names that contain 'quad'.
# The result is named `quad_datasets` rather than `list` so the built-in
# `list` type is not shadowed for the rest of the notebook session.
quad_datasets = [x for x in all_datasets if 'quad' in x]
print(quad_datasets)

# squad_kor_v2

In [None]:
from datasets import load_dataset

# Load the 'squad_kor_v2' dataset; the result is indexed by split name below.
squad_kor_v2 = load_dataset('squad_kor_v2')

In [None]:

# Take the validation split and convert it to a pandas DataFrame so rows can
# be sliced and columns accessed by name in the following cells.
validation_data = squad_kor_v2['validation']
df_valid = validation_data.to_pandas()

In [None]:
from google.auth import default

# Obtain default Google credentials with the cloud-platform scope.
# Per the original author's note, omitting the scopes results in an error.
# The second element of the returned tuple is not used here.
credentials, _ = default(scopes=["https://www.googleapis.com/auth/cloud-platform"])

In [None]:
import os

# Project identifier is read from the environment; os.getenv returns None
# when PROJECT_NUMBER is not set.
PROJECT_NUMBER = os.getenv("PROJECT_NUMBER")
# Region handed to vertexai.init in a later cell.
VERTEX_AI_LOCATION = "us-central1"


In [None]:
import vertexai

# Initialise the Vertex AI SDK with the project, region and credentials
# prepared in the previous cells.
vertexai.init(project=PROJECT_NUMBER, location=VERTEX_AI_LOCATION, credentials=credentials)
# NOTE(review): `parameters` is not referenced by any other visible cell;
# presumably left over from a text-generation example — confirm before removing.
parameters = {
    "temperature" : 0.2,
    "max_output_tokens" : 1024, 
    "top_p": 0.8, 
    "top_k" : 10
}

In [None]:
from vertexai.preview.language_models import TextEmbeddingModel

# Load the 'textembedding-gecko@001' embedding model used throughout Test 1.
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [None]:
# Restrict the experiment to the first 200 validation rows
# (presumably to limit the number of embedding API calls).
df_embedding = df_valid[:200]

In [None]:
# Display the 'question' column of the sampled rows.
df_embedding['question']

In [None]:
# Number of rows sent to get_embeddings per call.
# NOTE: "QUATA" is a misspelling of "QUOTA"; the name is kept because
# make_embeddings (defined later in this notebook) reads it.
QUATA_LIMIT = 5

context_result_list = []
# Holds the *question* embeddings despite its name; the name is kept because
# a later cell builds df_question_result from it.
answer_result_list = []

# Walk the frame in strides of QUATA_LIMIT so a trailing partial batch is
# embedded as well; floor-dividing the row count would silently drop any
# remainder rows when the row count is not a multiple of QUATA_LIMIT.
for batch_start in range(0, len(df_embedding), QUATA_LIMIT):
  df_target = df_embedding[batch_start:batch_start + QUATA_LIMIT]
  # Hand the API plain lists of strings instead of pandas Series.
  context_result_list.extend(model.get_embeddings(list(df_target['context'])))
  answer_result_list.extend(model.get_embeddings(list(df_target['question'])))

In [None]:
# Sanity check: number of context embeddings collected so far.
len(context_result_list)

In [None]:
import pandas as pd

# Build one DataFrame per embedding list; later cells read the 'values'
# column of each frame. `answer_result_list` holds the question embeddings.
df_context_result = pd.DataFrame(context_result_list)
df_question_result = pd.DataFrame(answer_result_list)



In [None]:
# Display the question embedding frame.
df_question_result

In [None]:
import numpy as np

# Entry [i, j] is the inner product of the i-th context embedding and the
# j-th question embedding, so the diagonal holds each row's own
# context/question pair.
num_rows = len(df_context_result)
inner_product_matrix = np.zeros((num_rows, num_rows))

if num_rows:
  # Convert every embedding to an array once, by position. The original
  # looked vectors up by index label and re-converted the Python lists on
  # every one of the num_rows * num_rows np.inner calls.
  context_vectors = [np.asarray(v, dtype=float) for v in df_context_result['values'].tolist()]
  question_vectors = [np.asarray(v, dtype=float) for v in df_question_result['values'].tolist()]
  for i in range(num_rows):
    for j in range(num_rows):
      inner_product_matrix[i, j] = np.inner(context_vectors[i], question_vectors[j])


In [None]:
# Display the context embedding vectors.
df_context_result['values']

In [None]:
# Display the question embedding vectors.
df_question_result['values']

In [None]:
# Display the context-by-question inner product matrix as a table.
pd.DataFrame(inner_product_matrix)

In [None]:
# print(df_embedding['context'][0]) # 2016년 하계 올림픽 배구 남자 선수 명단 - 위키백과, 우리 모두의 백과사전
# print(df_embedding['context'][195]) # 니콜라스 베르그루엔 - 위키백과, 우리 모두의 백과사전
# print(df_embedding['question'][1]) # 2016년 하계 올림픽 캐나다 남자 배구 대표팀 선수들 중, 타일러 샌더스는 2015-16년에 어느 팀 소속이었나?

In [None]:
# The dataset contains duplicated context records paired with different
# questions. Presumably for that reason, any score equal to a question's own
# diagonal score is skipped rather than counted as a "different context" score.

list_same_context_question_product = []
list_different_context_question_product = []

matrix_size = len(inner_product_matrix)
for col in range(matrix_size):
  # Diagonal entry: the question scored against the context from its own row.
  paired_score = inner_product_matrix[col, col]
  list_same_context_question_product.append(paired_score)
  for row in range(matrix_size):
    score = inner_product_matrix[row, col]
    if score != paired_score:
      list_different_context_question_product.append(score)


In [None]:
## Similarity of the matching context / question pairs (matrix diagonal):
## summary statistics.
pd.DataFrame(list_same_context_question_product).describe()

In [None]:
# Summary statistics of the scores collected for non-matching
# context / question pairs.
pd.DataFrame(list_different_context_question_product).describe()

## Test 2.

The newer versions of the Gecko Embedding API are tested here.

In [None]:

def make_embeddings(model, df_embedding, batch_size=None):
  """Embed the 'context' and 'question' columns of a DataFrame in batches.

  Args:
    model: Object exposing get_embeddings(texts), called once per column
      for every batch.
    df_embedding: DataFrame with 'context' and 'question' columns.
    batch_size: Number of rows sent per get_embeddings call. Defaults to
      the notebook-level QUATA_LIMIT when omitted.

  Returns:
    A (context_result_list, question_result_list) tuple holding the
    embeddings in row order.

  Raises:
    ValueError: If the effective batch size is not positive.
  """
  if batch_size is None:
    batch_size = QUATA_LIMIT
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  context_result_list = []
  question_result_list = []
  # Stride over the rows so a trailing partial batch is embedded too;
  # floor-dividing the row count would silently drop the remainder.
  for batch_start in range(0, len(df_embedding), batch_size):
    df_target = df_embedding[batch_start:batch_start + batch_size]
    # Hand the model plain lists of strings instead of pandas Series.
    context_result_list.extend(model.get_embeddings(list(df_target['context'])))
    question_result_list.extend(model.get_embeddings(list(df_target['question'])))
  return context_result_list, question_result_list
    

In [None]:
# Load the 'textembedding-gecko@latest' model for comparison with Test 1.
new_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@latest")

# Embed the same sampled rows with the new model.
context_result_list_latest, question_result_list_latest = make_embeddings(new_model, df_embedding)

# Author's timing note: about 300 s in total, roughly 1.5 s latency per call
# with batches of 5 texts.
# NOTE(review): reworded from "300 sec - 200 call 1.5 sec latency (per 5 batch)"
# — confirm this interpretation.

In [None]:
# Wrap the '@latest' embeddings in DataFrames, as was done for Test 1.
df_context_latest = pd.DataFrame(context_result_list_latest)
df_question_latest = pd.DataFrame(question_result_list_latest)

In [None]:
def make_inner_product(df_context_result, df_question_result):
  """Return the matrix of inner products between context and question vectors.

  Args:
    df_context_result: DataFrame whose 'values' column holds one embedding
      vector per row.
    df_question_result: DataFrame with the same layout for the questions.

  Returns:
    A float numpy array of shape
    (len(df_context_result), len(df_question_result)) in which entry [i, j]
    is the inner product of the i-th context vector and the j-th question
    vector, counted by row position.
  """
  num_contexts = len(df_context_result)
  num_questions = len(df_question_result)
  inner_product_matrix = np.zeros((num_contexts, num_questions))
  if num_contexts == 0 or num_questions == 0:
    # Nothing to compare; an empty frame may not even have a 'values' column.
    return inner_product_matrix

  # Read the vectors by position and convert each one exactly once. The
  # original looked them up by index label (failing on a non-default index)
  # and re-converted the Python lists on every np.inner call.
  context_vectors = [np.asarray(v, dtype=float) for v in df_context_result['values'].tolist()]
  question_vectors = [np.asarray(v, dtype=float) for v in df_question_result['values'].tolist()]
  for i, context_vector in enumerate(context_vectors):
    for j, question_vector in enumerate(question_vectors):
      inner_product_matrix[i, j] = np.inner(context_vector, question_vector)
  return inner_product_matrix

def make_diagonal_elements(inner_product_matrix):
  """Split a square similarity matrix into matching and non-matching scores.

  For every column (question) the diagonal entry is recorded as the matching
  score. Every entry in that column whose value differs from the diagonal
  entry is recorded as a non-matching score; entries equal in value to the
  diagonal one are skipped, whichever row they are in.

  Args:
    inner_product_matrix: 2-D array indexable as matrix[row, col], with at
      least as many columns as rows.

  Returns:
    A (matching_scores, non_matching_scores) tuple of lists.
  """
  size = len(inner_product_matrix)
  matching_scores = []
  non_matching_scores = []
  for col in range(size):
    own_score = inner_product_matrix[col, col]
    matching_scores.append(own_score)
    column_scores = (inner_product_matrix[row, col] for row in range(size))
    non_matching_scores.extend(
      score for score in column_scores if score != own_score
    )
  return matching_scores, non_matching_scores

In [None]:
# Score every context against every question for the '@latest' embeddings,
# then split the scores into matching (diagonal) and non-matching ones.
inner_product_matrix_latest = make_inner_product(df_context_latest, df_question_latest)
list_same_cq_sim_latest, list_dif_cq_sim_latest = make_diagonal_elements(inner_product_matrix_latest)

In [None]:
# Summary statistics of the matching-pair scores for '@latest'.
pd.DataFrame(list_same_cq_sim_latest).describe()

In [None]:
# Summary statistics of the non-matching-pair scores for '@latest'.
pd.DataFrame(list_dif_cq_sim_latest).describe()

In [None]:
# Load the 'textembedding-gecko-multilingual@latest' model.
new_multilang_model = TextEmbeddingModel.from_pretrained("textembedding-gecko-multilingual@latest")

# Embed the same sampled rows with the multilingual model.
context_result_list_multilang, question_result_list_multilang = make_embeddings(new_multilang_model, df_embedding)


In [None]:
# Wrap the multilingual embeddings in DataFrames, as in the earlier tests.
df_context_multilang = pd.DataFrame(context_result_list_multilang)
df_question_multilang = pd.DataFrame(question_result_list_multilang)

In [None]:
# Score every context against every question for the multilingual embeddings,
# then split the scores into matching (diagonal) and non-matching ones.
inner_product_matrix_multilang = make_inner_product(df_context_multilang, df_question_multilang)
list_same_cq_sim_multilang, list_dif_cq_sim_multilang = make_diagonal_elements(inner_product_matrix_multilang)

In [None]:
# Summary statistics of the matching-pair scores for the multilingual model.
pd.DataFrame(list_same_cq_sim_multilang).describe()

In [None]:
# Summary statistics of the non-matching-pair scores for the multilingual model.
pd.DataFrame(list_dif_cq_sim_multilang).describe()