In [None]:
import json
import datasets
from datasets import list_datasets



In [None]:
import psutil

def print_memory():
  """Print a snapshot of system memory usage.

  Reads one `psutil.virtual_memory()` sample and prints the total, available
  and used amounts (bytes divided by 1024**3, two decimals) followed by the
  usage percentage. Returns None.
  """
  memory = psutil.virtual_memory()

  # Format every line from the same sample, then emit them in order.
  report = (
    f"Total Memory: {memory.total / (1024**3):.2f} GB",
    f"Available Memory: {memory.available / (1024**3):.2f} GB",
    f"Used Memory: {memory.used / (1024**3):.2f} GB",
    f"Memory Usage Percentage: {memory.percent:.2f}%",
  )
  for line in report:
    print(line)

In [None]:
# Memory snapshot before anything is loaded.
print_memory()

# NOTE(review): `all_datasets` is not referenced again in the visible cells;
# presumably this call can be dropped - confirm before removing.
all_datasets = list_datasets()
# NOTE(review): mid-cell import; `datasets` is already imported in the first cell.
from datasets import load_dataset

# Load every split of the dataset, then keep only the validation split.
squad_kor_v2 = load_dataset('squad_kor_v2')

validation_data = squad_kor_v2['validation']
# Convert the validation split to a pandas DataFrame for the cells below.
df_valid = validation_data.to_pandas()

# Memory snapshot after loading, for comparison with the one above.
print_memory()

In [None]:
# Keep an independent copy of (at most) the first 100 rows. A plain slice can
# keep referencing the parent frame's underlying data, in which case the `del`
# statements below would not let that memory be reclaimed.
df_embedding = df_valid[:100].copy()
# Drop the large intermediates; the visible cells below only use df_embedding.
del df_valid
del squad_kor_v2
del validation_data


In [None]:
# Display the sampled rows (at most 100) that will be embedded.
df_embedding

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

# NOTE(review): this only constructs a torch.device object (shown as the cell
# output). The result is not assigned, and nothing in the visible cells moves a
# model or tensor onto it - confirm whether GPU execution was intended.
torch.device('cuda:0')

In [None]:
import gc


def tokenize_and_embedding(sentences, model_name='BM-K/KoSimCSE-roberta-multitask'):
  """Embed each sentence and return one vector per sentence.

  The model and tokenizer are loaded on every call and released before
  returning (presumably to keep resident memory low between batches).

  Args:
    sentences: sequence of strings to embed.
    model_name: checkpoint passed to `from_pretrained`
      (e.g. 'BM-K/KoSimCSE-bert-multitask' as an alternative).

  Returns:
    A list with one 1-D numpy array per input sentence: the first-token
    vector of the model's first output (presumably the [CLS] embedding of
    the last hidden state - confirm against the model card). An empty input
    yields an empty list without loading the model.
  """
  # Nothing to embed: skip the expensive model load entirely.
  if len(sentences) == 0:
    return []

  model = AutoModel.from_pretrained(model_name)
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  print('tokenzing...')
  inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
  print('embedding...')
  # Inference only: without no_grad() the forward pass records autograd state
  # for a backward pass that never happens, inflating peak memory.
  with torch.no_grad():
    embeddings, _ = model(**inputs, return_dict=False)
  # Keep only the first-token vector of every sequence. Tensor.numpy() shares
  # memory with the tensor, so copy() is needed for the full hidden-state
  # tensor to be freed once `embeddings` is deleted below.
  first_token_vectors = embeddings[:, 0].cpu().numpy().copy()
  result = list(first_token_vectors)
  del model
  del tokenizer
  del inputs
  del embeddings
  gc.collect()
  return result


In [None]:
import numpy as np

def cal_score(a, b):
  """Return the cosine similarity of vectors `a` and `b`.

  Computed as dot(a, b) divided by the product of the two Euclidean norms.
  There is no guard for zero-norm input; the division is left to numpy.
  """
  magnitude_product = np.linalg.norm(a) * np.linalg.norm(b)
  return np.dot(a, b) / magnitude_product



In [None]:
# Default number of rows embedded per tokenize_and_embedding call.
LOCAL_QUATA_LIMIT = 20

def make_embeddings_with_local(df_embedding, batch_size=LOCAL_QUATA_LIMIT):
  """Embed the 'context' and 'question' columns of `df_embedding` in batches.

  Args:
    df_embedding: DataFrame with 'context' and 'question' columns.
    batch_size: rows per batch; defaults to LOCAL_QUATA_LIMIT.

  Returns:
    (context_result_list, question_result_list): two lists holding one
    embedding per row, in row order.

  Raises:
    ValueError: if `batch_size` is not positive.
  """
  if batch_size <= 0:
    raise ValueError("batch_size must be a positive integer")

  context_result_list = []
  question_result_list = []
  # Step through the rows by start offset so a trailing partial batch is
  # processed too; `len // batch_size` would silently drop the remainder.
  for group_cnt, start in enumerate(range(0, len(df_embedding), batch_size)):
    print(group_cnt)
    print_memory()
    df_target = df_embedding[start:start + batch_size]
    context_result_list.extend(tokenize_and_embedding(df_target['context'].to_list()))
    question_result_list.extend(tokenize_and_embedding(df_target['question'].to_list()))
  return context_result_list, question_result_list

In [None]:
# Embed the context and question columns of the sampled rows.
context_result_list_bmk, question_result_list_bmk = make_embeddings_with_local(df_embedding)

In [None]:
import pandas as pd

#df_context_latest = pd.DataFrame(context_result_list_bmk)
#df_question_latest = pd.DataFrame(question_result_list_bmk)

In [None]:
def make_inner_product(df_context_result,df_question_result):
  """Build the context-by-question cosine-similarity matrix.

  Despite the name, each entry is normalised: entry [i, j] is the dot product
  of context vector i and question vector j divided by the product of their
  Euclidean norms.

  Args:
    df_context_result: sequence of equal-length 1-D context vectors.
    df_question_result: sequence of equal-length 1-D question vectors.
      The two sequences may have different lengths.

  Returns:
    A float64 array of shape (len(contexts), len(questions)). A zero-norm
    vector produces NaN entries (division by zero is left to numpy).
  """
  contexts = np.asarray(df_context_result, dtype=np.float64)
  questions = np.asarray(df_question_result, dtype=np.float64)
  if len(contexts) == 0 or len(questions) == 0:
    return np.zeros((len(contexts), len(questions)))

  # One matrix product replaces the per-pair Python loop; the outer product of
  # the row norms supplies the matching denominators.
  context_norms = np.linalg.norm(contexts, axis=1, keepdims=True)
  question_norms = np.linalg.norm(questions, axis=1, keepdims=True)
  return (contexts @ questions.T) / (context_norms * question_norms.T)

def make_diagonal_elements(inner_product_matrix):
  """Split a similarity matrix into matched and unmatched scores.

  Args:
    inner_product_matrix: 2-D array indexed as [context_index, question_index].

  Returns:
    (matched, unmatched): `matched` holds the diagonal entries in order.
    `unmatched` walks the matrix column by column and collects every entry
    whose value differs from that column's diagonal entry.

  NOTE(review): entries are excluded by *value* equality with the diagonal,
  not by index, so an off-diagonal entry equal to its column's diagonal value
  is dropped as well. Confirm whether that is intended (e.g. to exclude rows
  sharing the same context) or whether an index comparison was meant.
  """
  size = len(inner_product_matrix)
  matched = [inner_product_matrix[k, k] for k in range(size)]

  unmatched = []
  for column, own_score in enumerate(matched):
    unmatched.extend(
      inner_product_matrix[row, column]
      for row in range(size)
      if inner_product_matrix[row, column] != own_score
    )
  return matched, unmatched

In [None]:
# Context-by-question similarity matrix for the embedded rows.
inner_product_matrix_latest = make_inner_product(context_result_list_bmk, question_result_list_bmk)
# Separate the diagonal (row i's context vs. row i's question) from the rest.
list_same_cq_sim_latest, list_dif_cq_sim_latest = make_diagonal_elements(inner_product_matrix_latest)

In [None]:
# Summary statistics of the diagonal (matched context/question) similarities.
pd.DataFrame(list_same_cq_sim_latest).describe()

In [None]:
# Summary statistics of the remaining (unmatched) similarities, for comparison.
pd.DataFrame(list_dif_cq_sim_latest).describe()

In [None]:
# Show the full similarity matrix (rows: contexts, columns: questions).
pd.DataFrame(inner_product_matrix_latest)