## 0. Import libraries

In [14]:
# May not need all the imports below, but too lazy to clean up ~
import os
import sys
import numpy as np
from langchain_huggingface import HuggingFaceEmbeddings
import pandas as pd
import re
import csv
import json
import requests
import time
from tabulate import tabulate

from openai import OpenAI
from langchain_core.prompts import PromptTemplate

import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from langchain_community.llms.tongyi import Tongyi
from langchain_community.chat_models.tongyi import ChatTongyi

import nest_asyncio
nest_asyncio.apply()

os.environ['USER_AGENT'] = 'my-req-dedup-agent'

## 1. Utility Functions

### Read env variables from ~/.env, because Jupyter does not seem to source bashrc, zshrc, or any other profile

In [5]:
from dotenv import load_dotenv
# Load variables from a .env file into the process environment before any
# os.getenv() lookups are made (e.g. DASHSCOPE_API_KEY further down).
# NOTE(review): load_dotenv() is called without a path, so which .env file is
# picked up depends on python-dotenv's default lookup -- confirm that it
# resolves to ~/.env for this notebook's working directory.
load_dotenv()    # this is needed, because we store in ~/.env

# Print the interpreter version and the module search path of the running kernel.
print(sys.version)
print(sys.path)


3.12.3 (v3.12.3:f6650f9ad7, Apr  9 2024, 08:18:48) [Clang 13.0.0 (clang-1300.0.29.30)]
['/Library/Frameworks/Python.framework/Versions/3.12/lib/python312.zip', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/lib-dynload', '', '/Users/hualei/Library/Python/3.12/lib/python/site-packages', '/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages']


### save an array into a json file

In [6]:
def save_array_to_json(array, file_name):
    """Serialize *array* as JSON into *file_name*, replacing any existing file.

    A confirmation line is printed once the file has been written.
    Returns None.
    """
    with open(file_name, 'w') as out:
        json.dump(array, fp=out)

    print(f"Save to {file_name} successfully!\n")
    return None

### load an array from a json file

In [7]:
def load_array_from_json(file_name):
    """Return the value decoded from the JSON document stored in *file_name*."""
    with open(file_name, 'r') as src:
        return json.load(src)

### save json data line by line into a file

In [8]:
# Write a list of JSON-serializable objects to a file, one object per line.
def save_json_in_lines(json_data_list, file_name):
    """Write every item of *json_data_list* to *file_name* as one JSON text per line.

    The file is overwritten; each item is encoded with json.dumps and
    terminated by a newline.
    """
    with open(file_name, 'w') as out:
        out.writelines(json.dumps(item) + '\n' for item in json_data_list)

### load json data line by line to a json array

In [9]:
# Read a file that holds one JSON object per line into a list.
def load_json_in_lines(file_name):
    """Parse *file_name* line by line and return the decoded objects as a list.

    Blank or whitespace-only lines (for example a trailing empty line added by
    an editor) are skipped instead of being handed to json.loads, which would
    raise json.JSONDecodeError on an empty string.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    json_array = []
    with open(file_name, 'r') as file:
        for line in file:
            stripped = line.strip()
            if not stripped:
                continue
            json_array.append(json.loads(stripped))

    # Status line kept from the original implementation; only the header is
    # printed, the data itself is returned rather than echoed.
    print("从文件读取的JSON数据：")

    return json_array

### save a 2D array into a csv file

In [10]:
def save_2D_array_to_csv(array, file_name):
    """Write each inner sequence of *array* as one CSV record in *file_name*.

    The file is overwritten and opened with newline='' so the csv module
    controls the line endings. Returns None.
    """
    with open(file_name, 'w', newline='') as out:
        csv_out = csv.writer(out)
        for record in array:
            csv_out.writerow(record)

### load a 2D float array from a csv file

In [11]:
def load_2D_float_array_from_csv(file_name):
    """Read *file_name* as CSV and return its cells as a list of float rows.

    Every cell is converted with float(); a record with no cells (an empty
    line) yields an empty row.

    Raises:
        ValueError: if a cell cannot be converted to float.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(file_name, mode='r', newline='') as file:
        return [[float(cell) for cell in row] for row in csv.reader(file)]

## 2. Calculate the Cosine Similarity, based on embedding of each requirement

### Prepare the embedding model
@todo: We may try different models later

In [13]:
def prepare_embedding_model():
    """Create the HuggingFaceEmbeddings instance used to embed requirements.

    The model is "sentence-transformers/all-mpnet-base-v2", configured to run
    on the CPU with normalize_embeddings switched off.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2",
        model_kwargs={'device': 'cpu'},
        encode_kwargs={'normalize_embeddings': False},
    )

### Calculate embedding for each row of a csv file, based on "subject" and "description"

In [36]:
def calc_embedding_from_csv(csv_file_name, embedding_model):
    """Embed every row of a requirements CSV and return the embeddings in row order.

    Args:
        csv_file_name: path of a CSV file with at least the columns
            'subject' and 'description'.
        embedding_model: object exposing ``embed_query(text)``; its return
            value is stored unchanged for each row.

    Returns:
        A list with one embedding per CSV row (empty when the CSV has no rows).
    """
    df = pd.read_csv(csv_file_name)

    req_embeddings = []

    # Count rows by position rather than by DataFrame index label, so the
    # progress message and the final total do not depend on the index values.
    for position, (_, row) in enumerate(df.iterrows()):
        if (position > 0) and (position % 100 == 0):
            print(f"\nFinshed calculating embedding for {position} Requirements: \n")

        req = row.to_dict()
        # Subject and description are embedded together as one text.
        sub_and_desc = '--Subject--\n' + str(req['subject']) + '\n\n--Description--\n' + str(req['description'])
        req_embeddings.append(embedding_model.embed_query(sub_and_desc))

    # Derive the total from the collected embeddings; the loop variable is
    # unbound when the CSV contains no data rows.
    NUM_OF_REQ = len(req_embeddings)
    print(f'======= NUM_OF_REQ = {NUM_OF_REQ} =======\n')

    return req_embeddings

### Calculate the correlation coefficient (dot product) between every pair of records' embeddings

In [37]:
def calc_corr(embeddings_array):
    """Return the symmetric matrix of pairwise dot products of the embeddings.

    Args:
        embeddings_array: sequence of equally sized numeric vectors.

    Returns:
        A list of lists ``m`` with ``m[i][j] == m[j][i] == np.dot(e[i], e[j])``
        for ``i != j``. The diagonal is set to the constant 1 and is not
        computed from the vectors. An empty input gives an empty list.
    """
    NUM_OF_REQ = len(embeddings_array)

    # Start from an all-zero square matrix.
    req_cor = [[0 for _ in range(NUM_OF_REQ)] for _ in range(NUM_OF_REQ)]

    for i in range(NUM_OF_REQ):
        if (i > 0) and ((i+1) % 100 == 0):
            print(f"\nCalculating correlation coefficient for Requirement NO.{i + 1}: \n")
        req_cor[i][i] = 1
        # Only the lower triangle is computed; each value is mirrored.
        # The vectors are taken from the argument, not from a notebook global.
        for j in range(i):
            req_cor[i][j] = req_cor[j][i] = np.dot(embeddings_array[i], embeddings_array[j])

    return req_cor

### Calculate L2 norm for all the embeddings

In [38]:
def calc_l2_norm(embeddings_array):
    """Return a list holding np.linalg.norm of each vector in *embeddings_array*.

    The result has one entry per input vector, in input order.
    """
    return [np.linalg.norm(vector) for vector in embeddings_array]

### Calculate Cosine Similarity

In [39]:
def calc_consine_similarity(req_cor, l2_norm):
    """Divide each pairwise dot product by the product of the two vector norms.

    Args:
        req_cor: square matrix (list of lists) of pairwise values.
        l2_norm: list of norms, one per row of *req_cor*.

    Returns:
        A new square list-of-lists matrix of the same size.
    """
    size = len(l2_norm)
    assert size == len(req_cor)

    similarity = [[0] * size for _ in range(size)]
    for row in range(size):
        for col in range(size):
            value = req_cor[row][col] / (l2_norm[row] * l2_norm[col])
            # Each visit writes the cell and its mirror, so for an
            # off-diagonal pair the value written last (the one taken from
            # the lower triangle of req_cor) is the one that remains.
            similarity[row][col] = value
            similarity[col][row] = value

    return similarity

## 3. Use batch API of TongYi, to find whether two requirements are duplicated

### Form the prompt string with the two requirements in json

In [40]:
def form_query(req1, req2):
    """Build the LLM prompt that asks whether two requirements are duplicates.

    Args:
        req1, req2: mappings with the keys 'subject' and 'description';
            both values are converted with str().

    Returns:
        The instruction text followed by the two requirements, labelled
        "Ticket1" and "Ticket2".
    """
    def _ticket_text(req):
        # Subject first, then description, separated by a blank line.
        return f"--Subject--\n{req['subject']!s}\n\n--Description--\n{req['description']!s}"

    instructions = '''
    Please compare the following two requirements, with subject and description, and tell me whether they are very similar and should be duplicated.
    Please reply with the following format:
    * Probability: a number between 0% to 100%, showing how much you recommend to set the two tickets to duplicated
    * Analysis: Provide your detailed recommendation
    * New Requirement: If the probability is > 70%, draft a new requirement to combine the old two requirements, with
    ** Subject: <the new subject>
    ** Description: <the new description>
    '''

    return f"{instructions}\n\n-- Ticket1\n{_ticket_text(req1)}\n\n-- Ticket2\n{_ticket_text(req2)}"

### Parse Response from TongYi, to get 'Probability', 'Analysis', 'New Requirement'

In [41]:
def parse_rsp(text):
    """Extract the three answer sections from an LLM reply.

    Returns:
        A dict with the keys 'Probability', 'Analysis' and 'New Requirement'.
        The values are the stripped section texts, or None for all three when
        the reply matches neither expected layout.
    """
    field_names = ('Probability', 'Analysis', 'New Requirement')

    # Two reply layouts are recognised: every section prefixed with '* ',
    # and the same sections without the prefix. The bulleted one is tried
    # first. Sections may span several lines.
    layouts = (
        r'\* Probability:\s*(.*?)\s*\n\* Analysis:\s*([\s\S]*?)\s*\n\* New Requirement:\s*([\s\S]*)',
        r'Probability:\s*(.*?)\s*\nAnalysis:\s*([\s\S]*?)\s*\nNew Requirement:\s*([\s\S]*)',
    )

    for layout in layouts:
        found = re.search(layout, text, re.MULTILINE)
        if found:
            return {name: part.strip() for name, part in zip(field_names, found.groups())}

    return dict.fromkeys(field_names)

### Call TongYi to judge the dup - single query

In [42]:
from http import HTTPStatus
import dashscope
import os

def call_tongyi(query_content, stream_mode = False):
    """Send one prompt to the "qwen-max" model and return the parsed reply.

    Args:
        query_content: prompt text, sent as a single user message.
        stream_mode: when true, request incremental streaming output and
            concatenate the received pieces; otherwise make one plain call.

    Returns:
        The dict produced by parse_rsp() for the reply text. A failed request
        is reported with print() and contributes no text.
    """
    def _report_failure(resp):
        print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
            resp.request_id, resp.status_code,
            resp.code, resp.message
        ))

    dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")
    messages = [{'role': 'user', 'content': query_content}]

    rsp_str = ''
    if not stream_mode:
        # Single blocking request/response round trip.
        reply = dashscope.Generation.call(
            "qwen-max",
            messages=messages,
            result_format='message',
        )
        if reply.status_code == HTTPStatus.OK:
            rsp_str = reply.output.choices[0]['message']['content']
        else:
            _report_failure(reply)
    else:
        # Streaming: each chunk carries only the newly generated text
        # (incremental_output=True), so the chunks are appended in order.
        chunks = dashscope.Generation.call(
            "qwen-max",
            messages=messages,
            result_format='message',
            stream=True,
            incremental_output=True,
        )
        for chunk in chunks:
            if chunk.status_code == HTTPStatus.OK:
                rsp_str += chunk.output.choices[0]['message']['content']
            else:
                _report_failure(chunk)

    return parse_rsp(rsp_str)

In [43]:
### Smoke test: form_query() + call_tongyi() on three hand-picked requirement pairs
# Each case picks two rows of the CSV by position, builds the prompt and
# prints the parsed (non-streaming) reply.

# The source requirements file
df = pd.read_csv('all_issues_for_test.csv')

# Test 1
req1 = df.iloc[435].to_dict()
req2 = df.iloc[170].to_dict()
query_content = form_query(req1, req2)
rsp_json_data = call_tongyi(query_content, stream_mode = False)
print(f'Test 1: rsp_json_data = {rsp_json_data}')

# Test 2
req1 = df.iloc[1].to_dict()
req2 = df.iloc[10].to_dict()
query_content = form_query(req1, req2)
rsp_json_data = call_tongyi(query_content, stream_mode = False)
print(f'Test 2: rsp_json_data = {rsp_json_data}')

# Test 3
req1 = df.iloc[0].to_dict()
req2 = df.iloc[66].to_dict()
query_content = form_query(req1, req2)
rsp_json_data = call_tongyi(query_content, stream_mode = False)
print(f'Test 3: rsp_json_data = {rsp_json_data}')


Test 1: rsp_json_data = {'Probability': '95%', 'Analysis': 'Both tickets essentially request the same core functionality, which is the ability to import issues from a CSV file into multiple projects simultaneously, with the project identifier being specified in a dedicated column within the CSV. The descriptions vary slightly but convey the same user need for improved efficiency when importing issues across different projects.', 'New Requirement': "* Subject: Enhance CSV Import to Support Multiple Projects\n    * Description: Enhance the existing issue import feature to facilitate the import of issues from a single CSV file into multiple projects. This enhancement will include the option to map a designated column in the CSV to the 'Project' field, enabling users to specify the target project for each issue directly within the import file. This improvement aims to streamline the import process and allow users to efficiently manage issue creation across different projects in a single ac

### Generate batch request files

In [44]:
def _write_batch_file(rows, input_file_prefix, file_index):
    """Save one batch of request rows to '<prefix><file_index>.json', one JSON object per line."""
    file_name = input_file_prefix + str(file_index) + '.json'
    save_json_in_lines(rows, file_name)
    print(f'Finished writing to file: {file_name}')


def generate_batch_request_files(src_req_csv_file_name, cor_json_file_name,
                                 batch_size=500,
                                 input_file_prefix='test/input_files_1/req_batch_input',
                                 threshold=0.5):
    """Write batch request files for every requirement pair scoring above *threshold*.

    For each pair (i, j) with j < i whose entry in the correlation matrix is
    greater than *threshold*, one chat-completion request for the "qwen-max"
    model is built from form_query(). Requests are written in files of at
    most *batch_size* lines, named '<input_file_prefix><n>.json' with n
    starting at 1.

    Args:
        src_req_csv_file_name: CSV file holding the requirements; rows are
            addressed by position and must line up with the matrix indices.
        cor_json_file_name: JSON file holding the square correlation matrix.
        batch_size: maximum number of requests per output file.
        input_file_prefix: path prefix of the output files.
        threshold: a pair is included only when its matrix entry is strictly
            greater than this value.

    Raises:
        ValueError: if *batch_size* is smaller than 1.

    Note:
        'custom_id' is the 1-based position of the request inside its own
        file, so ids repeat from one file to the next.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    # The source requirements file
    df = pd.read_csv(src_req_csv_file_name)

    # The source corr file
    cor_data = load_array_from_json(cor_json_file_name)

    # Create the output folder up front so the first write does not fail
    # when the folder has not been created yet.
    output_dir = os.path.dirname(input_file_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    file_index = 0
    rows = []

    for i in range(len(cor_data)):
        for j in range(i):
            if not cor_data[i][j] > threshold:
                continue

            req1 = df.iloc[i].to_dict()
            req2 = df.iloc[j].to_dict()
            query_content = form_query(req1, req2)
            rows.append({
                "custom_id": str(len(rows) + 1),
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "qwen-max",
                    "messages": [{"role": "user", "content": query_content}],
                },
            })

            if len(rows) == batch_size:
                file_index += 1
                _write_batch_file(rows, input_file_prefix, file_index)
                rows = []

    # Whatever is left after the last full batch goes into a final, shorter file.
    if rows:
        file_index += 1
        _write_batch_file(rows, input_file_prefix, file_index)
                


In [45]:
# test
# Generate the batch request files from the test requirements CSV and the
# correlation matrix stored in 'req_cor.json'.
src_req_csv_file_name = 'all_issues_for_test.csv'
cor_json_file_name = 'req_cor.json'

generate_batch_request_files(src_req_csv_file_name, cor_json_file_name)

Finished writing to file: test/input_files_1/req_batch_input1.json
Finished writing to file: test/input_files_1/req_batch_input2.json
Finished writing to file: test/input_files_1/req_batch_input3.json
Finished writing to file: test/input_files_1/req_batch_input4.json
Finished writing to file: test/input_files_1/req_batch_input5.json
Finished writing to file: test/input_files_1/req_batch_input6.json
Finished writing to file: test/input_files_1/req_batch_input7.json
Finished writing to file: test/input_files_1/req_batch_input8.json
Finished writing to file: test/input_files_1/req_batch_input9.json
Finished writing to file: test/input_files_1/req_batch_input10.json
Finished writing to file: test/input_files_1/req_batch_input11.json
Finished writing to file: test/input_files_1/req_batch_input12.json
Finished writing to file: test/input_files_1/req_batch_input13.json
Finished writing to file: test/input_files_1/req_batch_input14.json
Finished writing to file: test/input_files_1/req_batch_in

### async mode: can send multiple prompts and wait for response in async mode
But in our tests, this did not help to reduce the response time when sending prompts to TongYi

In [46]:
import asyncio
import platform
from dashscope.aigc.generation import AioGeneration

import dashscope
dashscope.api_key = os.getenv("DASHSCOPE_API_KEY")

import nest_asyncio
nest_asyncio.apply()

# for debugging usage
async def task(question):
    """Send one user prompt to the "qwen-max" model and return the raw response object.

    Args:
        question: prompt text, sent as a single user message.

    Returns:
        Whatever dashscope.Generation.call returns for the request.
    """
    messages = [
        {'role': 'user', 'content': question}]

    # dashscope.Generation.call is a synchronous call. Invoking it directly
    # inside the coroutine would block the event loop until the reply arrives,
    # so coroutines gathered together would run one after another. Running it
    # in a worker thread lets several requests be in flight at once.
    # NOTE(review): assumes concurrent Generation.call invocations from
    # several threads are safe -- confirm against the dashscope SDK.
    response = await asyncio.to_thread(
        dashscope.Generation.call,
        "qwen-max",
        messages=messages,
        result_format='message',
    )

    return response

# Main async entry point
async def main(questions):
    """Create one task() coroutine per question, await them together, and return the results in question order."""
    pending = (task(question) for question in questions)
    return await asyncio.gather(*pending)



### Query and Save -- async

In [47]:
# Debug limit: query_and_save() returns as soon as this many records have been processed.
DEBUG_STOP_AT = 10


def _send_batch_and_write(questions, infos, writer):
    """Query the LLM for one batch of prompts and write the successful answers as CSV rows."""
    if __name__ != '__main__':
        print(f'not called in __main__, do nothing')
        return

    results = asyncio.get_event_loop().run_until_complete(main(questions))

    # results come back in the order of `questions`, which matches `infos`.
    for info, response in zip(infos, results):
        if response.status_code == HTTPStatus.OK:
            rsp_str = response.output.choices[0]['message']['content']
            rsp_json_data = parse_rsp(rsp_str)
            writer.writerow([
                info['Index1'], info['Issue1 ID'], info['Issue1 Subject'], info['Issue1 Description'],
                info['Index2'], info['Issue2 ID'], info['Issue2 Subject'], info['Issue2 Description'],
                info['Embedding Corr'],
                rsp_json_data['Probability'], rsp_json_data['Analysis'], rsp_json_data['New Requirement'],
            ])
        else:
            print('Request id: %s, Status code: %s, error code: %s, error message: %s' % (
                response.request_id, response.status_code,
                response.code, response.message
            ))


def query_and_save(src_req_csv_file_name, cor_json_file_name, result_csv_file_name):
    """Ask the LLM about every strongly correlated requirement pair and save the answers to CSV.

    Every pair (i, j) with j < i whose correlation entry is greater than 0.5
    is turned into a prompt with form_query(). Prompts are sent in batches of
    BATCH_SIZE through main(), and one CSV row is written per successful
    reply. A final batch smaller than BATCH_SIZE is sent as well once all
    pairs have been visited.

    Args:
        src_req_csv_file_name: CSV with the columns 'id', 'subject' and
            'description'; rows are addressed by position.
        cor_json_file_name: JSON file holding the square correlation matrix.
        result_csv_file_name: output CSV, overwritten on every call.

    Note:
        record_count counts the pairs that have been sent, including ones
        whose request failed. The function returns early, without the final
        summary line, once record_count reaches DEBUG_STOP_AT.
    """
    # How many requests in a batch
    BATCH_SIZE = 5

    start_time = time.time()

    # The source requirements file
    df = pd.read_csv(src_req_csv_file_name)

    # The correlation matrix
    cor_data = load_array_from_json(cor_json_file_name)
    NUM_OF_REQ = len(cor_data)
    print(f'-------- array size: {NUM_OF_REQ} x {NUM_OF_REQ} --------\n')

    header = ['Index1', 'Issue1 ID', 'Issue1 Subject', 'Issue1 Description',
            'Index2', 'Issue2 ID', 'Issue2 Subject', 'Issue2 Description',
            'Embedding Corr', 'LLM Probability', 'Analysis', 'New Requirement']

    questions = []
    infos = []
    check_count = 0
    record_count = 0

    with open(result_csv_file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        print(f'\nHeader is saved:\n{header}')

        for i in range(NUM_OF_REQ):
            for j in range(i):
                check_count += 1
                if (check_count % 500 == 0):
                    elapsed_time = time.time() - start_time
                    print(f'Now we have checked {check_count} of pairs, in {elapsed_time:.4f} seconds\n')

                if not cor_data[i][j] > 0.5:
                    continue

                print(f'handling cor_data[{i}][{j}] = {cor_data[i][j]}\n')
                req1 = df.iloc[i].to_dict()
                req2 = df.iloc[j].to_dict()
                questions.append(form_query(req1, req2))
                infos.append({
                    'Index1':i, 'Issue1 ID':req1['id'],
                    'Issue1 Subject':req1['subject'], 'Issue1 Description':req1['description'],
                    'Index2':j, 'Issue2 ID':req2['id'],
                    'Issue2 Subject':req2['subject'], 'Issue2 Description':req2['description'],
                    'Embedding Corr':cor_data[i][j]
                })

                # Keep collecting until a full batch is available.
                if len(questions) < BATCH_SIZE:
                    continue

                _send_batch_and_write(questions, infos, writer)
                record_count += len(questions)
                questions = []
                infos = []

                if (record_count % 10 == 0):
                    elapsed_time = time.time() - start_time
                    print(f'Now we have saved {record_count} results, in {elapsed_time:.4f} seconds\n')

                if record_count >= DEBUG_STOP_AT:
                    return

        # Pairs left over after the last full batch would otherwise never be
        # queried or written.
        if questions:
            _send_batch_and_write(questions, infos, writer)
            record_count += len(questions)

    elapsed_time = time.time() - start_time

    print(f'Totally {record_count} results are saved to {result_csv_file_name}, in {elapsed_time:.4f} seconds')
    return
     




In [48]:
# test async mode, and output to file
# Results are written to 'temp_result.csv'; query_and_save() returns early
# once DEBUG_STOP_AT records have been processed.
query_and_save('all_issues_for_test.csv', 'req_cor.json', 'temp_result.csv')

-------- array size: 743 x 743 --------


Header is saved:
['Index1', 'Issue1 ID', 'Issue1 Subject', 'Issue1 Description', 'Index2', 'Issue2 ID', 'Issue2 Subject', 'Issue2 Description', 'Embedding Corr', 'LLM Probability', 'Analysis', 'New Requirement']
handling cor_data[9][3] = 0.5248227168451436

handling cor_data[14][4] = 0.5009113902169107

handling cor_data[14][9] = 0.5587199102244673

handling cor_data[18][14] = 0.5172098519420762

handling cor_data[19][5] = 0.5160936046872969

handling cor_data[23][10] = 0.5368007963526455

handling cor_data[23][12] = 0.7090704143593299

handling cor_data[24][9] = 0.5124424719444702

handling cor_data[25][6] = 0.6336000244467357

handling cor_data[26][18] = 0.541367402405958

Now we have saved 10 results, in 114.6025 seconds



### Main Program

In [52]:
# Files for Query TongYi
src_req_csv_file_name = 'all_issues_for_test.csv'
embedding_json_file_name = 'req_embeddings.json'
cor_json_file_name = 'req_cor.json'
l2_norm_json_file_name = 'req_l2_norm.json'
cosine_similarity_json_file_name = 'req_cosine_similarity.json'
result_csv_file_name = 'result.csv'

# macro to control whether need recalculate the embeddings and the correlation coefficient
NeedRecalc = False
VERBASE_MODE = False


def _count_pairs_above(matrix, size, threshold=0.5, verbose=False):
    """Count entries below the diagonal of *matrix* that exceed *threshold*; print each one when *verbose*."""
    total = 0
    for i in range(size):
        row_header_printed = False
        for j in range(i):
            if matrix[i][j] > threshold:
                total += 1
                if verbose:
                    if not row_header_printed:
                        row_header_printed = True
                        print(f'\nRow: NO.{i}:')
                    # Print the value of the matrix being scanned.
                    print(f'[{i}][{j}] = {matrix[i][j]}')
    return total


# if embedding or cor file doesn't exist, force recalc
if not os.path.exists(embedding_json_file_name):
    NeedRecalc = True
if not os.path.exists(cor_json_file_name):
    NeedRecalc = True

print(f'\n\n=== NeedRecalc: {NeedRecalc} ===\n\n')

if NeedRecalc:
    # input csv file which contains all requirements
    csv_file_name = src_req_csv_file_name

    # Prepare the embedding
    embedding_model = prepare_embedding_model()

    # Calc the embeddings
    req_embeddings = calc_embedding_from_csv(csv_file_name, embedding_model)

    # Calc the correlation coefficient
    req_cor = calc_corr(req_embeddings)

    # save embedding and cor
    save_array_to_json(req_embeddings, embedding_json_file_name)
    save_array_to_json(req_cor, cor_json_file_name)
else:
    req_embeddings = load_array_from_json(embedding_json_file_name)
    req_cor = load_array_from_json(cor_json_file_name)
    print(f'type(req_embeddings) = {type(req_embeddings)}')
    print(f'type(req_cor) = {type(req_cor)}')

# The norms and the cosine similarity are derived from the embeddings and the
# correlation matrix. They are rebuilt after a recalculation and also when
# either of their cache files is missing, so a partly populated cache does
# not end in FileNotFoundError and does not force a new embedding run.
derived_files_exist = (os.path.exists(l2_norm_json_file_name)
                       and os.path.exists(cosine_similarity_json_file_name))
if NeedRecalc or not derived_files_exist:
    # calc l2_norm and consine_similarity
    l2_norm = calc_l2_norm(req_embeddings)
    consine_similarity = calc_consine_similarity(req_cor, l2_norm)
    # save l2_norm and consine_similarity
    save_array_to_json(l2_norm, l2_norm_json_file_name)
    save_array_to_json(consine_similarity, cosine_similarity_json_file_name)
else:
    l2_norm = load_array_from_json(l2_norm_json_file_name)
    consine_similarity = load_array_from_json(cosine_similarity_json_file_name)
    print(f'type(l2_norm) = {type(l2_norm)}')
    print(f'type(consine_similarity) = {type(consine_similarity)}')

NUM_OF_REQ = len(req_embeddings)

# Report the pairs whose correlation coefficient exceeds 0.5
total_cor_num = _count_pairs_above(req_cor, NUM_OF_REQ, verbose=VERBASE_MODE)

print(f"\n\n ~~~~~ Totally {total_cor_num} for correlation coefficient > 0.5 ~~~~~ \n\n")


# Report the pairs whose cosine similarity exceeds 0.5
total_consine_similarity_num = _count_pairs_above(consine_similarity, NUM_OF_REQ, verbose=VERBASE_MODE)

print(f"\n\n ~~~~~ Totally {total_consine_similarity_num} for consine similarity > 0.5 ~~~~~ \n\n")






=== NeedRecalc: False ===


type(req_embeddings) = <class 'list'>
type(req_cor) = <class 'list'>
type(l2_norm) = <class 'list'>
type(consine_similarity) = <class 'list'>


 ~~~~~ Totally 7154 for correlation coefficient > 0.5 ~~~~~ 




 ~~~~~ Totally 7154 for consine similarity > 0.5 ~~~~~ 


