### Import libs

In [1]:
import inspect
import numpy as np
import pandas as pd

## 1. Utilities

### 1.1 functions for loading files

In [2]:
import json
# Open the file and read one JSON object per line
def load_json_in_lines(file_name):
    """Load a JSON Lines file (one JSON document per line) into a list.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_array = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's default encoding.
    with open(file_name, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank lines (e.g. an extra newline at the end of the file) carry no record.
                continue
            json_array.append(json.loads(line))

    return json_array

In [3]:
import json
def load_array_from_json(file_name):
    """Parse the whole file as a single JSON document and return the result.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content (the notebook
        uses it for arrays stored as JSON lists).
    """
    with open(file_name, mode='r') as json_file:
        return json.load(json_file)

### 1.2 function to get duplicated IDs from the string in relations

In [4]:
# test regexp
import re

# Use a regular expression to pick up the numbers that follow specific relation labels
def get_dup_ids(text, verbose = False):
    """Return the issue numbers referenced by duplicate-style relations in ``text``.

    Only the labels "Is duplicate of #", "Copied from #" and "Has duplicate #"
    are recognised; other relations (e.g. "Related to #") are ignored.

    Args:
        text: The relations string. Non-string input (the original author
            notes it may be a float NaN) yields an empty result.
        verbose: When True, print the input and the matches.

    Returns:
        A list of the matched issue numbers as strings, in order of appearance.
    """
    if isinstance(text, str):
        matches = re.findall(r'(?:Is duplicate of #|Copied from #|Has duplicate #)(\d+)', text)
    else:
        # Not a string: nothing to search.
        matches = []

    if verbose:
        func_name = inspect.currentframe().f_code.co_name
        print(f'In {func_name}: text = {text}')
        print(f'matches = {matches}')

    return matches
    

In [5]:
# Exercise get_dup_ids() on representative relation strings:
# several duplicate-style relations, a mix with ignored relations, and single relations.
texts = [
    'Is duplicate of #12002, Is duplicate of #35509',
    'Is duplicate of #10230, Related to #18857, Copied from #33325',
    'Blocked by #38273, Has duplicate #15148, Follows #37987, Is duplicate of #10091',
    'Related to #18857',
    'Has duplicate #15148',
    'Copied from #33325',
    'Blocked by #38273',
    'Follows #37987',
    'Is duplicate of #10091',
]

# verbose=True makes the function print each input together with its matches.
for text in texts:
    get_dup_ids(text, verbose=True)

In get_dup_ids: text = Is duplicate of #12002, Is duplicate of #35509
matches = ['12002', '35509']
In get_dup_ids: text = Is duplicate of #10230, Related to #18857, Copied from #33325
matches = ['10230', '33325']
In get_dup_ids: text = Blocked by #38273, Has duplicate #15148, Follows #37987, Is duplicate of #10091
matches = ['15148', '10091']
In get_dup_ids: text = Related to #18857
matches = []
In get_dup_ids: text = Has duplicate #15148
matches = ['15148']
In get_dup_ids: text = Copied from #33325
matches = ['33325']
In get_dup_ids: text = Blocked by #38273
matches = []
In get_dup_ids: text = Follows #37987
matches = []
In get_dup_ids: text = Is duplicate of #10091
matches = ['10091']


## 2. Retrieve base and predict

### 2.1 utility: given the issue id, try to find index in the input file

In [6]:
# find the index of the record, by issue_id
def find_index_by_issue_id(src_req_csv_file_name, issue_id, verbose = False):
    """Return the row position of the first record whose ``id`` equals ``issue_id``.

    Args:
        src_req_csv_file_name: Path of the requirements CSV; it must have an
            ``id`` column.
        issue_id: The id to look for; compared after conversion with ``int()``,
            so both ``205`` and ``'205'`` are accepted.
        verbose: When True, print the number of records and the id searched.

    Returns:
        The 0-based row position of the first match, or -1 when no row matches.

    Note:
        The CSV file is read on every call, so avoid calling this in a loop
        over a large file.
    """
    # The source requirements file
    df = pd.read_csv(src_req_csv_file_name)
    NUM_OF_REQ = df.shape[0]

    if verbose:
        print(f'NUM_OF_REQ = {NUM_OF_REQ}')
        print(f'issue_id: type = {type(issue_id)}, value = {issue_id}')

    # Convert the target once and scan only the id column; building a dict of
    # the whole row for every comparison is unnecessary work.
    target_id = int(issue_id)
    for index, row_id in enumerate(df['id']):
        if int(row_id) == target_id:
            return index

    return -1

### 2.2 retrieve the base array, which stores the known duplications in the example data
* input: src_req_csv_file_name is a CSV file in which each row contains one requirement
* output: base[i][j] = 1 means req[i] and req[j] are duplicates of each other

In [7]:

def retrieve_base(src_req_csv_file_name, verbose = False):
    """Build the ground-truth duplicate matrix from the requirements CSV.

    For every row, the ``related issues`` column is scanned with
    ``get_dup_ids`` and each referenced issue that is also present in the file
    is marked as a duplicate of the row, symmetrically.

    Args:
        src_req_csv_file_name: Path of the requirements CSV; it must have the
            columns ``id`` and ``related issues``.
        verbose: When True, print per-row details.

    Returns:
        A square numpy array ``base`` with one row/column per CSV row, where
        ``base[i][j] == base[j][i] == 1`` marks a known duplicate pair and every
        other entry (including the diagonal) is 0.
    """
    # The source requirements file
    df = pd.read_csv(src_req_csv_file_name)
    NUM_OF_REQ = df.shape[0]
    print(f'=== NUM_OF_REQ = {NUM_OF_REQ} ===\n')

    # Map each issue id to the position of its FIRST occurrence. Built once, so
    # the CSV is not re-read and re-scanned for every row and every match.
    index_by_id = {}
    for position, raw_id in enumerate(df['id']):
        index_by_id.setdefault(int(raw_id), position)

    base = np.zeros((NUM_OF_REQ, NUM_OF_REQ))
    for issue_id, related_issues in zip(df['id'], df['related issues']):
        if verbose:
            print(f'issue_id = {issue_id}')
            print(f'related_issues = {related_issues}')

        # Always found: the id comes from the same file the map was built from.
        issue_index = index_by_id[int(issue_id)]

        # related_issues may be a non-string (empty cell); get_dup_ids handles that.
        matches = get_dup_ids(related_issues, verbose)    # list of id strings
        for match in matches:
            match_index = index_by_id.get(int(match), -1)
            if issue_index == match_index:
                print(f'Abnormal: find duplicated info with identical issue id: {issue_id}')
                continue

            if match_index == -1:
                # normal case, the duplicated ticket is not in the test data set
                continue

            if verbose:
                print(f'Find duplicated: issue_index = {issue_index}, match_index = {match_index}')
            base[issue_index][match_index] = 1
            base[match_index][issue_index] = 1

    return base


In [8]:
# Build the ground-truth matrix from the requirements file
src_req_csv_file_path = 'all_issues_for_test.csv'
base = retrieve_base(src_req_csv_file_path, verbose=False)

NUM_OF_REQ = len(base)

# Visit only the strict lower triangle (j < i) so that each pair is listed once.
dup_pairs = [(i, j) for i in range(NUM_OF_REQ) for j in range(i) if base[i][j] == 1]
for i, j in dup_pairs:
    print(f'base[{i}][{j}] = 1\n')

base_dup_count = len(dup_pairs)
print(f'base_dup_count = {base_dup_count}\n')

=== NUM_OF_REQ = 743 ===

Abnormal: find duplicated info with identical issue id: 408
Abnormal: find duplicated info with identical issue id: 684
Abnormal: find duplicated info with identical issue id: 5005
Abnormal: find duplicated info with identical issue id: 5325
Abnormal: find duplicated info with identical issue id: 13296
Abnormal: find duplicated info with identical issue id: 14341
base[66][0] = 1

base[102][35] = 1

base[105][91] = 1

base[106][25] = 1

base[108][25] = 1

base[109][61] = 1

base[110][104] = 1

base[111][87] = 1

base[112][57] = 1

base[113][18] = 1

base[115][11] = 1

base[116][30] = 1

base[117][77] = 1

base[119][34] = 1

base[121][17] = 1

base[122][51] = 1

base[124][46] = 1

base[125][107] = 1

base[128][97] = 1

base[129][66] = 1

base[130][88] = 1

base[131][26] = 1

base[132][32] = 1

base[133][83] = 1

base[134][36] = 1

base[137][53] = 1

base[138][85] = 1

base[139][57] = 1

base[140][99] = 1

base[141][72] = 1

base[142][64] = 1

base[143][76] = 1



### 2.3 retrieve the predict array, which stores the duplications predicted for the example data
* predict[i][j] = 1 means: a prompt was sent, and the LLM says req[i] and req[j] are duplicates of each other
* predict[i][j] = 0 means: a prompt was sent, and the LLM says req[i] and req[j] are NOT duplicates of each other
* predict[i][j] = -1 means: no prompt was sent for req[i] and req[j]

In [438]:

def retrieve_predict(cor_json_file_name, output_json_file_path, verbose = False,
                     cor_threshold = 0.5, dup_threshold = 0.7):
    """Build the prediction matrix from the LLM output file.

    Pairs ``(i, j)`` with ``j < i`` are walked in row order. Each pair whose
    similarity ``cor_data[i][j]`` exceeds ``cor_threshold`` consumes the next
    line of the output file; the percentage found after ``* Probability:`` in
    that answer decides the prediction.

    Args:
        cor_json_file_name: JSON file holding the square similarity matrix.
        output_json_file_path: JSON Lines file with one LLM answer per queried
            pair. Each record is read at
            ``['response']['body']['choices'][0]['message']['content']``.
        verbose: When True, print every probability and prediction.
        cor_threshold: A pair counts as queried when its similarity is strictly
            greater than this value. NOTE(review): must equal the threshold
            used when the prompts were generated, otherwise lines and pairs get
            out of step. Please confirm.
        dup_threshold: A pair is predicted duplicate when the parsed
            probability is strictly greater than this value.

    Returns:
        ``(predict, line_num)`` where ``predict[i][j]`` is 1 (predicted
        duplicate), 0 (predicted not duplicate) or -1 (not queried), and
        ``line_num`` is the number of output lines consumed.

    Raises:
        ValueError: If the output file has fewer lines than queried pairs.
    """
    output_data = load_json_in_lines(output_json_file_path)
    OUTPUT_LINES = len(output_data)
    cor_data = load_array_from_json(cor_json_file_name)
    NUM_OF_REQ = len(cor_data)

    print(f'OUTPUT_LINES = {OUTPUT_LINES}')
    print(f'NUM_OF_REQ = {NUM_OF_REQ}')

    # Accept integer as well as decimal percentages, e.g. "15%" or "12.5%".
    probability_re = re.compile(r'\* Probability:\s*(\d+(?:\.\d+)?)%')

    predict = [[-1 for _ in range(NUM_OF_REQ)] for _ in range(NUM_OF_REQ)]
    line_num = 0
    dup_num = 0
    for i in range(NUM_OF_REQ):
        for j in range(i):
            if not (cor_data[i][j] > cor_threshold):
                # Not queried: keep -1.
                continue

            if line_num >= OUTPUT_LINES:
                # Fail loudly: returning None here would only surface later as
                # an unrelated unpacking error in the caller.
                raise ValueError(
                    f'Abnormal case: pair ({i}, {j}) needs output line {line_num}, '
                    f'but the output file has only {OUTPUT_LINES} lines')

            data = output_data[line_num]
            content = data['response']['body']['choices'][0]['message']['content']
            match = probability_re.search(content)
            if match:
                probability = float(match.group(1)) / 100.
            else:
                # Never reuse the value parsed for a previous pair.
                probability = None
                print(f'Abnormal: no probability found in output line {line_num} '
                      f'for pair ({i}, {j}); treated as not duplicated')

            if verbose:
                print(f"Probability: {probability}\n")

            if probability is not None and probability > dup_threshold:
                predict[i][j] = predict[j][i] = 1
                dup_num += 1
            else:
                predict[i][j] = predict[j][i] = 0

            if verbose:
                print(f'predict[{i}][{j}] = {predict[i][j]}\n')
            line_num += 1

    print(f'=== line_num = {line_num}\n')
    print(f'=== dup_num = {dup_num}\n')

    return predict, line_num

In [440]:
# Run retrieve_predict on the similarity matrix and the collected LLM answers
cor_json_file_name = 'req_cor.json'
output_json_file_path = 'test/output_files_1/req_batch_output_all.json'

# num_of_examples receives the number of output lines that were consumed.
predict, num_of_examples = retrieve_predict(
    cor_json_file_name,
    output_json_file_path,
    verbose=True,
)



OUTPUT_LINES = 7154
NUM_OF_REQ = 743
Probability: 0.1

predict[9][3] = 0

Probability: 0.05

predict[14][4] = 0

Probability: 0.1

predict[14][9] = 0

Probability: 0.0

predict[18][14] = 0

Probability: 0.05

predict[19][5] = 0

Probability: 0.2

predict[23][10] = 0

Probability: 0.1

predict[23][12] = 0

Probability: 0.1

predict[24][9] = 0

Probability: 0.2

predict[25][6] = 0

Probability: 0.1

predict[26][18] = 0

Probability: 0.05

predict[26][24] = 0

Probability: 0.0

predict[27][23] = 0

Probability: 0.0

predict[28][4] = 0

Probability: 0.1

predict[28][6] = 0

Probability: 0.0

predict[30][12] = 0

Probability: 0.1

predict[31][15] = 0

Probability: 0.1

predict[32][28] = 0

Probability: 0.1

predict[36][15] = 0

Probability: 0.0

predict[36][31] = 0

Probability: 0.15

predict[38][36] = 0

Probability: 0.1

predict[39][11] = 0

Probability: 0.1

predict[39][23] = 0

Probability: 0.1

predict[39][25] = 0

Probability: 0.05

predict[39][28] = 0

Probability: 0.1

predict[39][3

### Calculate TP (True Positive), FN (False Negative) and FP (False Positive)

In [447]:
# base and predict are square 2D arrays holding the ground truth and the predicted result.
# Duplication is symmetric (base[i][j] = base[j][i], predict[i][j] = predict[j][i]),
# so only the pairs with j < i are evaluated; the diagonal is never read.
def calc_tp_fn_fp(base, predict, verbose = False, cor_json_file_name = 'req_cor.json'):
    """Count TP / FN / FP / TN over all pairs ``j < i`` and label each pair.

    Args:
        base: Square matrix; ``base[i][j]`` is 1 for a known duplicate pair,
            0 otherwise.
        predict: Square matrix of the same size; 1 = predicted duplicate,
            0 = predicted not duplicate, -1 = pair was not queried.
        verbose: When True, print every compared pair.
        cor_json_file_name: JSON file with the similarity matrix. It is only
            loaded when a known duplicate turns out not to have been queried,
            to report the similarity of that pair.

    Returns:
        ``(tp, fn, fp, tn, predict_category)``. ``predict_category[i][j]`` (set
        symmetrically) is one of 'tp', 'fn-queried', 'fn-no-query', 'fp',
        'tn-queried', 'tn-no-query', or '' for the diagonal and for pairs with
        unexpected values. Unqueried pairs count as predicted negative.
    """
    data_num = len(base)
    assert data_num == len(predict)
    if data_num > 0:
        assert data_num == len(base[0]) == len(predict[0])

    print(f'data_num = {data_num}')

    tp = 0
    fn = 0
    fp = 0
    tn = 0
    predict_category = [["" for _ in range(data_num)] for _ in range(data_num)]

    # Loaded lazily: most runs on small test matrices never need it.
    cor_data = None

    for i in range(data_num):
        for j in range(i):
            if verbose:
                print(f'In {inspect.currentframe().f_code.co_name}: base[{i}][{j}] = {base[i][j]}, predict[{i}][{j}] = {predict[i][j]}')
            if base[i][j] == 1:
                if predict[i][j] == 1:
                    tp += 1
                    predict_category[i][j] = predict_category[j][i] = 'tp'
                elif predict[i][j] == 0:
                    fn += 1
                    predict_category[i][j] = predict_category[j][i] = 'fn-queried'
                elif predict[i][j] == -1:
                    fn += 1
                    predict_category[i][j] = predict_category[j][i] = 'fn-no-query'
                    if cor_data is None:
                        cor_data = load_array_from_json(cor_json_file_name)
                    print(f'For i = {i}, j = {j}, it is a duplicate in base, but we did not query')
                    print(f'cor_data[{i}][{j}] = {cor_data[i][j]}')
                else:
                    print(f'Abnormal: base[{i}][{j}]= {base[i][j]}, predict[{i}][{j}]= {predict[i][j]}, should not happen!!!')

            elif base[i][j] == 0:
                if predict[i][j] == 1:
                    fp += 1
                    predict_category[i][j] = predict_category[j][i] = 'fp'
                elif predict[i][j] == 0:
                    tn += 1
                    predict_category[i][j] = predict_category[j][i] = 'tn-queried'
                elif predict[i][j] == -1:
                    tn += 1
                    predict_category[i][j] = predict_category[j][i] = 'tn-no-query'
                else:
                    print(f'Abnormal: base[{i}][{j}]= {base[i][j]}, predict[{i}][{j}]= {predict[i][j]}, should not happen!!!')
            else:
                print(f'Abnormal: base[{i}][{j}]= {base[i][j]}, should not happen!!!')

    return tp, fn, fp, tn, predict_category

### 重建完整信息

### save json data line by line into a csv file

In [425]:
import csv
import json

# Write the JSON objects to a CSV file, one row per object
def save_jsons_array_into_csv(json_array, csv_file_path):
    """Write a list of dicts to ``csv_file_path`` as CSV.

    The keys of the first dict become the header; every dict is written as one
    row with ``csv.DictWriter``.

    Args:
        json_array: List of dicts. An empty list produces an empty file.
        csv_file_path: Destination path; an existing file is overwritten.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` when a later dict contains a
            key that the first dict does not have.
    """
    # Use the keys of the first object as the CSV header (none for an empty list).
    fieldnames = list(json_array[0].keys()) if json_array else []

    # UTF-8 is explicit so non-ASCII text does not depend on the platform's
    # default encoding; newline='' is what the csv module expects.
    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        if fieldnames:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(json_array)

    if fieldnames:
        print(f"JSON data has been written to {csv_file_path}")
    else:
        print(f"No records to write; {csv_file_path} was left empty")

### test save_jsons_array_into_csv

In [426]:
# Three sample records that share the same keys
json_array = [
    dict(name="Alice", age=30, city="New York"),
    dict(name="Bob", age=25, city="San Francisco"),
    dict(name="Charlie", age=35, city="Los Angeles"),
]

# Path of the CSV file to write
csv_file_path = 'output_json_in_csv.csv'

save_jsons_array_into_csv(json_array, csv_file_path)

JSON data has been written to output_json_in_csv.csv


### form all the records for tp, fn, and fp

In [477]:
def _parse_llm_content(content, verbose = False):
    """Extract (probability, analysis, new requirement) from one LLM answer; a field that is not found is None."""
    probability = None
    analysis_string = None
    new_requirement_string = None

    probability_match = re.search(r'\* Probability:\s*(\d+)%', content)
    if probability_match:
        probability = int(probability_match.group(1)) / 100.
        if verbose:
            print(f'probability = {probability}')

    # The analysis runs up to the "* New Requirement:" bullet, or to the end of the text.
    analysis_pattern = r"\* Analysis:(.*?)(?=\*\sNew Requirement:|\Z)"
    analysis_match = re.search(analysis_pattern, content, re.DOTALL)
    if analysis_match:
        analysis_string = analysis_match.group(1).strip()
        if verbose:
            print(f'analysis_string = {analysis_string}')

    new_requirement_pattern = r"\* New Requirement:(.*)"
    new_requirement_match = re.search(new_requirement_pattern, content, re.DOTALL)
    if new_requirement_match:
        new_requirement_string = new_requirement_match.group(1).strip()
        if verbose:
            print(f'new_requirement_string = {new_requirement_string}')

    return probability, analysis_string, new_requirement_string


def save_all_records_for_tp_fn_fp(src_req_csv_file_name, output_json_file_path,
                                  cor_json_file_name, csv_file_for_analysis,
                                  predict_category, include_tn_queried = False, verbose = False):
    """Write one CSV row per interesting pair, combining issues, similarity and LLM answer.

    Pairs ``(i, j)`` with ``j < i`` are walked in row order. Every pair whose
    category says it was queried ('tp', 'fn-queried', 'fp', 'tn-queried')
    consumes the next line of the LLM output file, so the walk order must be
    the one used when the categories were produced.

    Args:
        src_req_csv_file_name: Requirements CSV with the columns ``id``,
            ``subject`` and ``description``.
        output_json_file_path: JSON Lines file with one LLM answer per queried pair.
        cor_json_file_name: JSON file with the similarity matrix.
        csv_file_for_analysis: Destination CSV path.
        predict_category: Square matrix of category labels; its size defines
            the number of requirements.
        include_tn_queried: When True, 'tn-queried' pairs are written too.
            'tn-no-query' pairs are never written.
        verbose: When True, print the raw and parsed LLM answers.

    Returns:
        None. If an unknown category is met, a message is printed and nothing
        is written.

    Raises:
        IndexError: If the output file has fewer lines than queried pairs.
    """
    QUERIED = ('tp', 'fn-queried', 'fp', 'tn-queried')
    NOT_QUERIED = ('fn-no-query', 'tn-no-query')

    rows = []

    # Size comes from the argument, not from notebook globals, so the function
    # also works for matrices other than the ones currently held in the kernel.
    NUM_OF_REQ = len(predict_category)
    print(f'NUM_OF_REQ = {NUM_OF_REQ}')

    df = pd.read_csv(src_req_csv_file_name)
    output_data = load_json_in_lines(output_json_file_path)
    cor_data = load_array_from_json(cor_json_file_name)

    query_num = 0
    for i in range(NUM_OF_REQ):
        for j in range(i):
            cat = predict_category[i][j]

            # If queried, get probability, analysis and new requirement
            if cat in QUERIED:
                if query_num >= len(output_data):
                    raise IndexError(
                        f'pair ({i}, {j}) needs output line {query_num}, '
                        f'but the output file has only {len(output_data)} lines')
                data = output_data[query_num]
                content = data['response']['body']['choices'][0]['message']['content']
                if verbose:
                    print(f'data = {data}')
                    print(f'content = {content}')

                probability, analysis_string, new_requirement_string = _parse_llm_content(content, verbose)

                # shouldn't happen: report what is missing and store blanks
                # rather than values left over from a previous pair
                missing = []
                if probability is None:
                    missing.append('probability')
                if analysis_string is None:
                    missing.append('analysis')
                    analysis_string = ""
                if new_requirement_string is None:
                    missing.append('new requirement')
                    new_requirement_string = ""
                if missing:
                    missing_text = ', '.join(missing)
                    print(f'Abnormal!!! output line {query_num} for pair ({i}, {j}) has no {missing_text}')

                output_query_num = query_num
                query_num += 1

            elif cat in NOT_QUERIED:
                output_query_num = -1
                probability = -1
                analysis_string = new_requirement_string = ""
            else:
                # shouldn't happen
                print(f'Abnormal!!! predict_category[{i}][{j}] = {predict_category[i][j]}')
                return

            # we are most interested in tp, fp, fn-queried, fn-no-query (by default tn-queried not needed)
            if (cat in ('tp', 'fp', 'fn-queried', 'fn-no-query')) or ((cat == 'tn-queried') and include_tn_queried):
                issue1 = df.iloc[i].to_dict()
                issue2 = df.iloc[j].to_dict()
                row = {
                    'query num':output_query_num,
                    'prediction category':cat,
                    'index1':i,
                    'issue1 id':issue1['id'],
                    'issue1 subject':issue1['subject'],
                    'issue1 description':issue1['description'],
                    'index2':j,
                    'issue2 id':issue2['id'],
                    'issue2 subject':issue2['subject'],
                    'issue2 description':issue2['description'],
                    'consine similarity':cor_data[i][j],
                    'predict probability':probability,
                    'predict analysis':analysis_string,
                    'new requirement':new_requirement_string
                }
                rows.append(row)

    if not rows:
        # The CSV header is taken from the first row, so there is nothing to write.
        print(f'No records selected; {csv_file_for_analysis} was not written')
        return

    save_jsons_array_into_csv(rows, csv_file_for_analysis)

### Main: Calculate recall, precision and accuracy

In [478]:

def main(base, predict, num_examples,
         src_req_csv_file_name = 'all_issues_for_test.csv',
         output_json_file_path = 'test/output_files_1/req_batch_output_all.json',
         cor_json_file_name = 'req_cor.json',
         csv_file_for_analysis = 'test/output_files_1/final_for_analysis.csv'):
    """Compute recall, precision and accuracy, then export the per-pair records.

    Args:
        base: Square ground-truth matrix (1 = known duplicate pair).
        predict: Square prediction matrix (1 / 0 / -1 = duplicate / not
            duplicate / not queried).
        num_examples: Expected number of evaluated pairs; checked against
            ``tp + fn + fp + tn``.
        src_req_csv_file_name: Requirements CSV passed to the export step.
        output_json_file_path: LLM output file passed to the export step.
        cor_json_file_name: Similarity matrix file passed to the export step.
        csv_file_for_analysis: Destination CSV of the export step.

    Raises:
        AssertionError: If the counted pairs do not add up to ``num_examples``.
    """
    tp, fn, fp, tn, predict_category = calc_tp_fn_fp(base, predict)
    print(f'num_examples = {num_examples}')
    print(f'tp = {tp}, fn = {fn}, fp = {fp}, tn = {tn}')
    assert (tn + tp + fn + fp == num_examples)

    # A metric whose denominator is zero is undefined; report it as nan
    # instead of failing on an unassigned variable in the summary line.
    recall = precission = accuracy = float('nan')

    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        print(f'tp + fn = {tp + fn}!\n')

    if (tp + fp) > 0:
        precission = tp / (tp + fp)
    else:
        print(f'tp + fp = {tp + fp}!\n')

    if num_examples > 0:
        accuracy = (tp + tn) / num_examples

    print(f'Recall: {recall}, Precission: {precission}, Accuracy: {accuracy}')

    save_all_records_for_tp_fn_fp(src_req_csv_file_name,
                                  output_json_file_path,
                                  cor_json_file_name,
                                  csv_file_for_analysis,
                                  predict_category,
                                  include_tn_queried = False,
                                  verbose = False)




### Call main with real data

In [479]:
# Take the number of requirements from the matrix itself instead of repeating it
# as a literal, so the pair count stays right when the data set changes.
EXAMPLE_NUM = len(base)
# Number of unordered pairs: n * (n - 1) / 2, computed with integer arithmetic.
main(base, predict, EXAMPLE_NUM * (EXAMPLE_NUM - 1) // 2)

data_num = 743
For i = 102, j = 35, it is a duplicate in base, but we did not query
cor_data[102][35] = 0.4349779613993665
For i = 160, j = 94, it is a duplicate in base, but we did not query
cor_data[160][94] = 0.4915986117157502
For i = 203, j = 19, it is a duplicate in base, but we did not query
cor_data[203][19] = 0.49581300277629514
For i = 227, j = 196, it is a duplicate in base, but we did not query
cor_data[227][196] = 0.380224817842591
For i = 237, j = 154, it is a duplicate in base, but we did not query
cor_data[237][154] = 0.4379590846135061
For i = 238, j = 191, it is a duplicate in base, but we did not query
cor_data[238][191] = 0.3816480687187493
For i = 249, j = 145, it is a duplicate in base, but we did not query
cor_data[249][145] = 0.418408527087397
For i = 249, j = 182, it is a duplicate in base, but we did not query
cor_data[249][182] = 0.4690545853092478
For i = 249, j = 189, it is a duplicate in base, but we did not query
cor_data[249][189] = 0.4918223757174094
Fo

### call main with test data set

In [302]:
# Toy 4x4 ground truth; only the entries below the diagonal are evaluated.
test_base = [
    [1, 1, 0, 1],
    [1, 1, 1, 0],
    [0, 1, 1, 1],
    [1, 0, 1, 1],
]
# Toy 4x4 prediction with no unqueried (-1) pairs.
test_predict = [
    [1, 0, 0, 0],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
    [0, 1, 1, 1],
]

# 4 items give 4 * 3 / 2 = 6 unordered pairs.
test_pair_count = len(test_base) * (len(test_base) - 1) // 2
main(test_base, test_predict, test_pair_count)

data_num = 4
num_examples = 6
tp = 2, fn = 2, fp = 1, tn = 1
Recall: 0.5, Precission: 0.6666666666666666, Accuracy: 0.5
