# Step 1: Utility functions to prepare the requirement data file

# Get Redmine requirement data

In [23]:
import pandas as pd
import re
import csv
import json
import requests
import time

# Relation phrases, as they appear in the 'Related issues' CSV column, that point
# at another issue worth downloading. They are tried in this order.
_DUPLICATE_RELATION_PATTERNS = (
    re.compile(r'Is duplicate of #(\d+)'),
    re.compile(r'Has duplicate #(\d+)'),
    re.compile(r'Copied from #(\d+)'),
)


def _extract_duplicate_issue_id(relation_text):
    """Return the issue number named by a duplicate/copy relation as an int, or None."""
    for pattern in _DUPLICATE_RELATION_PATTERNS:
        match = pattern.search(relation_text)
        if match:
            return int(match.group(1))
    return None


def _duplicate_counterpart_ids(relations, issue_id):
    """Return the issue on the other side of every 'duplicates' relation of issue_id."""
    counterpart_ids = []
    for rel in relations:
        # Only 'duplicates' relations are of interest; others (e.g. 'relates') are ignored.
        if rel['relation_type'] != 'duplicates':
            continue
        # A relation stores both ends (issue_id -> issue_to_id); pick whichever end
        # is NOT the issue we fetched. Both sides are compared as ints: the id
        # parsed from the CSV text would never equal a numeric JSON id otherwise.
        if int(rel['issue_id']) == issue_id:
            counterpart_ids.append(rel['issue_to_id'])
        else:
            counterpart_ids.append(rel['issue_id'])
    return counterpart_ids


def query_and_save_duplicated_issues(src_file, dst_file, verbose = False,
                                     base_url = 'https://www.redmine.org', timeout = 30):
    """Download every issue referenced by a duplicate/copy relation and save them as CSV.

    Reads ``src_file`` (a CSV with a 'Related issues' column), looks in each row
    for relations of the form 'Is duplicate of #N', 'Has duplicate #N' or
    'Copied from #N', fetches issue N from ``<base_url>/issues/N.json?include=relations``
    and writes one row per successfully fetched issue to ``dst_file`` with the
    columns id, subject, author, status, related issues, description.

    Args:
        src_file: Path of the input CSV.
        dst_file: Path of the CSV to create (overwritten if it exists).
        verbose: When true, print the head of the input and detailed debug
            output for every 20th record.
        base_url: Root URL of the Redmine instance to query.
        timeout: Per-request timeout in seconds passed to ``requests.get``.

    Raises:
        KeyError: If the input CSV has no 'Related issues' column.

    Failed requests (network error or non-200 status) are reported with a
    printed message and skipped, so one bad issue does not lose the whole run.
    """
    # Load the source CSV.
    df = pd.read_csv(src_file)

    # Show the first few requirement records.
    if verbose:
        print(df.head())

    # One dict per downloaded issue.
    issue_records = []

    # Number the records 1..n by position, independent of the DataFrame index.
    for record_no, (_, row) in enumerate(df.iterrows(), start=1):
        if record_no % 20 == 0:
            print(f'{record_no} records are processed.')

        # Detailed debug output is limited to every 20th record.
        debug_info_flag = bool(verbose) and record_no % 20 == 0

        related_issues = row['Related issues']

        # The cell holds a comma-separated list of relations; an empty cell is
        # read by pandas as a float NaN rather than a string.
        if isinstance(related_issues, str):
            related_issue_strings = related_issues.split(',')
        else:
            if debug_info_flag:
                print(f"Requirement {record_no}: related issues info is NOT a string")
            related_issue_strings = []

        if debug_info_flag:
            print(f"\n\nProcessing Requirement NO.{record_no}: \n")
            print(f"Related issues: {related_issues}")
            print(f"After split, related_issue_strings = {related_issue_strings}")

        for issue_string in related_issue_strings:
            issue_id = _extract_duplicate_issue_id(issue_string)
            if issue_id is None:
                # This relation is not a duplicate/copy relation.
                if debug_info_flag:
                    print(f"Requirement {record_no}: substring {issue_string}: is NOT duplicate relation")
                continue

            # Fetch the related issue, including its own relations.
            url = f"{base_url.rstrip('/')}/issues/{issue_id}.json?include=relations"
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                print(f"Error: For Requirement {record_no}, when trying to get dup issue, request failed: {exc}")
                continue

            if response.status_code != 200:
                print(f"Error: For Requirement {record_no}, when trying to get dup issue, get {response.status_code}")
                continue

            data_out = response.json()
            data_issue = data_out['issue']
            if debug_info_flag:
                formatted_json = json.dumps(data_out, indent=4)
                print(f'data_out = {formatted_json}')

            # Tolerate payloads that carry no 'relations' key at all.
            dup_ids = _duplicate_counterpart_ids(data_issue.get('relations', []), issue_id)
            if debug_info_flag:
                for dup_id in dup_ids:
                    print(f'\n*** dup_id = {dup_id}')
            dup_str = ', '.join(f'Is duplicate of #{dup_id}' for dup_id in dup_ids)

            # Keep only the fields needed downstream.
            issue_records.append({
                'id': data_issue['id'],
                'subject': data_issue['subject'],
                'author': data_issue['author']['name'],
                'status': data_issue['status']['name'],
                'related issues': dup_str,
                'description': data_issue['description'],
            })

    # Write the downloaded issues, header first.
    fieldnames = ["id", "subject", "author", "status", "related issues", "description"]
    with open(dst_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(issue_records)


In [24]:
# Input CSV read by query_and_save_duplicated_issues; it must contain a
# 'Related issues' column.
src_file = 'issues-closed-duplicated.csv'
# Output CSV written by the function: one row per related issue it downloaded.
dst_file = 'dup_issues_downloaded.csv'

# Performs one HTTP GET per matching relation, so this cell needs network access.
query_and_save_duplicated_issues(src_file, dst_file, verbose = False)

20 records are processed.
40 records are processed.
60 records are processed.
80 records are processed.
100 records are processed.
120 records are processed.
140 records are processed.
160 records are processed.
180 records are processed.
200 records are processed.


In [None]:
### Sort and de-duplicate a CSV file

In [None]:
def sort_and_dedup_csv(input_csv_name, output_csv_name, sort_col_name, dedup_col_name):
    """Sort a CSV file by one column, drop duplicate rows by another, and save it.

    Args:
        input_csv_name: Path of the CSV file to read.
        output_csv_name: Path of the CSV file to write (without the index column).
        sort_col_name: Column (or list of columns) to sort by.
        dedup_col_name: Column (or list of columns) that identifies duplicates;
            for each distinct value only the first row in sorted order is kept.
    """
    # Load the input CSV.
    df = pd.read_csv(input_csv_name)

    # A stable sort keeps rows with equal sort keys in their original file
    # order, so which duplicate survives below is deterministic.
    df_sorted = df.sort_values(by=sort_col_name, kind='mergesort')

    # De-duplicate on the requested column (previously the sort column was
    # used here by mistake and dedup_col_name was ignored).
    df_unique = df_sorted.drop_duplicates(subset=dedup_col_name)

    # Save the result to the output CSV.
    df_unique.to_csv(output_csv_name, index=False)

In [None]:
### Used to sort the records and remove duplicates. Running it more than once does no harm.

In [None]:
# sort_and_dedup_csv('all_issues_for_test-backup.csv', 'all_issues_for_test.csv', 'id', 'id')