In [15]:
import re
import numpy as np

def remove_docstrings(code):
    """Strip every triple-quoted string from ``code``.

    Blocks delimited by three double quotes are removed first, then blocks
    delimited by three single quotes. Matching is non-greedy and spans
    newlines. Any triple-quoted literal is removed, not only docstrings.

    Args:
        code: Source text to clean.

    Returns:
        The text with all matched triple-quoted blocks replaced by ''.
    """
    # Order matters: the double-quote pass must run before the single-quote pass.
    triple_quoted_patterns = (r'""".*?"""', r"'''.*?'''")
    stripped = code
    for pattern in triple_quoted_patterns:
        stripped = re.compile(pattern, re.DOTALL).sub('', stripped)
    return stripped

def cosine_similarity_matrix(nl_features, code_features):
    """Return the pairwise cosine similarity between two sets of row vectors.

    Args:
        nl_features: 2-D array-like, one vector per row.
        code_features: 2-D array-like with the same number of columns.

    Returns:
        Array of shape ``(len(nl_features), len(code_features))`` whose entry
        ``[i, j]`` is the cosine similarity of ``nl_features[i]`` and
        ``code_features[j]``. Pairs involving a zero-norm vector yield 0
        instead of NaN.
    """
    nl_features = np.asarray(nl_features)
    code_features = np.asarray(code_features)

    # Row-wise L2 norms, kept as column vectors so they broadcast below.
    nl_norms = np.linalg.norm(nl_features, axis=1, keepdims=True)
    code_norms = np.linalg.norm(code_features, axis=1, keepdims=True)

    dot_product = np.dot(nl_features, code_features.T)
    denominator = nl_norms * code_norms.T

    # Divide only where the denominator is non-zero; all other entries keep
    # the 0 they were initialised with, so no 0/0 warning or NaN is produced.
    cosine_similarity = np.zeros(
        dot_product.shape, dtype=np.result_type(dot_product, denominator)
    )
    np.divide(dot_product, denominator, out=cosine_similarity, where=denominator > 0)

    return cosine_similarity

In [3]:
import json

# Load the training split: every non-empty line of the file is parsed as one
# JSON document. The file is streamed line by line (no readlines() copy), the
# encoding is pinned to UTF-8 like the other loaders in this notebook, and
# blank lines are skipped so they cannot make json.loads fail.
with open("/home/yiming/cophi/projects/fork/CodeBERT/GraphCodeBERT/codesearch/dataset/python/train.jsonl", "r", encoding="utf-8") as f:
    train_dataset = [json.loads(line) for line in f if line.strip()]


In [4]:
import json
import os

# Tokenized code side of the training data (file name: tokenized_code_tokens_train.json).
json_file_path = '/home/yiming/cophi/training_dynamic/gcb_tokens_temp/Model/Epoch_1/tokenized_code_tokens_train.json'
with open(json_file_path, 'r', encoding='utf-8') as f:
    code_tokens_data = json.loads(f.read())

# Tokenized comment side (file name: tokenized_comment_tokens_train.json).
# json_file_path is rebound, so after this cell it points at the comment file.
json_file_path = '/home/yiming/cophi/training_dynamic/gcb_tokens_temp/Model/Epoch_1/tokenized_comment_tokens_train.json'
with open(json_file_path, 'r', encoding='utf-8') as f:
    nl_tokens_data = json.loads(f.read())

In [5]:
import json

input_path = "/home/yiming/cophi/projects/fork/CodeBERT/GraphCodeBERT/codesearch/sorted_label_human_auto.jsonl"

# Each line holds one JSON object, possibly followed by a trailing comma that
# must be stripped before parsing.
with open(input_path, 'r', encoding='utf-8') as file:
    label_records = [json.loads(line.strip().rstrip(',')) for line in file]

# Split the records into two parallel lists: sample ids and their match annotations.
idx_list = [record['idx'] for record in label_records]
match_list = [record['match'] for record in label_records]

In [7]:
# A single sample index drives every lookup below; all three names share it.
comment_ind = 3896
comment_id = comment_ind
code_ind = comment_ind
# Alternative code sample tried earlier: code_ind = 3032

In [8]:
# Token entries of the selected sample, one from each tokenized file.
code_data, comment_data = code_tokens_data[code_ind], nl_tokens_data[comment_ind]

In [10]:
import numpy as np

# Both attention dumps sit in the same feature directory.
# NOTE(review): the variable names say "valid" but the files are train_* — confirm which split is intended.
ori_valid_code_attention_output_path, ori_valid_nl_attention_output_path = (
    os.path.join("/home/yiming/cophi/training_dynamic/features/retri", file_name)
    for file_name in ('train_code_attention_retri.npy', 'train_nl_attention_retri.npy')
)

ori_valid_code_attention_feature = np.load(ori_valid_code_attention_output_path)
ori_valid_nl_attention_feature = np.load(ori_valid_nl_attention_output_path)

In [11]:
# Fixed palette of ten high-contrast colors used to tint matched token groups.
high_contrast_colors = [
    "#FF0000",  # red
    "#00FF00",  # lime
    "#0000FF",  # blue
    "#FFFF00",  # yellow
    "#FF00FF",  # magenta
    "#00FFFF",  # cyan
    "#800000",  # maroon
    "#008000",  # green
    "#000080",  # navy
    "#808000",  # olive
]

In [12]:
# Positions in the label file whose idx equals the selected sample.
indices = [i for i, value in enumerate(idx_list) if value == comment_id]

# Match annotations stored at those positions.
match_values = [match_list[i] for i in indices]

# Report the id actually being looked up (the message used to hard-code 7045).
print(f"Indices of {comment_id} in idx_list:", indices)
print("Corresponding values in match_list:", match_values)


def _expand_intervals(flat_intervals):
    """Expand a flat [start, end, start, end, ...] list into all covered indices (ends inclusive)."""
    expanded = []
    for start, end in zip(flat_intervals[::2], flat_intervals[1::2]):
        expanded.extend(range(start, end + 1))
    return expanded


# One entry per annotated match: the comment-side and code-side token indices.
comment_list = []
code_list = []

if not match_values:
    # Without a label entry there is nothing to highlight; keep both lists
    # empty instead of failing with IndexError on match_values[0].
    print(f"No label entry found for idx {comment_id}; no matches to expand.")

# Only the first label entry is used. It is expected to be a list of
# (comment_intervals, code_intervals) pairs.
for comment_intervals, code_intervals in (match_values[0] if match_values else []):
    comment_list.append(_expand_intervals(comment_intervals))
    code_list.append(_expand_intervals(code_intervals))

print("Comment indices list:", comment_list)
print("Code indices list:", code_list)


Indices of 7045 in idx_list: [214]
Corresponding values in match_list: [[[[0, 2], [17, 18]]]]
Comment indices list: [[0, 1, 2]]
Code indices list: [[17, 18]]


In [13]:
import html

import numpy as np
from collections import deque
from IPython.display import display, HTML

# Render the docstring of the selected sample with each matched token sized by
# its attention score and tinted by the match group it belongs to.
# NOTE(review): the code-rendering cell repeats this logic almost verbatim;
# a shared helper function would remove the duplication.

# Attention scores for the selected comment.
# NOTE(review): two leading entries are dropped here ([1:] twice) while the
# token list below drops only one — confirm this offset is intentional.
array = ori_valid_nl_attention_feature[comment_id][1:]
array = array[1:]

token_list1 = nl_tokens_data[comment_id][1:]
doc_snippet = train_dataset[comment_id]["docstring"]

# Min-max normalise the scores to [0, 1]. A constant score vector has zero
# range; map it to 0 instead of dividing by zero and emitting NaN font sizes.
score_range = array.max() - array.min()
if score_range > 0:
    normalized_contributions = (array - array.min()) / score_range
else:
    normalized_contributions = np.zeros_like(array, dtype=float)

# Pair every token with its score. "Ġ" is removed so tokens can be compared
# with the raw text (presumably it is the tokenizer's space marker — confirm).
tokens_with_contributions = deque([(token.replace("Ġ", ""), contrib) for token, contrib in zip(token_list1, normalized_contributions)])

# Token index -> color; every group in comment_list gets one palette color,
# cycling through the palette if there are more groups than colors.
color_map = {}
for i, group_indices in enumerate(comment_list):
    color = high_contrast_colors[i % len(high_contrast_colors)]
    for idx in group_indices:
        color_map[idx] = color

min_font_px, max_font_px = 8, 16

html_string = "<pre>"
buffer = ""
current_index = 0  # index of the token currently waiting to be matched

# Walk the docstring character by character, matching it against the token queue.
for char in doc_snippet:
    if char == "\n":
        # Line break: flush whatever is pending (unstyled) and start a new line.
        html_string += html.escape(buffer) + "<br>"
        buffer = ""
    elif tokens_with_contributions:
        token, contrib = tokens_with_contributions[0]
        buffer += char

        if buffer == token:
            # Full token matched: scale the font size linearly with its score.
            font_size = min_font_px + (max_font_px - min_font_px) * contrib
            color = color_map.get(current_index, "black")
            # Escape the text so characters such as <, > and & are shown literally.
            html_string += f'<span style="font-size: {font_size}px; color: {color};">{html.escape(buffer)}</span>'
            buffer = ""
            tokens_with_contributions.popleft()
            current_index += 1
        elif not token.startswith(buffer):
            # The pending text cannot become the token: emit its first
            # character and keep the rest for the next comparison.
            color = color_map.get(current_index, "black")
            html_string += f'<span style="color: {color};">{html.escape(buffer[0])}</span>'
            buffer = buffer[1:]

    else:
        # Token queue exhausted: emit the remaining characters one by one.
        color = color_map.get(current_index, "black")
        html_string += f'<span style="color: {color};">{html.escape(char)}</span>'

html_string += html.escape(buffer)  # flush any pending characters
html_string += "</pre>"

# Show the styled text in the notebook.
display(HTML(html_string))

In [16]:
import html

# Render the code of the selected sample (triple-quoted strings removed) with
# each matched token sized by its attention score and tinted by match group.
# NOTE(review): the comment-rendering cell repeats this logic almost verbatim;
# a shared helper function would remove the duplication.
code_snippet = remove_docstrings(train_dataset[code_ind]["code"])
token_list2 = code_tokens_data[code_ind][1:]

# Attention scores for the selected code sample; the first entry is dropped,
# matching the [1:] applied to the token list above.
code_attention_feature = ori_valid_code_attention_feature[code_ind][1:]

# Min-max normalise the scores to [0, 1]. A constant score vector has zero
# range; map it to 0 instead of dividing by zero and emitting NaN font sizes.
code_score_range = code_attention_feature.max() - code_attention_feature.min()
if code_score_range > 0:
    code_normalized_contributions = (code_attention_feature - code_attention_feature.min()) / code_score_range
else:
    code_normalized_contributions = np.zeros_like(code_attention_feature, dtype=float)

# Pair every token with its score. "Ġ" is removed so tokens can be compared
# with the raw text (presumably it is the tokenizer's space marker — confirm).
tokens_with_contributions = deque([(token.replace("Ġ", ""), contrib) for token, contrib in zip(token_list2, code_normalized_contributions)])

# Token index -> color; every group in code_list gets one palette color,
# cycling through the palette if there are more groups than colors.
color_map = {}
for i, group_indices in enumerate(code_list):
    color = high_contrast_colors[i % len(high_contrast_colors)]
    for idx in group_indices:
        color_map[idx] = color

min_font_px, max_font_px = 8, 16

html_string = "<pre>"
buffer = ""
current_index = 0  # index of the token currently waiting to be matched

# Walk the code character by character, matching it against the token queue.
for char in code_snippet:
    if char == "\n":
        # Line break: flush whatever is pending (unstyled) and start a new line.
        html_string += html.escape(buffer) + "<br>"
        buffer = ""
    elif tokens_with_contributions:
        token, contrib = tokens_with_contributions[0]
        buffer += char

        if buffer == token:
            # Full token matched: scale the font size linearly with its score.
            font_size = min_font_px + (max_font_px - min_font_px) * contrib
            color = color_map.get(current_index, "black")
            # Escape the text so code characters such as <, > and & are shown
            # literally instead of being parsed as markup.
            html_string += f'<span style="font-size: {font_size}px; color: {color};">{html.escape(buffer)}</span>'
            buffer = ""
            tokens_with_contributions.popleft()
            current_index += 1
        elif not token.startswith(buffer):
            # The pending text cannot become the token: emit its first
            # character and keep the rest for the next comparison.
            color = color_map.get(current_index, "black")
            html_string += f'<span style="color: {color};">{html.escape(buffer[0])}</span>'
            buffer = buffer[1:]

    else:
        # Token queue exhausted: emit the remaining characters one by one.
        color = color_map.get(current_index, "black")
        html_string += f'<span style="color: {color};">{html.escape(char)}</span>'

html_string += html.escape(buffer)  # flush any pending characters
html_string += "</pre>"

# Show the styled code in the notebook.
display(HTML(html_string))