In [None]:
import os
import pickle
import json
from transformers import AutoTokenizer
import numpy as np
import hanlp

In [None]:
def find_all_pkl_files(directories, exp, sample_type):
    """Index files under one or more directory trees by experiment, sample type and user.

    File names are split on ``'_'``: the first part is the sample type, the
    second part is the experiment, and the last part (up to its first ``'.'``)
    is the user number, stored under the key ``"user_<number>"``.

    Args:
        directories: A single directory path (str) or an iterable of paths.
        exp: Iterable of experiment names to keep.
        sample_type: Iterable of sample-type names to keep.

    Returns:
        dict: ``{experiment: {sample_type: {user_id: [path, ...]}}}``. Every
        requested experiment / sample-type pair is present, even when no file
        matched. Results from all directories are merged into one mapping.
    """
    if isinstance(directories, str):
        directories = [directories]

    # Build the result once, before walking, so that several directories are
    # merged and an empty ``directories`` still returns a well-formed mapping.
    wanted_samples = list(sample_type)
    pkl_files = {name: {sample: {} for sample in wanted_samples} for name in exp}

    for directory in directories:
        for root, _dirs, files in os.walk(directory):
            for file in files:
                parts = file.split('_')
                if len(parts) < 2:
                    # Not enough '_'-separated parts to carry a sample type and an experiment.
                    continue
                # Distinct local names: the parameters must not be rebound,
                # otherwise the membership checks below become meaningless.
                file_sample, file_experiment = parts[0], parts[1]
                if file_experiment not in pkl_files:
                    continue
                if file_sample not in pkl_files[file_experiment]:
                    continue
                user_id = f"user_{parts[-1].split('.')[0]}"
                # os.path.join keeps the path portable across platforms.
                pkl_path = os.path.join(root, file)
                # Keep every matching path; the first one found stays at index 0.
                pkl_files[file_experiment][file_sample].setdefault(user_id, []).append(pkl_path)
    return pkl_files

def find_common_keys(A, B):
    """Return the third-level keys that two three-level nested dicts share.

    For every first-level key and second-level key present in both ``A`` and
    ``B``, the keys of the two innermost dicts are intersected. Only non-empty
    intersections are recorded.

    Returns:
        dict: ``{outer_key: {inner_key: set_of_shared_keys}}``.
    """
    shared = {}
    for top, a_branch in A.items():
        if top not in B:
            continue
        b_branch = B[top]
        for mid, a_leaf in a_branch.items():
            if mid not in b_branch:
                continue
            overlap = set(a_leaf.keys()) & set(b_branch[mid].keys())
            if overlap:
                # Create the first-level entry lazily, only when something is shared.
                shared.setdefault(top, {})[mid] = overlap
    return shared

def find_common_keys_dicts(dict_list):
    """Return the third-level keys shared by every three-level nested dict in a list.

    The first dict drives the traversal. A first-level / second-level key pair
    is considered only when every dict in ``dict_list`` contains it, and the
    innermost key sets of all dicts are then intersected. Only non-empty
    intersections are recorded.

    Returns:
        dict: ``{outer_key: {inner_key: set_of_shared_keys}}``.
    """
    reference = dict_list[0]
    shared = {}
    for top in reference:
        if any(top not in candidate for candidate in dict_list):
            continue
        for mid in reference[top]:
            if any(mid not in candidate[top] for candidate in dict_list):
                continue
            # Intersect the innermost key sets of all dicts in one call.
            overlap = set.intersection(
                *(set(candidate[top][mid].keys()) for candidate in dict_list)
            )
            if overlap:
                shared.setdefault(top, {})[mid] = overlap
    return shared

def generate_text_with_scores_html(tensor, text, output_path, normalize=True, method='min-max', window_size=10):
    """Write an HTML file that shows each character of ``text`` with its score above it.

    Every character is coloured on a blue-to-red scale according to its score,
    and characters with a score greater than zero get a black border. The
    text is laid out 50 characters per line.

    Args:
        tensor (numpy.ndarray): Array of shape (1, N), where N is ``len(text)``.
        text (str): The text; its length must match the second axis of ``tensor``.
        output_path (str): Path of the HTML file to write.
        normalize (bool): Whether to normalise the scores first. Defaults to True.
        method (str): ``'min-max'`` (default), ``'mean'`` (subtract the mean) or
            ``'moving-average'`` (subtract a moving average).
        window_size (int): Moving-average window, used only by
            ``'moving-average'``. It is capped at the text length.

    Returns:
        None

    Raises:
        AssertionError: If the tensor and the text differ in length.
        ValueError: If ``method`` is not one of the supported names.
    """
    import html  # function-scope import: only needed to escape characters below

    # The tensor and the text must describe the same number of characters.
    assert tensor.shape[1] == len(text), "张量和文本的长度不匹配！"

    scores = tensor[0]

    if normalize:
        if method not in ('min-max', 'mean', 'moving-average'):
            raise ValueError("未知的归一化方法。请使用 'min-max', 'mean', 或 'moving-average'。")
        if len(text) > 0:  # nothing to normalise for an empty text
            if method == 'min-max':
                lowest = scores.min()
                span = scores.max() - lowest
                if span == 0:
                    # All scores are equal: avoid 0/0 (NaN) and map everything to 0.
                    scores = np.zeros_like(scores, dtype=float)
                else:
                    scores = (scores - lowest) / span
            elif method == 'mean':
                scores = scores - scores.mean()
            else:
                # mode='same' returns max(len(scores), window) samples, so a
                # window longer than the text would break the subtraction.
                window = max(1, min(window_size, len(text)))
                moving_mean = np.convolve(scores, np.ones(window) / window, mode='same')
                scores = scores - moving_mean

    def score_to_color(score):
        # Clamp to [0, 1] so that un-normalised or mean-centred scores still
        # produce valid 0-255 colour channels.
        clamped = min(max(score, 0), 1)
        r = int(255 * clamped)
        b = 255 - r
        return f'rgb({r}, 0, {b})'

    chars_per_line = 50  # characters rendered per output line
    spacing = "20px"     # horizontal gap between characters; adjust as needed

    # Collect fragments and join once instead of growing a string in the loop.
    pieces = ["<html><body style='font-family:monospace;'>\n"]
    for start_idx in range(0, len(text), chars_per_line):
        end_idx = start_idx + chars_per_line
        for char, score in zip(text[start_idx:end_idx], scores[start_idx:end_idx]):
            color = score_to_color(score)
            score_text = f"{score:.2f}"
            border_style = "border: 1px solid black;" if score > 0 else ""
            # Escape &, < and > so that such characters cannot break the markup.
            safe_char = html.escape(char, quote=False)
            pieces.append(
                f"<div style='display:inline-block; text-align:center; color:{color}; margin-right:{spacing}; {border_style}'>"
                f"<div style='font-size:0.5em;'>{score_text}</div>"
                f"<div>{safe_char}</div>"
                f"</div>"
            )
        pieces.append("<br>\n")
    pieces.append("</body></html>")

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write("".join(pieces))

    print(f"HTML文件已生成并保存在 {output_path}")

In [None]:
def decode_tokens_with_error_handling(tokenizer, token_ids):
    """Group token ids so that each group decodes without the "�" replacement character.

    A token that decodes cleanly on its own is kept as a bare element. A token
    whose decoding contains "�" is merged with the tokens that follow it until
    the merged decoding no longer contains "�" or the input runs out; the
    merged ids are stored as a list.

    Returns:
        list: A mix of bare token ids and lists of merged token ids.
    """
    grouped = []
    total = len(token_ids)
    pos = 0

    while pos < total:
        # Fast path: the token is decodable on its own.
        if "�" not in tokenizer.decode([token_ids[pos]]):
            grouped.append(token_ids[pos])
            pos += 1
            continue

        # Slow path: keep absorbing following tokens until the group decodes cleanly.
        group = [token_ids[pos]]
        pos += 1
        while pos < total:
            group.append(token_ids[pos])
            pos += 1
            if "�" not in tokenizer.decode(group):
                break
        # The group is stored whether or not it finally decoded cleanly.
        grouped.append(group)

    return grouped

def decode_tokens_with_att_handling(tokenizer, token_ids, att):
    """Group token ids (and their parallel ``att`` values) into cleanly decodable units.

    A token that decodes without the "�" replacement character is kept as a
    bare element together with its ``att`` value. Otherwise the token is merged
    with the tokens that follow it until the merged decoding no longer contains
    "�" or the input runs out; the merged ids and the matching ``att`` values
    are stored as two parallel lists.

    Returns:
        tuple: ``(tokens, atts)`` of equal length, where each element is either
        a bare value or a list of merged values.
    """
    grouped_tokens = []
    grouped_att = []
    total = len(token_ids)
    pos = 0

    while pos < total:
        # Fast path: the token is decodable on its own.
        if "�" not in tokenizer.decode([token_ids[pos]]):
            grouped_tokens.append(token_ids[pos])
            grouped_att.append(att[pos])
            pos += 1
            continue

        # Slow path: absorb following tokens, carrying their att values along.
        token_group = [token_ids[pos]]
        att_group = [att[pos]]
        pos += 1
        while pos < total:
            token_group.append(token_ids[pos])
            att_group.append(att[pos])
            pos += 1
            if "�" not in tokenizer.decode(token_group):
                break
        # The group is stored whether or not it finally decoded cleanly.
        grouped_tokens.append(token_group)
        grouped_att.append(att_group)

    return grouped_tokens, grouped_att
import re
import string
import unicodedata

def clean_up_tuples(tuple_list):
    """Reduce the text of each (text, value) pair to its Chinese characters.

    Pairs whose text still has content after filtering are returned with the
    filtered text; pairs whose text becomes empty are returned unchanged in a
    separate list.

    Returns:
        tuple: ``(processed_list, removed_list)``.
    """
    # Matches characters in the CJK ranges listed in the pattern.
    han_char = re.compile(r'[\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF]+')

    kept = []
    dropped = []
    for entry in tuple_list:
        raw_text, value = entry[0], entry[1]
        # Strip newlines, spaces and the variation-selector character first.
        stripped = raw_text.replace('\n', '').replace(' ', '').replace('️', '')
        # Then keep only the characters the pattern accepts.
        han_only = ''.join(filter(han_char.match, stripped))
        if han_only:
            kept.append((han_only, value))
        else:
            dropped.append((raw_text, value))

    return kept, dropped


def clean_text_list(text_list):
    """Reduce each string to its Chinese characters and drop strings that end up empty.

    Returns:
        list: The filtered strings, in their original order.
    """
    # Runs of characters in the CJK ranges listed in the pattern.
    han_runs = re.compile(r'[\u4E00-\u9FFF\u3400-\u4DBF\uF900-\uFAFF]+')

    # ASCII punctuation plus common Chinese punctuation marks.
    marks = string.punctuation + '《》（）【】、；：。，！？「」『』“”‘’'
    mark_re = re.compile(f"[{re.escape(marks)}]")

    def _only_han(raw):
        # Remove spaces, newlines and the variation-selector character, then punctuation.
        stripped = raw.replace(' ', '').replace('\n', '').replace('️', '')
        without_marks = mark_re.sub('', stripped)
        # Finally keep only the Chinese runs, joined together.
        return ''.join(han_runs.findall(without_marks))

    return [kept for kept in map(_only_han, text_list) if kept]
import numpy as np

def match_and_average_complex(tk_at, l_new_token):
    """Align tokenizer-level (text, value) pairs with segmented words and average the values.

    Both inputs are walked in parallel. Tokenizer texts and segmented words
    are concatenated on whichever side is currently shorter until the two
    concatenations are equal; the aligned text is then emitted together with
    the mean of the collected values.

    Args:
        tk_at: Sequence of ``(text, value)`` pairs.
        l_new_token: Sequence of segmented word strings.

    Returns:
        list: ``(aligned_text, mean_value)`` tuples.

    Raises:
        ValueError: If the two sequences cannot be aligned (the texts differ,
            or one side runs out first). The loop previously never ended in
            this situation.
    """
    l_new_token_attention = []
    index_tk_at = 0
    i = 0

    while i < len(l_new_token) and index_tk_at < len(tk_at):
        # Start a new alignment from the current word and the current tokenizer text.
        current_l_token = l_new_token[i]                                # e.g. '有'
        current_tk_combined, current_value = tk_at[index_tk_at]        # e.g. '有什么'
        current_values = [current_value]

        while True:
            progressed = False
            if len(current_tk_combined) < len(current_l_token) and index_tk_at + 1 < len(tk_at):
                # Tokenizer side is shorter: absorb the next tokenizer text and its value.
                index_tk_at += 1
                current_tk_combined += tk_at[index_tk_at][0]
                current_values.append(tk_at[index_tk_at][1])
                progressed = True
            if len(current_tk_combined) > len(current_l_token) and i + 1 < len(l_new_token):
                # Word side is shorter: absorb the next word.
                i += 1
                current_l_token += l_new_token[i]
                # NOTE(review): this re-adds the current tokenizer value, so it
                # is counted once per absorbed word in the mean — confirm this
                # weighting is intended.
                current_values.append(tk_at[index_tk_at][1])
                progressed = True
            if current_tk_combined == current_l_token:
                # Both sides now spell the same text: emit it and move on.
                averaged_value = np.mean(current_values)
                l_new_token_attention.append((current_tk_combined, averaged_value))
                index_tk_at += 1
                i += 1
                break
            if not progressed:
                # Neither side can be extended and the texts differ; another
                # pass would repeat this state forever.
                raise ValueError(
                    f"cannot align tokenizer text {current_tk_combined!r} "
                    f"with segmented text {current_l_token!r}"
                )

    return l_new_token_attention

In [None]:
# Index the per-user pickle files of each model by experiment and sample type.
# Raw strings keep the Windows backslashes literal: '\q', '\l' and '\g' are
# invalid escape sequences, which newer Python versions warn about.
# NOTE(review): `exps` and `sample_types` are not defined in the cells visible
# here — confirm they are set before this cell runs on a fresh kernel.
qwen = find_all_pkl_files(r'E:\qwen\qwen', exps, sample_types)
llama = find_all_pkl_files(r'E:\llama\llama', exps, sample_types)
glm = find_all_pkl_files(r'E:\glm\glm', exps, sample_types)

In [None]:
# Keep only the user ids that appear for all three models under the same
# experiment and sample type.
common_users = find_common_keys_dicts([qwen,llama,glm])
# Load a pretrained HanLP tokenizer; it is called further down on the decoded
# prompt text (presumably to segment it into words — confirm against HanLP docs).
HanLP = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH) 

                                             

In [None]:

# Models to process, keyed by name; only qwen is enabled, the other entries are commented out.
models = {'qwen':qwen}#,'llama':llama,'glm':glm,'mind':mind}



for model in models :
    if model == 'qwen':
        tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
        qwen_result = {exp: {sample: {} for sample in sample_types} for exp in exps}
    elif model == 'llama':
        tokenizer = AutoTokenizer.from_pretrained("D:\LLMs\Llama3-8B-Chinese-Chat")
        llama_result = {exp: {sample: {} for sample in sample_types} for exp in exps}
    elif model == 'glm':
        tokenizer = AutoTokenizer.from_pretrained('D:\LLMs\glm-4-9b-chat',trust_remote_code=True)
        glm_result = {exp: {sample: {} for sample in sample_types} for exp in exps}
    elif model == 'mind':
        tokenizer = AutoTokenizer.from_pretrained('D:\LLMs\MindChat-Qwen-7B-v2',trust_remote_code=True)
        mind_result = {exp: {sample: {} for sample in sample_types} for exp in exps}
    elif model == 'mc':
        tokenizer = AutoTokenizer.from_pretrained('D:\LLMs\MeChat',trust_remote_code=True)
        mc_result = {exp: {sample: {} for sample in sample_types} for exp in exps}
    for experiment in common_users:
        for sample_type in common_users[experiment]:
            users = common_users[experiment][sample_type]
            for user in users:
                if model == 'qwen':
                    qwen_result[experiment][sample_type][user] = {}
                elif model == 'llama':
                    llama_result[experiment][sample_type][user] = {}
                elif model == 'glm':
                    glm_result[experiment][sample_type][user] = {}
                elif model == 'mind':
                    mind_result[experiment][sample_type][user] = {}
                elif model == 'mc':
                    mc_result[experiment][sample_type][user] = {}
                #print(model,experiment,sample_type,user)
                with open(models[model][experiment][sample_type][user][0], 'rb') as f:
                    data = pickle.load(f)
                if model == 'qwen':
                    joint_attentions = data['joint_attentions']
                    model_inputs = data['model_inputs'].input_ids[0]
                    self_joint_attentions = data['self_joint_attention']
                    prompt_drift = {'OR2-REV': 67,'OR2': 78}
                    start = 0
                    end = 0
                    for i,token in enumerate(model_inputs):
                        decoded = tokenizer.decode(token)
                        #print(i, decoded)
                        if decoded == 'system':
                            start = i+2
                        if decoded == tokenizer.eos_token:
                            end = i
                            break
                    start = start + prompt_drift[experiment]
                    #seq_tokens = [tokenizer.decode([token]) for token in model_inputs[q_start:q_end]]
                    all_generated_joint_attentions_avg_last = np.array([att[1][:,:,prompt_drift[experiment]:end] for att in joint_attentions]).mean(axis=0)[-1] #对生成步骤求平均，取最后一个，[1,input_len]
                    last_self_attention = self_joint_attentions[-1][prompt_drift[experiment]:].reshape(1,-1) #取最后一层的自注意力 [1,input_len]
                    
                    all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],all_generated_joint_attentions_avg_last[0].tolist())
                    last_self_attention_token, last_self_attention_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],last_self_attention.tolist()[0])
                    all_generated_joint_attentions_avg_last_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in all_generated_joint_attentions_avg_last_att
                        ]
                    #assert len(all_generated_joint_attentions_avg_last_token) == len(all_generated_joint_attentions_avg_last_att)
                    last_self_attention_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in last_self_attention_att
                        ]
                    assert len(all_generated_joint_attentions_avg_last_token) == len(last_self_attention_token)
                    assert all_generated_joint_attentions_avg_last_token == last_self_attention_token
                    joint_att_pairs = [(tokenizer.decode(token), att) for token, att in zip(all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att)]
                    self_att_pairs = [(tokenizer.decode(token), att) for token, att in zip(last_self_attention_token, last_self_attention_att)]
                    qwen_result[experiment][sample_type][user]['join_att_before_seg'] = joint_att_pairs
                    qwen_result[experiment][sample_type][user]['self_att_before_seg'] = self_att_pairs
                    l_new_token = HanLP(''.join([tokenizer.decode(token) for token in all_generated_joint_attentions_avg_last_token]))
                    l_new_token_cleaned = clean_text_list(l_new_token)
                    cleaned_joint_att_paris, removed_joint_att_paris = clean_up_tuples(joint_att_pairs)
                    cleaned_self_att_paris, removed_self_att_paris = clean_up_tuples(self_att_pairs)
                    assert len(cleaned_joint_att_paris) == len(cleaned_self_att_paris)
                    assert len(removed_joint_att_paris) == len(removed_self_att_paris)
                    seged_joint_att_pairs = match_and_average_complex(cleaned_joint_att_paris, l_new_token_cleaned)
                    seged_self_att_pairs = match_and_average_complex(cleaned_self_att_paris, l_new_token_cleaned)
                    qwen_result[experiment][sample_type][user]['join_att_after_seg'] = seged_joint_att_pairs
                    qwen_result[experiment][sample_type][user]['self_att_after_seg'] = seged_self_att_pairs
                    qwen_result[experiment][sample_type][user]['removed_joint_att_paris'] = removed_joint_att_paris
                    qwen_result[experiment][sample_type][user]['removed_self_att_paris'] = removed_self_att_paris
                elif model == 'llama':
                    joint_attentions = data['joint_attentions']
                    model_inputs = data['model_inputs'][0]
                    self_joint_attentions = data['self_joint_attention']
                    prompt_drift = {'OR2-REV': 84,'OR2': 91}
                    start = 0
                    end = 0
                    for i,token in enumerate(model_inputs):
                        decoded = tokenizer.decode(token)
                        #print(i, decoded)
                        if decoded == 'system':
                            start = i+3
                        if decoded == tokenizer.eos_token:
                            end = i
                            break
                    start = start + prompt_drift[experiment]
                    #seq_tokens = [tokenizer.decode([token]) for token in model_inputs[q_start:q_end]]
                    all_generated_joint_attentions_avg_last = np.array([att[1][:,:,prompt_drift[experiment]:end] for att in joint_attentions]).mean(axis=0)[-1] #对生成步骤求平均，取最后一个，[1,input_len]
                    last_self_attention = self_joint_attentions[-1][prompt_drift[experiment]:].reshape(1,-1)
                    #assert len(all_generated_joint_attentions_avg_last) == len(last_self_attention[0])
                    all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'][0][start:end],all_generated_joint_attentions_avg_last[0].tolist())
                    last_self_attention_token, last_self_attention_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'][0][start:end],last_self_attention.tolist()[0])
                    all_generated_joint_attentions_avg_last_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in all_generated_joint_attentions_avg_last_att
                        ]
                    #assert len(all_generated_joint_attentions_avg_last_token) == len(all_generated_joint_attentions_avg_last_att)
                    last_self_attention_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in last_self_attention_att
                        ]
                    assert len(all_generated_joint_attentions_avg_last_token) == len(last_self_attention_token)
                    assert all_generated_joint_attentions_avg_last_token == last_self_attention_token
                    joint_att_pairs = [(tokenizer.decode(token), att) for token, att in zip(all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att)]
                    self_att_pairs = [(tokenizer.decode(token), att) for token, att in zip(last_self_attention_token, last_self_attention_att)]
                    llama_result[experiment][sample_type][user]['join_att_before_seg'] = joint_att_pairs
                    llama_result[experiment][sample_type][user]['self_att_before_seg'] = self_att_pairs
                    l_new_token = HanLP(''.join([tokenizer.decode(token) for token in all_generated_joint_attentions_avg_last_token]))
                    l_new_token_cleaned = clean_text_list(l_new_token)
                    cleaned_joint_att_paris, removed_joint_att_paris = clean_up_tuples(joint_att_pairs)
                    cleaned_self_att_paris, removed_self_att_paris = clean_up_tuples(self_att_pairs)
                    assert len(cleaned_joint_att_paris) == len(cleaned_self_att_paris)
                    assert len(removed_joint_att_paris) == len(removed_self_att_paris)
                    seged_joint_att_pairs = match_and_average_complex(cleaned_joint_att_paris, l_new_token_cleaned)
                    seged_self_att_pairs = match_and_average_complex(cleaned_self_att_paris, l_new_token_cleaned)
                    llama_result[experiment][sample_type][user]['join_att_after_seg'] = seged_joint_att_pairs
                    llama_result[experiment][sample_type][user]['self_att_after_seg'] = seged_self_att_pairs
                    llama_result[experiment][sample_type][user]['removed_joint_att_paris'] = removed_joint_att_paris
                    llama_result[experiment][sample_type][user]['removed_self_att_paris'] = removed_self_att_paris
                elif model == 'glm':
                    glm_result[experiment][sample_type][user] = {}
                    tokenizer = AutoTokenizer.from_pretrained('D:\LLMs\glm-4-9b-chat',trust_remote_code=True)
                    joint_attentions = data['joint_attentions']
                    model_inputs = data['model_inputs'].input_ids[0]
                    self_joint_attentions = data['self_joint_attention']
                    prompt_drift = {'OR2': 74}
                    start = 0
                    end = 0
                    for i,token in enumerate(model_inputs):
                        decoded = tokenizer.decode([token])
                        #print(i, decoded)
                        if decoded == '<|system|>':
                            start = i+2
                        if decoded == '<|assistant|>':
                            end = i
                            break
                    start = start + prompt_drift[experiment]
                    #seq_tokens = [tokenizer.decode([token]) for token in model_inputs[q_start:q_end]]
                    all_generated_joint_attentions_avg_last = np.array([att[1][:,:,prompt_drift[experiment]:end] for att in joint_attentions]).mean(axis=0)[-1]
                    last_self_attention = self_joint_attentions[-1][prompt_drift[experiment]:].reshape(1,-1)
                    #assert len(all_generated_joint_attentions_avg_last) == len(last_self_attention[0])
                    all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],all_generated_joint_attentions_avg_last[0].tolist())
                    last_self_attention_token, last_self_attention_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],last_self_attention.tolist()[0])
                    all_generated_joint_attentions_avg_last_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in all_generated_joint_attentions_avg_last_att
                        ]
                    #assert len(all_generated_joint_attentions_avg_last_token) == len(all_generated_joint_attentions_avg_last_att)
                    #print(len(all_generated_joint_attentions_avg_last_token),len(all_generated_joint_attentions_avg_last_att))
                    last_self_attention_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in last_self_attention_att
                        ]
                    assert len(all_generated_joint_attentions_avg_last_token) == len(last_self_attention_token)
                    assert all_generated_joint_attentions_avg_last_token == last_self_attention_token
                    joint_att_pairs = [
                            (tokenizer.decode([token]) if not isinstance(token, list) else tokenizer.decode(token), att)
                            for token, att in zip(all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att)
                        ]
                    self_att_pairs = [
                            (tokenizer.decode([token]) if not isinstance(token, list) else tokenizer.decode(token), att)
                            for token, att in zip(last_self_attention_token, last_self_attention_att)
                        ]
                    glm_result[experiment][sample_type][user]['join_att_before_seg'] = joint_att_pairs
                    glm_result[experiment][sample_type][user]['self_att_before_seg'] = self_att_pairs
                    token_list = [
                            tokenizer.decode([token]) if not isinstance(token, list) else tokenizer.decode(token)
                            for token in all_generated_joint_attentions_avg_last_token
                        ]
                    l_new_token = HanLP(''.join(token_list))
                    l_new_token_cleaned = clean_text_list(l_new_token)
                    cleaned_joint_att_paris, removed_joint_att_paris = clean_up_tuples(joint_att_pairs)
                    cleaned_self_att_paris, removed_self_att_paris = clean_up_tuples(self_att_pairs)
                    assert len(cleaned_joint_att_paris) == len(cleaned_self_att_paris)
                    assert len(removed_joint_att_paris) == len(removed_self_att_paris)
                    seged_joint_att_pairs = match_and_average_complex(cleaned_joint_att_paris, l_new_token_cleaned)
                    seged_self_att_pairs = match_and_average_complex(cleaned_self_att_paris, l_new_token_cleaned)
                    glm_result[experiment][sample_type][user]['join_att_after_seg'] = seged_joint_att_pairs
                    glm_result[experiment][sample_type][user]['self_att_after_seg'] = seged_self_att_pairs
                    glm_result[experiment][sample_type][user]['removed_joint_att_paris'] = removed_joint_att_paris
                    glm_result[experiment][sample_type][user]['removed_self_att_paris'] = removed_self_att_paris
                elif model == 'mind':
                    joint_attentions = data['joint_attentions']
                    model_inputs = data['model_inputs'].input_ids[0]
                    self_joint_attentions = data['self_joint_attention']
                    prompt_drift = {'OR2': 34}
                    start = 0
                    end = 0
                    for i,token in enumerate(model_inputs):
                        decoded = tokenizer.decode(token)
                        #print(i, decoded)
                        if decoded == 'user' or decoded == 'system':
                            start = i+2
                        if decoded ==  '<|im_end|>':
                            end = i
                            break
                    start = start + prompt_drift[experiment]
                    #print(start,end)
                    #seq_tokens = [tokenizer.decode([token]) for token in model_inputs[q_start:q_end]]
                    all_generated_joint_attentions_avg_last = np.array([att[1][:,:,prompt_drift[experiment]:end] for att in joint_attentions]).mean(axis=0)[-1]
                    last_self_attention = self_joint_attentions[-1][prompt_drift[experiment]:].reshape(1,-1)
                    #assert len(all_generated_joint_attentions_avg_last) == len(last_self_attention[0])
                    all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],all_generated_joint_attentions_avg_last[0].tolist())
                    last_self_attention_token, last_self_attention_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],last_self_attention.tolist()[0])
                    all_generated_joint_attentions_avg_last_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in all_generated_joint_attentions_avg_last_att
                        ]
                    #assert len(all_generated_joint_attentions_avg_last_token) == len(all_generated_joint_attentions_avg_last_att)
                    last_self_attention_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in last_self_attention_att
                        ]
                    assert len(all_generated_joint_attentions_avg_last_token) == len(last_self_attention_token)
                    assert all_generated_joint_attentions_avg_last_token == last_self_attention_token
                    joint_att_pairs = [(tokenizer.decode(token), att) for token, att in zip(all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att)]
                    self_att_pairs = [(tokenizer.decode(token), att) for token, att in zip(last_self_attention_token, last_self_attention_att)]
                    mind_result[experiment][sample_type][user]['join_att_before_seg'] = joint_att_pairs
                    mind_result[experiment][sample_type][user]['self_att_before_seg'] = self_att_pairs
                    l_new_token = HanLP(''.join([tokenizer.decode(token) for token in all_generated_joint_attentions_avg_last_token]))
                    l_new_token_cleaned = clean_text_list(l_new_token)
                    cleaned_joint_att_paris, removed_joint_att_paris = clean_up_tuples(joint_att_pairs)
                    cleaned_self_att_paris, removed_self_att_paris = clean_up_tuples(self_att_pairs)
                    assert len(cleaned_joint_att_paris) == len(cleaned_self_att_paris)
                    assert len(removed_joint_att_paris) == len(removed_self_att_paris)
                    seged_joint_att_pairs = match_and_average_complex(cleaned_joint_att_paris, l_new_token_cleaned)
                    seged_self_att_pairs = match_and_average_complex(cleaned_self_att_paris, l_new_token_cleaned)
                    mind_result[experiment][sample_type][user]['join_att_after_seg'] = seged_joint_att_pairs
                    mind_result[experiment][sample_type][user]['self_att_after_seg'] = seged_self_att_pairs
                    mind_result[experiment][sample_type][user]['removed_joint_att_paris'] = removed_joint_att_paris
                    mind_result[experiment][sample_type][user]['removed_self_att_paris'] = removed_self_att_paris
                elif model == 'mc':
                    joint_attentions = data['joint_attentions']
                    model_inputs = data['model_inputs'].input_ids[0]
                    self_joint_attentions = data['self_joint_attention']
                    prompt_drift = {'OR2': 35}
                    start = 0
                    end = 0
                    for i,token in enumerate(model_inputs):
                        decoded = tokenizer.decode(token)
                        #print(i, decoded)
                        if decoded == '问' and start == 0:
                            start = i+2
                    end = len(data['model_inputs'].input_ids[0])-5
                    start = start + prompt_drift[experiment]
                    #print(start,end)
                    #seq_tokens = [tokenizer.decode([token]) for token in model_inputs[q_start:q_end]]
                    all_generated_joint_attentions_avg_last = np.array([att[1][:,:,prompt_drift[experiment]:end] for att in joint_attentions]).mean(axis=0)[-1]
                    last_self_attention = self_joint_attentions[-1][prompt_drift[experiment]:].reshape(1,-1)
                    #assert len(all_generated_joint_attentions_avg_last) == len(last_self_attention[0])
                    all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],all_generated_joint_attentions_avg_last[0].tolist())
                    last_self_attention_token, last_self_attention_att = decode_tokens_with_att_handling(tokenizer,data['model_inputs'].input_ids[0][start:end],last_self_attention.tolist()[0])
                    all_generated_joint_attentions_avg_last_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in all_generated_joint_attentions_avg_last_att
                        ]
                    #assert len(all_generated_joint_attentions_avg_last_token) == len(all_generated_joint_attentions_avg_last_att)
                    last_self_attention_att = [
                            sum(att) / len(att) if isinstance(att, list) else att
                            for att in last_self_attention_att
                        ]
                    assert len(all_generated_joint_attentions_avg_last_token) == len(last_self_attention_token)
                    assert all_generated_joint_attentions_avg_last_token == last_self_attention_token
                    joint_att_pairs = [
                            (tokenizer.decode([token]) if not isinstance(token, list) else tokenizer.decode(token), att)
                            for token, att in zip(all_generated_joint_attentions_avg_last_token, all_generated_joint_attentions_avg_last_att)
                        ]
                    self_att_pairs = [
                            (tokenizer.decode([token]) if not isinstance(token, list) else tokenizer.decode(token), att)
                            for token, att in zip(last_self_attention_token, last_self_attention_att)
                        ]
                    mc_result[experiment][sample_type][user]['join_att_before_seg'] = joint_att_pairs
                    mc_result[experiment][sample_type][user]['self_att_before_seg'] = self_att_pairs
                    token_list = [
                            tokenizer.decode([token]) if not isinstance(token, list) else tokenizer.decode(token)
                            for token in all_generated_joint_attentions_avg_last_token
                        ]
                    l_new_token = HanLP(''.join(token_list))
                    l_new_token_cleaned = clean_text_list(l_new_token)
                    cleaned_joint_att_paris, removed_joint_att_paris = clean_up_tuples(joint_att_pairs)
                    cleaned_self_att_paris, removed_self_att_paris = clean_up_tuples(self_att_pairs)
                    assert len(cleaned_joint_att_paris) == len(cleaned_self_att_paris)
                    assert len(removed_joint_att_paris) == len(removed_self_att_paris)
                    seged_joint_att_pairs = match_and_average_complex(cleaned_joint_att_paris, l_new_token_cleaned)
                    seged_self_att_pairs = match_and_average_complex(cleaned_self_att_paris, l_new_token_cleaned)
                    mc_result[experiment][sample_type][user]['join_att_after_seg'] = seged_joint_att_pairs
                    mc_result[experiment][sample_type][user]['self_att_after_seg'] = seged_self_att_pairs
                    mc_result[experiment][sample_type][user]['removed_joint_att_paris'] = removed_joint_att_paris
                    mc_result[experiment][sample_type][user]['removed_self_att_paris'] = removed_self_att_paris

                    



In [None]:
from scipy.stats import zscore

# Qwen, experiment 'OR2', positive samples.
# For each user the joint and the self attention are z-scored separately, the
# two z-score vectors are added element-wise, and every combined score is
# collected under its segmented token.
qwen_positive = qwen_result['OR2']['positive']
qwen_result_analysis = {}
for user in qwen_positive:
    joint_pairs = qwen_positive[user]['join_att_after_seg']
    self_pairs = qwen_positive[user]['self_att_after_seg']
    words = [pair[0] for pair in joint_pairs]
    # Joint and self attention must describe the same token sequence.
    assert words == [pair[0] for pair in self_pairs]
    self_values = [pair[1] for pair in self_pairs]
    joint_values = [pair[1] for pair in joint_pairs]
    # zscore() yields arrays, so `+` is an element-wise sum here.
    combined_scores = zscore(joint_values) + zscore(self_values)
    for word, score in zip(words, combined_scores):
        qwen_result_analysis.setdefault(word, []).append(score)
# Reduce every token to (sum of its scores, number of occurrences).
qwen_result_analysis = {
    word: (sum(scores), len(scores))
    for word, scores in qwen_result_analysis.items()
}

    

In [None]:
from scipy.stats import zscore

# GLM, experiment 'OR2', positive samples.
# For each user the joint and the self attention are z-scored separately, the
# two z-score vectors are added element-wise, and every combined score is
# collected under its segmented token.
glm_positive = glm_result['OR2']['positive']
glm_result_analysis = {}
for user in glm_positive:
    joint_pairs = glm_positive[user]['join_att_after_seg']
    self_pairs = glm_positive[user]['self_att_after_seg']
    words = [pair[0] for pair in joint_pairs]
    # Joint and self attention must describe the same token sequence.
    assert words == [pair[0] for pair in self_pairs]
    self_values = [pair[1] for pair in self_pairs]
    joint_values = [pair[1] for pair in joint_pairs]
    # zscore() yields arrays, so `+` is an element-wise sum here.
    combined_scores = zscore(joint_values) + zscore(self_values)
    for word, score in zip(words, combined_scores):
        glm_result_analysis.setdefault(word, []).append(score)
# Reduce every token to (sum of its scores, number of occurrences).
glm_result_analysis = {
    word: (sum(scores), len(scores))
    for word, scores in glm_result_analysis.items()
}

# GLM, experiment 'OR2', positive samples.
# Per-token total of combined (joint + self) attention z-scores. Unlike the
# scipy-based variant, each user's mean/std are computed over the kept
# attention values plus the values stored under 'removed_*_att_paris'
# (presumably the pairs dropped by cleanup before segmentation - confirm
# against the code that builds glm_result).
glm_result_analysis_2 = {}
for user in glm_result['OR2']['positive']:
    user_record = glm_result['OR2']['positive'][user]
    join_att_after_seg = user_record['join_att_after_seg']
    self_att_after_seg = user_record['self_att_after_seg']
    removed_joint_att_paris = user_record['removed_joint_att_paris']
    removed_self_att_paris = user_record['removed_self_att_paris']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in join_att_after_seg] == [pair[0] for pair in self_att_after_seg]
    tokens = [pair[0] for pair in join_att_after_seg]
    self_att = [pair[1] for pair in self_att_after_seg]
    join_att = [pair[1] for pair in join_att_after_seg]
    removed_self_att = [pair[1] for pair in removed_self_att_paris]
    removed_joint_att = [pair[1] for pair in removed_joint_att_paris]
    # Statistics are taken over kept + removed values (list concatenation is
    # intended on these four lines).
    mean_self_att = np.mean(self_att + removed_self_att)
    mean_joint_att = np.mean(join_att + removed_joint_att)
    std_dev_self_att = np.std(self_att + removed_self_att)
    std_dev_joint_att = np.std(join_att + removed_joint_att)
    z_scores_joint_att = [(att - mean_joint_att) / std_dev_joint_att for att in join_att]
    z_scores_self_att = [(att - mean_self_att) / std_dev_self_att for att in self_att]
    # Fix: both operands are plain Python lists, so `+` concatenated them and
    # the zip() below stopped after the joint z-scores, silently ignoring the
    # self attention. Add the two lists position by position instead, as the
    # Qwen version of this analysis already does.
    z_scores = [joint_z + self_z for joint_z, self_z in zip(z_scores_joint_att, z_scores_self_att)]
    for token, z_score in zip(tokens, z_scores):
        glm_result_analysis_2.setdefault(token, []).append(z_score)
# Reduce every token to (sum of its z-scores, number of occurrences).
glm_result_analysis_2 = {k: (sum(v), len(v)) for k, v in glm_result_analysis_2.items()}

# GLM, experiment 'OR2', positive samples.
# Every token occurrence is valued as joint_att + self_att; that value is
# standardised against the mean/std pooled over all users and occurrences, and
# the standardised values are totalled per token.
glm_positive = glm_result['OR2']['positive']

# Collect the summed attention of every occurrence, keyed by token.
all_tokens = {}
for user in glm_positive:
    joint_pairs = glm_positive[user]['join_att_after_seg']
    self_pairs = glm_positive[user]['self_att_after_seg']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in joint_pairs] == [pair[0] for pair in self_pairs]
    for joint_pair, self_pair in zip(joint_pairs, self_pairs):
        all_tokens.setdefault(joint_pair[0], []).append(joint_pair[1] + self_pair[1])

# Pooled statistics over every occurrence of every token.
pooled_atts = [att for occurrences in all_tokens.values() for att in occurrences]
all_token_mean = np.mean(pooled_atts)
all_token_std = np.std(pooled_atts)

# The occurrence lists already hold each token's values in visiting order, so
# the per-token total of z-scores can be derived from them directly.
glm_result_analysis_3 = {
    token: sum([(att - all_token_mean) / all_token_std for att in occurrences])
    for token, occurrences in all_tokens.items()
}

In [None]:
# Qwen, experiment 'OR2', positive samples.
# Per-token total of combined (joint + self) attention z-scores, with each
# user's mean/std taken over the kept attention values plus the values stored
# under 'removed_*_att_paris'.
qwen_positive = qwen_result['OR2']['positive']
qwen_result_analysis_2 = {}
for user in qwen_positive:
    record = qwen_positive[user]
    joint_pairs = record['join_att_after_seg']
    self_pairs = record['self_att_after_seg']
    dropped_joint_pairs = record['removed_joint_att_paris']
    dropped_self_pairs = record['removed_self_att_paris']
    words = [pair[0] for pair in joint_pairs]
    # Joint and self attention must describe the same token sequence.
    assert words == [pair[0] for pair in self_pairs]
    self_kept = [pair[1] for pair in self_pairs]
    joint_kept = [pair[1] for pair in joint_pairs]
    # Statistics pool: kept values followed by the removed ones.
    self_pool = self_kept + [pair[1] for pair in dropped_self_pairs]
    joint_pool = joint_kept + [pair[1] for pair in dropped_joint_pairs]
    self_mean = np.mean(self_pool)
    joint_mean = np.mean(joint_pool)
    self_std = np.std(self_pool)
    joint_std = np.std(joint_pool)
    for word, joint_value, self_value in zip(words, joint_kept, self_kept):
        # Element-wise sum of the two z-scores for this token occurrence.
        combined = (joint_value - joint_mean) / joint_std + (self_value - self_mean) / self_std
        qwen_result_analysis_2.setdefault(word, []).append(combined)
# Reduce every token to (sum of its z-scores, number of occurrences).
qwen_result_analysis_2 = {
    word: (sum(scores), len(scores))
    for word, scores in qwen_result_analysis_2.items()
}

In [None]:
# Qwen, experiment 'OR2', positive samples.
# Every token occurrence is valued as joint_att + self_att; that value is
# standardised against the mean/std pooled over all users and occurrences, and
# the standardised values are totalled per token.
qwen_positive = qwen_result['OR2']['positive']

# Collect the summed attention of every occurrence, keyed by token.
all_tokens = {}
for user in qwen_positive:
    joint_pairs = qwen_positive[user]['join_att_after_seg']
    self_pairs = qwen_positive[user]['self_att_after_seg']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in joint_pairs] == [pair[0] for pair in self_pairs]
    for joint_pair, self_pair in zip(joint_pairs, self_pairs):
        all_tokens.setdefault(joint_pair[0], []).append(joint_pair[1] + self_pair[1])

# Pooled statistics over every occurrence of every token.
pooled_atts = [att for occurrences in all_tokens.values() for att in occurrences]
all_token_mean = np.mean(pooled_atts)
all_token_std = np.std(pooled_atts)

# The occurrence lists already hold each token's values in visiting order, so
# the per-token total of z-scores can be derived from them directly.
qwen_result_analysis_3 = {
    token: sum([(att - all_token_mean) / all_token_std for att in occurrences])
    for token, occurrences in all_tokens.items()
}


In [None]:
from scipy.stats import zscore

# Llama, experiment 'OR2', positive samples.
# For each user the joint and the self attention are z-scored separately, the
# two z-score vectors are added element-wise, and every combined score is
# collected under its segmented token.
llama_positive = llama_result['OR2']['positive']
llama_result_analysis = {}
for user in llama_positive:
    joint_pairs = llama_positive[user]['join_att_after_seg']
    self_pairs = llama_positive[user]['self_att_after_seg']
    words = [pair[0] for pair in joint_pairs]
    # Joint and self attention must describe the same token sequence.
    assert words == [pair[0] for pair in self_pairs]
    self_values = [pair[1] for pair in self_pairs]
    joint_values = [pair[1] for pair in joint_pairs]
    # zscore() yields arrays, so `+` is an element-wise sum here.
    combined_scores = zscore(joint_values) + zscore(self_values)
    for word, score in zip(words, combined_scores):
        llama_result_analysis.setdefault(word, []).append(score)
# Reduce every token to (sum of its scores, number of occurrences).
llama_result_analysis = {
    word: (sum(scores), len(scores))
    for word, scores in llama_result_analysis.items()
}

In [None]:
# Llama, experiment 'OR2', positive samples.
# Per-token total of combined (joint + self) attention z-scores. Unlike the
# scipy-based variant, each user's mean/std are computed over the kept
# attention values plus the values stored under 'removed_*_att_paris'
# (presumably the pairs dropped by cleanup before segmentation - confirm
# against the code that builds llama_result).
llama_result_analysis_2 = {}
for user in llama_result['OR2']['positive']:
    user_record = llama_result['OR2']['positive'][user]
    join_att_after_seg = user_record['join_att_after_seg']
    self_att_after_seg = user_record['self_att_after_seg']
    removed_joint_att_paris = user_record['removed_joint_att_paris']
    removed_self_att_paris = user_record['removed_self_att_paris']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in join_att_after_seg] == [pair[0] for pair in self_att_after_seg]
    tokens = [pair[0] for pair in join_att_after_seg]
    self_att = [pair[1] for pair in self_att_after_seg]
    join_att = [pair[1] for pair in join_att_after_seg]
    removed_self_att = [pair[1] for pair in removed_self_att_paris]
    removed_joint_att = [pair[1] for pair in removed_joint_att_paris]
    # Statistics are taken over kept + removed values (list concatenation is
    # intended on these four lines).
    mean_self_att = np.mean(self_att + removed_self_att)
    mean_joint_att = np.mean(join_att + removed_joint_att)
    std_dev_self_att = np.std(self_att + removed_self_att)
    std_dev_joint_att = np.std(join_att + removed_joint_att)
    z_scores_joint_att = [(att - mean_joint_att) / std_dev_joint_att for att in join_att]
    z_scores_self_att = [(att - mean_self_att) / std_dev_self_att for att in self_att]
    # Fix: both operands are plain Python lists, so `+` concatenated them and
    # the zip() below stopped after the joint z-scores, silently ignoring the
    # self attention. Add the two lists position by position instead, as the
    # Qwen version of this analysis already does.
    z_scores = [joint_z + self_z for joint_z, self_z in zip(z_scores_joint_att, z_scores_self_att)]
    for token, z_score in zip(tokens, z_scores):
        llama_result_analysis_2.setdefault(token, []).append(z_score)
# Reduce every token to (sum of its z-scores, number of occurrences).
llama_result_analysis_2 = {k: (sum(v), len(v)) for k, v in llama_result_analysis_2.items()}

In [None]:
# Llama, experiment 'OR2', positive samples.
# Every token occurrence is valued as joint_att + self_att; that value is
# standardised against the mean/std pooled over all users and occurrences, and
# the standardised values are totalled per token.
llama_positive = llama_result['OR2']['positive']

# Collect the summed attention of every occurrence, keyed by token.
all_tokens = {}
for user in llama_positive:
    joint_pairs = llama_positive[user]['join_att_after_seg']
    self_pairs = llama_positive[user]['self_att_after_seg']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in joint_pairs] == [pair[0] for pair in self_pairs]
    for joint_pair, self_pair in zip(joint_pairs, self_pairs):
        all_tokens.setdefault(joint_pair[0], []).append(joint_pair[1] + self_pair[1])

# Pooled statistics over every occurrence of every token.
pooled_atts = [att for occurrences in all_tokens.values() for att in occurrences]
all_token_mean = np.mean(pooled_atts)
all_token_std = np.std(pooled_atts)

# The occurrence lists already hold each token's values in visiting order, so
# the per-token total of z-scores can be derived from them directly.
llama_result_analysis_3 = {
    token: sum([(att - all_token_mean) / all_token_std for att in occurrences])
    for token, occurrences in all_tokens.items()
}

In [None]:
from scipy.stats import zscore

# NOTE(review): this cell repeats the GLM analysis cell that appears earlier in
# the notebook; consider keeping only one copy.
#
# GLM, experiment 'OR2', positive samples.
# For each user the joint and the self attention are z-scored separately, the
# two z-score vectors are added element-wise, and every combined score is
# collected under its segmented token.
glm_positive = glm_result['OR2']['positive']
glm_result_analysis = {}
for user in glm_positive:
    joint_pairs = glm_positive[user]['join_att_after_seg']
    self_pairs = glm_positive[user]['self_att_after_seg']
    words = [pair[0] for pair in joint_pairs]
    # Joint and self attention must describe the same token sequence.
    assert words == [pair[0] for pair in self_pairs]
    self_values = [pair[1] for pair in self_pairs]
    joint_values = [pair[1] for pair in joint_pairs]
    # zscore() yields arrays, so `+` is an element-wise sum here.
    combined_scores = zscore(joint_values) + zscore(self_values)
    for word, score in zip(words, combined_scores):
        glm_result_analysis.setdefault(word, []).append(score)
# Reduce every token to (sum of its scores, number of occurrences).
glm_result_analysis = {
    word: (sum(scores), len(scores))
    for word, scores in glm_result_analysis.items()
}

# NOTE(review): an identical glm_result_analysis_2 computation appears in an
# earlier cell of this notebook; whichever cell runs last determines the value.
#
# GLM, experiment 'OR2', positive samples.
# Per-token total of combined (joint + self) attention z-scores. Unlike the
# scipy-based variant, each user's mean/std are computed over the kept
# attention values plus the values stored under 'removed_*_att_paris'
# (presumably the pairs dropped by cleanup before segmentation - confirm
# against the code that builds glm_result).
glm_result_analysis_2 = {}
for user in glm_result['OR2']['positive']:
    user_record = glm_result['OR2']['positive'][user]
    join_att_after_seg = user_record['join_att_after_seg']
    self_att_after_seg = user_record['self_att_after_seg']
    removed_joint_att_paris = user_record['removed_joint_att_paris']
    removed_self_att_paris = user_record['removed_self_att_paris']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in join_att_after_seg] == [pair[0] for pair in self_att_after_seg]
    tokens = [pair[0] for pair in join_att_after_seg]
    self_att = [pair[1] for pair in self_att_after_seg]
    join_att = [pair[1] for pair in join_att_after_seg]
    removed_self_att = [pair[1] for pair in removed_self_att_paris]
    removed_joint_att = [pair[1] for pair in removed_joint_att_paris]
    # Statistics are taken over kept + removed values (list concatenation is
    # intended on these four lines).
    mean_self_att = np.mean(self_att + removed_self_att)
    mean_joint_att = np.mean(join_att + removed_joint_att)
    std_dev_self_att = np.std(self_att + removed_self_att)
    std_dev_joint_att = np.std(join_att + removed_joint_att)
    z_scores_joint_att = [(att - mean_joint_att) / std_dev_joint_att for att in join_att]
    z_scores_self_att = [(att - mean_self_att) / std_dev_self_att for att in self_att]
    # Fix: both operands are plain Python lists, so `+` concatenated them and
    # the zip() below stopped after the joint z-scores, silently ignoring the
    # self attention. Add the two lists position by position instead, as the
    # Qwen version of this analysis already does.
    z_scores = [joint_z + self_z for joint_z, self_z in zip(z_scores_joint_att, z_scores_self_att)]
    for token, z_score in zip(tokens, z_scores):
        glm_result_analysis_2.setdefault(token, []).append(z_score)
# Reduce every token to (sum of its z-scores, number of occurrences).
glm_result_analysis_2 = {k: (sum(v), len(v)) for k, v in glm_result_analysis_2.items()}

# GLM, experiment 'OR2', positive samples.
# Every token occurrence is valued as joint_att + self_att; that value is
# standardised against the mean/std pooled over all users and occurrences, and
# the standardised values are totalled per token.
glm_positive = glm_result['OR2']['positive']

# Collect the summed attention of every occurrence, keyed by token.
all_tokens = {}
for user in glm_positive:
    joint_pairs = glm_positive[user]['join_att_after_seg']
    self_pairs = glm_positive[user]['self_att_after_seg']
    # Joint and self attention must describe the same token sequence.
    assert [pair[0] for pair in joint_pairs] == [pair[0] for pair in self_pairs]
    for joint_pair, self_pair in zip(joint_pairs, self_pairs):
        all_tokens.setdefault(joint_pair[0], []).append(joint_pair[1] + self_pair[1])

# Pooled statistics over every occurrence of every token.
pooled_atts = [att for occurrences in all_tokens.values() for att in occurrences]
all_token_mean = np.mean(pooled_atts)
all_token_std = np.std(pooled_atts)

# The occurrence lists already hold each token's values in visiting order, so
# the per-token total of z-scores can be derived from them directly.
glm_result_analysis_3 = {
    token: sum([(att - all_token_mean) / all_token_std for att in occurrences])
    for token, occurrences in all_tokens.items()
}