In [27]:
import json
import pandas as pd
import torch
import matplotlib.pyplot as plt
import ast
import os
import random
from tqdm.notebook import tqdm
import sys
from pandas import DataFrame
import re

from logical_utils.ast_tools import *
from logical_utils.attention_models import *
from logical_utils.eval import *
from logical_utils.probability_prediction import *
from logical_utils.data_process_tools import *
from logical_utils.io_utils import *


random.seed(1)

In [5]:
def remove_comments(code):
    # Remove multi-line comments
    code_no_comments = re.sub(r'""".*?"""', '', code, flags=re.DOTALL)
    
    # Remove single-line comments
    code_no_comments = re.sub(r'#.*', '', code_no_comments)
    
    return code_no_comments

In [28]:
# 初始化一个空的 DataFrame
python_df = pd.DataFrame()

# JSONL 文件所在的目录
directory_path = 'data/py'

# 遍历目录中的所有文件
for filename in os.listdir(directory_path):
    if filename.endswith(".jsonl"):
        print("--------------------------")
        # print(filename)

        file_path = os.path.join(directory_path, filename)     
        # 逐行读取每个 JSONL 文件
        with open(file_path, 'r') as file:
            data = []
            for line in file:
                example = json.loads(line)
                try:
                    san_check, label = logical_consistency_check(example["code"])
                    if if_logic_check(example["code"]) and san_check:
                        exp = {
                            "label": label,
                            "code": example["code"]
                        }
                        data.append(exp)
                except SyntaxError:
                    continue

            # 将收集的数据创建为 DataFrame
            df = pd.DataFrame(data)

            # 删除重复的列
            df = df.loc[:, ~df.T.duplicated()]
            print(f'File {filename}: {len(df)} entries')

            # 将当前 DataFrame 拼接到总的 DataFrame 中
            python_df = pd.concat([python_df, df], ignore_index=True)

# 最终的 DataFrame
print(python_df)

--------------------------
File python_train_13.jsonl: 26 entries
--------------------------
File python_train_11.jsonl: 35 entries
--------------------------
File python_train_8.jsonl: 32 entries
--------------------------
File python_test_0.jsonl: 20 entries
--------------------------
File python_train_10.jsonl: 23 entries
--------------------------
File python_train_12.jsonl: 35 entries
--------------------------
File python_train_9.jsonl: 22 entries
--------------------------
File python_valid_0.jsonl: 22 entries
--------------------------
File python_train_7.jsonl: 25 entries
--------------------------
File python_train_5.jsonl: 24 entries
--------------------------
File python_train_1.jsonl: 28 entries
--------------------------
File python_train_3.jsonl: 21 entries
--------------------------
File python_train_4.jsonl: 29 entries
--------------------------
File python_train_6.jsonl: 37 entries
--------------------------
File python_train_2.jsonl: 41 entries
----------------------

In [29]:
len(python_df)

441

In [30]:
python_df

Unnamed: 0,label,code
0,remove,"def decorate_method(cls, func):\n """"""\n..."
1,append,"def main():\n """"""Run playbook""""""\n for f..."
2,append,"def set_substitution(self, word, substitution)..."
3,append,"def call_fn(self, what, *args, **kwargs):\n ..."
4,remove,"def get_mentions(self, message):\n """""" ..."
...,...,...
436,remove,"def remove(self, tab_index):\r\n """"""Rem..."
437,append,"def _add_pret_words(self, pret_embeddings):\n ..."
438,append,"def check_cache(self, template):\n '''\..."
439,append,"def add(self, key):\n """"""\n Add ..."


In [31]:
python_df.to_json("syn_data/python_examples.json")

In [32]:
python_df = pd.read_json("syn_data/python_examples.json")

In [33]:
class IfNodeExtractor(ast.NodeVisitor):
    def __init__(self):
        self.if_nodes = []

    def visit_If(self, node):
        # This ensures we visit all If nodes and their children
        self.generic_visit(node)
        self.if_nodes.append(node)  # Append the node to the list

    def get_if_nodes(self, node):
        self.visit(node)
        return self.if_nodes

In [10]:
code = "if x in themes:\n        themes.remove(x)\n"
prompt = "if x in themes:\n        themes."

In [11]:
code = "if x==True:\n        x=False\n"

In [12]:
class IfStatementChecker(ast.NodeVisitor):
    def __init__(self):
        self.found_match = False

    def visit_If(self, node):
        # 检查 if 语句的条件是否为 "x == True"
        if (isinstance(node.test, ast.Compare) and
                isinstance(node.test.left, ast.Name) and
                len(node.test.ops) == 1 and
                isinstance(node.test.ops[0], ast.Eq) and
                len(node.test.comparators) == 1 and
                isinstance(node.test.comparators[0], ast.Constant) and
                node.test.comparators[0].value is True):
            
            # 获取条件中的变量名
            variable_name = node.test.left.id
            
            # 检查 if 语句的主体是否为 "x = False"
            if isinstance(node.body[0], ast.Assign):
                assign_node = node.body[0]
                if (isinstance(assign_node.targets[0], ast.Name) and
                        assign_node.targets[0].id == variable_name and
                        isinstance(assign_node.value, ast.Constant) and
                        assign_node.value.value is False):
                    self.found_match = True

        # 继续遍历 AST
        self.generic_visit(node)

# 示例代码
code = """
if g_toContinue == True:    # at the end of last message fragment
    addJavaMessages(tempMessage, messageType, java_messages, java_message_types)
    tempMessage = ""
    messageType = ""

# start of new message fragment
g_toContinue = False
"""

# 解析代码为 AST
parsed_code = ast.parse(code)

# 创建检查器实例并访问 AST
checker = IfStatementChecker()
checker.visit(parsed_code)

# 输出检查结果
if checker.found_match:
    print("Found an 'if x == True: x = False' pattern.")
else:
    print("No match found.")

No match found.


In [13]:
import ast

class IfStatementChecker(ast.NodeVisitor):
    def __init__(self):
        self.found_match = False

    def visit_If(self, node):
        # 检查 if 语句的条件是否为 "x == True"
        if (isinstance(node.test, ast.Compare) and
                isinstance(node.test.left, ast.Name) and
                node.test.left.id == 'x' and
                len(node.test.ops) == 1 and
                isinstance(node.test.ops[0], ast.Eq) and
                len(node.test.comparators) == 1 and
                isinstance(node.test.comparators[0], ast.Constant) and
                node.test.comparators[0].value is True):
            
            # 检查 if 语句的主体是否为 "x = False"
            if len(node.body) == 1 and isinstance(node.body[0], ast.Assign):
                assign_node = node.body[0]
                if (isinstance(assign_node.targets[0], ast.Name) and
                        assign_node.targets[0].id == 'x' and
                        isinstance(assign_node.value, ast.Constant) and
                        assign_node.value.value is False):
                    self.found_match = True

        # 继续遍历 AST
        self.generic_visit(node)

# 示例代码
code = """
x = True
if x == True:
    x = False
"""

# 解析代码为 AST
parsed_code = ast.parse(code)

# 创建检查器实例并访问 AST
checker = IfStatementChecker()
checker.visit(parsed_code)

# 输出检查结果
if checker.found_match:
    print("Found an 'if x == True: x = False' pattern.")
else:
    print("No match found.")

Found an 'if x == True: x = False' pattern.


In [15]:
class bool_LogicalChecker(ast.NodeVisitor):
    def __init__(self):
        self.logical_consistency = False
        self.label = ""

    def visit_If(self, node):
        # True - False
        # 检查 if 语句条件是否为 "x == True"
        if isinstance(node.test, ast.Compare) and 
            isinstance(node.test.ops[0], ast.Eq) and
            isinstance(node.test.comparators[0], ast.Constant) and
            node.test.comparators[0].value == True):
            left_value = node.test.left.id
                
                # 检查 if 语句体是否为 "x == False"
                for expr in node.body:
                    if (isinstance(expr.targets.id, ast) and
                        isinstance(assign_node.value, ast.Constant) and
                        assign_node.value.value is False):
                        
                    if (isinstance(expr, ast.Expr) and
                        isinstance(expr.value, ast.Call) and
                        isinstance(expr.value.func, ast.Attribute) and
                        expr.value.func.attr == 'remove'):
                        
                    if (isinstance(expr, ast.Compare) and
                        isinstance(expr.left, ast.Name) and
                        isinstance(expr.ops[0], ast.Eq) and
                        isinstance(expr.comparators[0], ast.Constant) and
                        expr.comparators[0].value == False):
                        
                        # 记录匹配的代码片段
                        self.matches.append(node)
                        
        if isinstance(node.test, ast.Compare) and any(isinstance(op, ast.In) for op in node.test.ops):

            left_value = node.test.left
            if isinstance(left_value, ast.Constant):
                left_var = left_value.value
            elif isinstance(left_value, ast.Name):
                left_var = left_value.id
            else:
                left_var = None
    
            for expr in node.body:
                if (isinstance(expr, ast.Expr) and
                        isinstance(expr.value, ast.Call) and
                        isinstance(expr.value.func, ast.Attribute) and
                        expr.value.func.attr == 'remove'):
                    arg_value = expr.value.args[0]
                    if isinstance(arg_value, ast.Constant):
                        arg_var = arg_value.value
                    elif isinstance(arg_value, ast.Name):
                        arg_var = arg_value.id
                    else:
                        arg_var = None
        
                    if left_var == arg_var:
                        self.logical_consistency = True
                        self.label = "remove"

        # not in - append
        elif isinstance(node.test, ast.Compare) and any(isinstance(op, ast.NotIn) for op in node.test.ops):
            left_value = node.test.left
            if isinstance(left_value, ast.Constant):
                left_var = left_value.value
            elif isinstance(left_value, ast.Name):
                left_var = left_value.id
            else:
                left_var = None
                
            for expr in node.body:
                if (isinstance(expr, ast.Expr) and
                        isinstance(expr.value, ast.Call) and
                        isinstance(expr.value.func, ast.Attribute) and
                        expr.value.func.attr == 'append'):
                    
                    arg_value = expr.value.args[0]
                    if isinstance(arg_value, ast.Constant):
                        arg_var = arg_value.value
                    elif isinstance(arg_value, ast.Name):
                        arg_var = arg_value.id
                    else:
                        arg_var = None
                        
                    if left_var == arg_var:
                        self.logical_consistency = True
                        self.label = "append"

        else:
            self.logical_consistency = False

        return self.generic_visit(node)

    def san_check(self, node):
        # the name of this func is just for fun
        self.visit(node)
        return self.logical_consistency, self.label

SyntaxError: unmatched ')' (4182668178.py, line 12)

In [16]:
 def visit_If(self, node):
        
        
        # 继续遍历子节点
        self.generic_visit(node)


In [None]:
code = "if x in themes:\n        themes.remove(x)\n"

In [None]:
code = "if x==True:\n        x=False\n"

In [None]:
code = "def get_themes(templates_path):\n    \"\"\"Returns available themes list.\"\"\"\n    themes = os.listdir(templates_path)\n    if '__common__' in themes:\n        themes.remove('__common__')\n    return themes"

In [None]:
code = "if 1 in themes:\n        themes.remove(1)\n"

In [None]:
parsed = ast.parse(code)
parsed_str = ast.dump(parsed, indent=4)
print(parsed_str)

In [None]:
parsed = ast.parse(code)
parsed_str = ast.dump(parsed, indent=4)
print(parsed_str)

In [17]:
# initial datasets
df_LI_ori_full =pd.DataFrame(columns=('label','msk','msk_intervention'))
df_LI_ori_trct =pd.DataFrame(columns=('label','msk','msk_intervention'))

df_LI_ori_full_no_comments =pd.DataFrame(columns=('label','msk','msk_intervention'))
df_LI_ori_trct_no_comments =pd.DataFrame(columns=('label','msk','msk_intervention'))

df_LI_pure_full =pd.DataFrame(columns=('label','msk','msk_intervention'))
df_LI_pure_trct =pd.DataFrame(columns=('label','msk','msk_intervention'))

In [6]:
from logical_utils.ast_tools import *

In [7]:
label_counter = [0,0]
candidates = ["remove", "append"]

In [36]:
for index, row in python_df.iterrows():
    # label
    label = row['label']
    df_LI_ori_full.at[index, 'label'] = label
    df_LI_ori_trct.at[index, 'label'] = label
    df_LI_ori_full_no_comments.at[index, 'label'] = label
    df_LI_ori_trct_no_comments.at[index, 'label'] = label
    df_LI_pure_full.at[index, 'label'] = label
    df_LI_pure_trct.at[index, 'label'] = label

    if label == candidates[0]:
        label_counter[0] += 1
    elif label == candidates[1]:
        label_counter[1] += 1

    # code
    code = row["code"]
    code_tree = ast.parse(code, mode='exec')
    '''
    print("==========")
    print(ast.unparse(code_tree))    
    '''
    
    # Mask keyword full code script
    process = KeywordMasker()
    masked_tree = process.processor(code_tree, "masker")
    code_tree = ast.parse(code, mode='exec')
    inversed_tree = process.processor(code_tree, "inverser")
    '''
    print("-------------")
    print(ast.unparse(masked_tree))
    print("\n")
    print(ast.unparse(inversed_tree))
    print("\n")
    '''
    df_LI_ori_full.at[index, 'msk'] = ast.unparse(masked_tree)
    df_LI_ori_full.at[index, 'msk_intervention'] = ast.unparse(inversed_tree)
    
    df_LI_ori_trct.at[index, 'msk'] = ast.unparse(masked_tree).split("$$$", maxsplit=1)[0]
    df_LI_ori_trct.at[index, 'msk_intervention'] = ast.unparse(inversed_tree).split("$$$", maxsplit=1)[0]

    # Mask keyword full code script with no comments
    code_string = remove_comments(ast.unparse(masked_tree))
    inversed_code_string = remove_comments(ast.unparse(inversed_tree))
    
    df_LI_ori_full_no_comments.at[index, 'msk'] =code_string
    df_LI_ori_full_no_comments.at[index, 'msk_intervention'] = inversed_code_string
    
    df_LI_ori_trct_no_comments.at[index, 'msk'] = code_string.split("$$$", maxsplit=1)[0]
    df_LI_ori_trct_no_comments.at[index, 'msk_intervention'] = inversed_code_string.split("$$$", maxsplit=1)[0]
    '''
    print("++++++++++++++++++++++++")
    print(code_string)
    print("\n")
    print(inversed_code_string)
    '''
    # Pure logical code snippet
    if_extractor = IfNodeExtractor()
    code_tree_pure = if_extractor.get_if_nodes(masked_tree)
    if_extractor = IfNodeExtractor()
    inversed_tree_pure = if_extractor.get_if_nodes(inversed_tree)

    df_LI_pure_full.at[index, 'msk'] = ast.unparse(code_tree_pure)
    df_LI_pure_full.at[index, 'msk_intervention'] = ast.unparse(inversed_tree_pure)
    
    df_LI_pure_trct.at[index, 'msk'] = ast.unparse(code_tree_pure).split("$$$", maxsplit=1)[0]
    df_LI_pure_trct.at[index, 'msk_intervention'] = ast.unparse(inversed_tree_pure).split("$$$", maxsplit=1)[0]

    print("############")
    print(ast.unparse(code_tree_pure))
    print("\n")
    print(ast.unparse(inversed_tree_pure))

############
if api_call in call_queue:
    call_queue.$$$(api_call)


if api_call not in call_queue:
    call_queue.$$$(api_call)
############
if flag not in sys.argv:
    sys.argv.$$$(flag)


if flag in sys.argv:
    sys.argv.$$$(flag)
############
if sub_group not in self._substitutions:
    self._log.info('Appending new word substitution: "{word}" => "{sub}"'.format(word=word, sub=substitution))
    self._substitutions.$$$(sub_group)


if sub_group in self._substitutions:
    self._log.info('Appending new word substitution: "{word}" => "{sub}"'.format(word=word, sub=substitution))
    self._substitutions.$$$(sub_group)
############
if port not in self['_initialized_ports']:
    self._call_fn(port, 'init_adapter')
    self['_initialized_ports'].$$$(port)


if port in self['_initialized_ports']:
    self._call_fn(port, 'init_adapter')
    self['_initialized_ports'].$$$(port)
############
if m in mentions_set:
    mentions_set.$$$(m)
    deduped_mentions.append(m)


if m not in mentio

In [37]:
# Save data
df_LI_ori_full.to_json(f"syn_data/py_full/LI_ori_full.json", orient='records', lines=True)
df_LI_ori_trct.to_json(f"syn_data/py_full/LI_ori_trct.json", orient='records', lines=True)

df_LI_ori_full_no_comments.to_json(f"syn_data/py_full/LI_ori_full_no_comments.json", orient='records', lines=True)
df_LI_ori_trct_no_comments.to_json(f"syn_data/py_full/LI_ori_trct_no_comments.json", orient='records', lines=True)

df_LI_pure_full.to_json(f"syn_data/py_full/LI_pure_full.json", orient='records', lines=True)
df_LI_pure_trct.to_json(f"syn_data/py_full/LI_pure_trct.json", orient='records', lines=True)

In [19]:
# load data
df_LI_ori_full = pd.read_json("syn_data/py_full/LI_ori_full.json", lines=True)
df_LI_ori_trct = pd.read_json("syn_data/py_full/LI_ori_trct.json", lines=True)
df_LI_ori_full_no_comments = pd.read_json("syn_data/py_full/LI_ori_full_no_comments.json", lines=True)
df_LI_ori_trct_no_comments = pd.read_json("syn_data/py_full/LI_ori_trct_no_comments.json", lines=True)
df_LI_pure_full = pd.read_json("syn_data/py_full/LI_pure_full.json", lines=True)
df_LI_pure_trct = pd.read_json("syn_data/py_full/LI_pure_trct.json", lines=True)

datasets = {
    "df_LI_ori_full": df_LI_ori_full,
    "df_LI_ori_full_no_comments": df_LI_ori_full_no_comments,
    "df_LI_pure_full": df_LI_pure_full
}

datasets_trct = {
    "df_LI_ori_trct": df_LI_ori_trct,
    "df_LI_ori_trct_no_comments": df_LI_ori_trct_no_comments,
    "df_LI_pure_trct": df_LI_pure_trct
}

In [8]:
def find_max(d):
    v_max = 0
    k_max = ""
    for k in d:
        if d[k] > v_max:
            v_max = d[k]
            k_max = k
    return k_max, v_max

In [219]:
class CodeLogical:
    def __init__(self, data, tokenizer, model, model_name, candidates, flag):
        self.original_string = data["msk"]
        self.intervention_string = data["msk_intervention"]

        self.label = data["label"]
        
        # print(f'# print(self.label) : {self.label}')
        self.tokenizer = tokenizer
        self.model = model
        self.model_name = model_name
        self.candidates = candidates

        self.flag = flag

        if self.flag == "predict_method":
            self.original_input = self.tokenizer(self.original_string, return_tensors="pt")
            self.intervention_input = self.tokenizer(self.intervention_string, return_tensors="pt")
    
            self.original_prob, self.original_logits, self.original_fwp, self.original_fwi = self.get_prediction_probabilities(self.original_input)
            self.intervention_prob, self.intervention_logits, self.intervention_fwp, self.intervention_fwi = self.get_prediction_probabilities(self.intervention_input)
            
        elif self.flag == "placeholder_method":
            prompt = ""
            # prompt = "Predict $$$ in:"
            # double check if $$$ can work. or is mentioned somewhere.
            # Potential alternative placeholders (from Eugene):
            #   FIXME
            #   TODO
            #   XXX
            if self.model_name == 1:
            # codegemma
                self.original_string = self.original_string.replace('$$$', '<|fim_suffix|>')
                self.intervention_string = self.intervention_string.replace('$$$', '<|fim_suffix|>')
                self.original_string = "<|fim_prefix|>" + self.original_string + "<|fim_middle|>"
                self.intervention_string = "<|fim_prefix|>" + self.intervention_string + "<|fim_middle|>"
                #print(self.original_string)
            
            self.original_input = self.tokenizer(self.original_string, return_tensors="pt")
            self.intervention_input = self.tokenizer(self.intervention_string, return_tensors="pt")
    
            self.original_prob, self.original_logits, self.original_fwp, self.original_fwi = self.get_prediction_probabilities(self.original_input)
            self.intervention_prob, self.intervention_logits, self.intervention_fwp, self.intervention_fwi = self.get_prediction_probabilities(self.intervention_input)

    def calculate_effect(self):
        for c in self.original_prob:
            # print(c)
            # print(self.label)
            if c == self.label:
                self.original_odds = calculate_log_odds(self.original_prob[c])
                # print(f'# print(self.original_odds) : {self.original_odds}')
                self.intervention_odds = calculate_log_odds(self.intervention_prob[c])
                # print(f'# print(self.intervention_odds) : {self.intervention_odds}')
                self.odds_r = calculate_log_odds_r(self.original_prob[c], self.intervention_prob[c])

                #self.total_effect = total_effect(self.original_odds, self.intervention_odds)
                # print(f'# print(self.total_effect) : {self.total_effect}')

    def get_prediction_probabilities(self, inputs):

        with torch.no_grad():
            self.outputs = self.model(**inputs, labels=inputs["input_ids"])

        # obtain the logits of the last token
        logits = self.outputs.logits[:, -1, :]

        # using softmax to calculate prob distribution
        probs = torch.softmax(logits, dim=-1)

        # probs of each candidate words
        candidate_tokens = self.tokenizer.convert_tokens_to_ids(self.candidates)
        candidate_probs = probs[:, candidate_tokens].tolist()[0]
        candidate_logits = logits[:, candidate_tokens].tolist()[0]
        # save the other possible tokens which have a greater probability than the max of candidate in intervention.

        word_probs = dict(zip(self.candidates, candidate_probs))
        word_logits = dict(zip(self.candidates, candidate_logits))

        k, v = find_max(word_probs)
        
        threshold = v
        mask = probs > threshold
        filtered_probs = probs[mask]
        filtered_indices = torch.nonzero(mask, as_tuple=True)[1]
        filtered_words = tokenizer.convert_ids_to_tokens(filtered_indices)

        if self.model_name == 0:
            filtered_words = [word.lstrip('Ġ') for word in filtered_words]

        # print("Filtered Probs:", filtered_probs)  # 调试打印
        #print("Filtered Indices:", filtered_indices)  # 调试打印
        #print("Filtered Words:", filtered_words)  # 调试打印

        filtered_words_probs = dict(zip(filtered_words, filtered_probs.tolist()))
        filtered_words_indices = dict(zip(filtered_words, filtered_indices.tolist()))
        
        
        return word_probs, word_logits, filtered_words_probs, filtered_words_indices


    def code_to_dict(self):
        code_dict = {
            "original_string": self.original_string,
            "intervention_string": self.intervention_string,
            "model_name": self.model_name,
            "label": self.label,
            "original_ids": self.original_input["input_ids"].tolist(),
            "intervention_ids": self.intervention_input["input_ids"].tolist(),

            # "original_input" : self.original_input,
            # "intervention_input" : self.intervention_input,

            "original_prob": self.original_prob,
            "intervention_prob": self.intervention_prob,
            "original_logits": self.original_logits,
            "intervention_logits": self.intervention_logits,
            "original_odds": self.original_odds,
            "intervention_odds": self.intervention_odds,
            "odds_r": self.odds_r,
            "original_fwp": self.original_fwp,
            "original_fwi": self.original_fwi,
            "intervention_fwp": self.intervention_fwp,
            "intervention_fwi": self.intervention_fwi
            #"total_effect": self.total_effect
        }

        return code_dict


In [220]:
from logical_utils.attention_models import *

In [221]:
for name, dataset in datasets.items():
    code_logical_dict_list = []
    with open(f'result/{name}_result.json', 'w') as file:
        for model_name in [1]:
            model, tokenizer, model_n = get_model_tokenizer(model_name)
            print(model_n)
        
            # initial mean values
            sum_remove_total_effect = 0
            sum_append_total_effect = 0
            
            for index, row in tqdm(dataset.iterrows(), desc='Progress', total=len(dataset)):
                #print(row)
                if row["msk"].count('$$$') != 1:
                    continue
                code_logical = CodeLogical(row, tokenizer, model, model_name, candidates, "placeholder_method")
                code_logical.calculate_effect()
                
                code_logical_dict = code_logical.code_to_dict()
                #print(code_logical_dict)
                
                code_logical_dict_list.append(code_logical_dict)
        json.dump(code_logical_dict_list, file, indent=4)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codegemma


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codegemma


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codegemma


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

In [41]:
for name, dataset in datasets_trct.items():
    code_logical_dict_list = []
    with open(f'result/{name}_result.json', 'w') as file:
        for model_name in [0, 1, 2]:
            model, tokenizer, model_n = get_model_tokenizer(model_name)
            print(model_n)
        
            # initial mean values
            sum_remove_total_effect = 0
            sum_append_total_effect = 0
            
            for index, row in tqdm(dataset.iterrows(), desc='Progress', total=len(dataset)):
                #print(row)
                code_logical = CodeLogical(row, tokenizer, model, model_name, candidates, "predict_method")
                code_logical.calculate_effect()
                
                code_logical_dict = code_logical.code_to_dict()
                code_logical_dict_list.append(code_logical_dict)
        json.dump(code_logical_dict_list, file, indent=4)
# Filtered Probs should have a number for the specific example

gpt2


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codegemma


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

codegen


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

gpt2


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codegemma


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

codegen


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

gpt2


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

codegemma


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

codegen


Progress:   0%|          | 0/441 [00:00<?, ?it/s]

# Top-k

# Result Analysis

## Mean Value

In [29]:
import math
import numpy as np
import csv

In [30]:
def json2dict(PATH):
    with open(PATH, "r") as f:
        result = json.load(f)
    return result

In [28]:
df_LI_ori_full_result = json2dict("result/df_LI_ori_full_result.json")
df_LI_ori_trct_result = json2dict("result/df_LI_ori_trct_result.json")

df_LI_ori_full_no_comments_result = json2dict("result/df_LI_ori_full_no_comments_result.json")
df_LI_ori_trct_no_comments_result = json2dict("result/df_LI_ori_trct_no_comments_result.json")

df_LI_pure_full_result = json2dict("result/df_LI_pure_full_result.json")
df_LI_pure_trct_result = json2dict("result/df_LI_pure_trct_result.json")

trct_results_alys = {
    "df_LI_ori_trct_result": df_LI_ori_trct_result,
    "df_LI_ori_trct_no_comments_result": df_LI_ori_trct_no_comments_result,
    "df_LI_pure_trct_result": df_LI_pure_trct_result,
}

full_results_alys = {
    "df_LI_ori_full_result": df_LI_ori_full_result,
    "df_LI_ori_full_no_comments_result": df_LI_ori_full_no_comments_result,
    "df_LI_pure_full_result": df_LI_pure_full_result,
}

In [24]:
for dataset in [datasets, datasets_trct]:
    for k in dataset:
        r = 0
        a = 0
        for index, row in datasets[k].iterrows():
            if row["label"] == "remove":
                r+=1
            elif row["label"] == "append":
                a+=1
        res = str(k) + ",remove=" + str(r) + ",append=" + str(a) + "\n"
        print(res)

df_LI_ori_full,remove=140,append=301

df_LI_ori_full_no_comments,remove=140,append=301

df_LI_pure_full,remove=140,append=301



KeyError: 'df_LI_ori_trct'

In [72]:
def extract_original_prob(data, model, label):
    # print(data)
    original_remove_probs = []
    original_append_probs = []

    intervention_remove_probs = []
    intervention_append_probs = []

    original_odds = []
    intervention_odds = []

    odds_r = []
    oddr_l = []
    n = 0
    original_fwp = 0
    intervention_fwp = 0
    
    for item in data:
        # print(item)
        #print(model)
        #print(item['model_name'])
        #print(item['label'])
        #print(label)
        if item['model_name'] == model and item['label'] == label:
            n+=1
            #print(f"iiiii - {i}")
            #print(f"item['model_name'] - {item['model_name']}")
            #print(f"item['label'] - {item['label']}")
            original_remove_probs.append(item['original_prob']['remove'])
            original_append_probs.append(item['original_prob']['append'])
            
            intervention_remove_probs.append(item['intervention_prob']['remove'])
            intervention_append_probs.append(item['intervention_prob']['append'])

            original_odds.append(item['original_odds'])
            intervention_odds.append(item['intervention_odds'])
            #print("@@@@@@@@@@")
            #print(item['odds_r'])
            odds_r.append(item['odds_r'])

            if item['label'] == "remove":
                ori_odds = calculate_log_odds(item['original_prob']['append'])
                # print(f"ori_odds:{ori_odds}")
                int_odds = calculate_log_odds(item['intervention_prob']['append'])
                # print(f"int_odds:{int_odds}")
                oddr = calculate_log_odds_r(ori_odds, int_odds)
                # print(f"oddr:{oddr}")
            elif label == "append":
                ori_odds = calculate_log_odds(item['original_prob']['remove'])
                int_odds = calculate_log_odds(item['intervention_prob']['remove'])
                oddr = calculate_log_odds_r(ori_odds, int_odds)
            else:
                print("err")
                oddr = 0

            oddr_l.append(oddr)

            
            original_fwp += len(item["original_fwp"])
            intervention_fwp += len(item["intervention_fwp"])
            if item['model_name'] != 0:
                if len(item["original_fwp"]) > 0:
                    print(f'original_fwp: {item["original_fwp"]}')
                if len(item["intervention_fwp"]) > 0:
                    print(f'intervention_fwp: {item["intervention_fwp"]}')
                    
    original_fwp = original_fwp/n
    intervention_fwp = intervention_fwp/n
    
    original_prob = [np.array(original_remove_probs), np.array(original_append_probs)]
    intervention_prob = [np.array(intervention_remove_probs), np.array(intervention_append_probs)]
    
    original_odds = np.array(original_odds)
    intervention_odds = np.array(intervention_odds)
    
    odds_r = np.array(odds_r)
    oddr_l = np.array(oddr_l)
    #print(odds_r)

    return original_prob, intervention_prob, original_odds, intervention_odds, odds_r, oddr_l, original_fwp, intervention_fwp

In [63]:
import numpy as np

# 示例：在两个条件下的二维概率分布矩阵
P_C1 = np.array([[0.2, 0.1], [0.15, 0.05]])  # 在条件 C1 下的二维概率
P_C2 = np.array([[0.1, 0.2], [0.05, 0.15]])  # 在条件 C2 下的二维概率

# 计算每个 (X, Y) 对的 Total Effect
total_effect = P_C1 / P_C2

# 输出结果
print("Total Effect for each (X, Y):\n", total_effect)


Total Effect for each (X, Y):
 [[2.         0.5       ]
 [3.         0.33333333]]


In [44]:
if c == self.label:
    self.original_odds = calculate_log_odds(self.original_prob[c])
    # print(f'# print(self.original_odds) : {self.original_odds}')
    self.intervention_odds = calculate_log_odds(self.intervention_prob[c])
    # print(f'# print(self.intervention_odds) : {self.intervention_odds}')
    self.odds_r = calculate_log_odds_r(self.original_prob[c], self.intervention_prob[c])

NameError: name 'c' is not defined

In [10]:
def calculate_total_effect(condition_1_correct, condition_1_incorrect, condition_2_correct, condition_2_incorrect):
    """
    计算两个条件下的 Total Effect。
    
    参数:
    - condition_1_correct: 条件 1 下正确操作的概率
    - condition_1_incorrect: 条件 1 下错误操作的概率
    - condition_2_correct: 条件 2 下正确操作的概率
    - condition_2_incorrect: 条件 2 下错误操作的概率
    
    返回:
    - TE_1: 条件 1 下的 Total Effect
    - TE_2: 条件 2 下的 Total Effect
    """
    TE_1 = condition_1_correct / condition_1_incorrect
    TE_2 = condition_2_correct / condition_2_incorrect
    return TE_1, TE_2

# 示例数据
condition_1_correct = 0.7  # Condition 1 (if x in L) 正确预测概率
condition_1_incorrect = 0.3  # Condition 1 错误预测概率
condition_2_correct = 0.8  # Condition 2 (if x not in L) 正确预测概率
condition_2_incorrect = 0.2  # Condition 2 错误预测概率

# 计算 Total Effect
TE_1, TE_2 = calculate_total_effect(condition_1_correct, condition_1_incorrect, condition_2_correct, condition_2_incorrect)

# 输出结果
print(f"Total Effect under Condition 1: {TE_1:.2f}")
print(f"Total Effect under Condition 2: {TE_2:.2f}")


Total Effect under Condition 1: 2.33
Total Effect under Condition 2: 4.00


In [36]:
def cohen_d(x1, x2):
    mean1, mean2 = np.mean(x1), np.mean(x2)
    s1, s2 = np.std(x1, ddof=1), np.std(x2, ddof=1)
    n1, n2 = len(x1), len(x2)
    
    pooled_std = np.sqrt(((n1 - 1) * s1 ** 2 + (n2 - 1) * s2 ** 2) / (n1 + n2 - 2))
    d = (mean1 - mean2) / pooled_std
    
    return d

In [37]:
def calculate_log_odds(probability):
    return np.log(probability / (1 - probability))

def calculate_log_odds_r(odds_0, odds_1):
    return odds_1 - odds_0

In [74]:
candidates = ["remove", "append"]

for res in trct_results_alys:
    print(res)
    output_file = f'result/alys/{res}.csv'
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        writer.writerow(['Model', 'Label', 'Candidate', 'Mean Oddr_label', 'Mean Oddr', 
                         'Mean Difference', 'Std Dev Original', 'Std Dev Intervention', 
                         'Std Dev Difference', 'Logical Cohen D', 'original_fwp', 'intervention_fwp'])
        
        for model in [0, 1, 2, 3, 4]:
            print(f'++++++++ model={model} ++++++++')
            for label in candidates:
                print(f'======== label={label} ========')
                original_prob, intervention_prob, original_odds, intervention_odds, odds_r, oddr_l, original_fwp, intervention_fwp = extract_original_prob(trct_results_alys[res], model, label)
                
                #print(f'model: {model}, dataset: {res}, original_fwp: {original_fwp:.4f}, intervention_fwp: {intervention_fwp:.4f}')

                for i in range(len(candidates)):
                    print(f'------- c={candidates[i]} ---------')
                    # print(odds_r)
                    mean_odds_r = np.mean(odds_r)
                    mean_oddr = np.mean(oddr_l)
                    # print(mean_oddr)
                    mean_difference = np.mean(intervention_prob[i] - original_prob[i])
                    # print(intervention_prob[i])
                    std_deviation_original = np.std(original_odds, ddof=1)
                    std_deviation_intervention = np.std(intervention_odds, ddof=1) 
                    std_deviation_difference = std_deviation_intervention - std_deviation_original
                    logical_cohen_d = cohen_d(original_prob[i], intervention_prob[i])

                    
                    # print(f'{i}: mean_difference: {mean_difference:.4f}')
                    # print(f'{i}: std_deviation_original: {std_deviation_original:.4f}')
                    # print(f'{i}: std_deviation_intervention: {std_deviation_intervention:.4f}')
                    # print(f'{i}: std_deviation_difference: {std_deviation_difference:.4f}')
                    # print(f'logical_cohen_d: {logical_cohen_d:.4f}')
                    
                    writer.writerow([model, label, candidates[i], 
                                     f'{mean_odds_r:.4f}', 
                                     f'{mean_oddr:.4f}',
                                     f'{mean_difference:.4f}', 
                                     f'{std_deviation_original:.4f}', 
                                     f'{std_deviation_intervention:.4f}', 
                                     f'{std_deviation_difference:.4f}', 
                                     f'{logical_cohen_d:.4f}', 
                                     f'{original_fwp:.4f}', 
                                     f'{intervention_fwp:.4f}'])


df_LI_ori_trct_result
++++++++ model=0 ++++++++
------- c=remove ---------
------- c=append ---------
------- c=remove ---------
------- c=append ---------
++++++++ model=1 ++++++++
intervention_fwp: {'add': 0.9885977506637573, 'update': 0.004247460048645735}
intervention_fwp: {'insert': 0.7259172201156616}
intervention_fwp: {'add': 0.9337781071662903}
intervention_fwp: {'add': 0.9425973892211914, 'update': 0.02277393639087677}
intervention_fwp: {'insert': 0.5937519073486328}
intervention_fwp: {'insert': 0.520871102809906}
intervention_fwp: {'add': 0.4911496937274933}
intervention_fwp: {'add': 0.2946743667125702, 'discard': 0.4104529321193695}
intervention_fwp: {'insert': 0.7765082716941833}
original_fwp: {'pop': 0.562346875667572}
intervention_fwp: {'pop': 0.5792185068130493}
intervention_fwp: {'add': 0.6968184113502502}
intervention_fwp: {'add': 0.5917900800704956}
intervention_fwp: {'add': 0.9574002623558044}
intervention_fwp: {'insert': 0.8084030747413635}
intervention_fwp: {'add':

In [76]:
candidates = ["remove", "append"]

for res in full_results_alys:
    print(res)
    output_file = f'result/alys/{res}.csv'
    with open(output_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        writer.writerow(['Model', 'Label', 'Candidate', 'Mean Oddr_label', 'Mean Oddr', 'Mean Difference', 
                         'Std Dev Original', 'Std Dev Intervention', 'Std Dev Difference', 
                         'Logical Cohen D', 'original_fwp', 'intervention_fwp'])
        
        for model in [1, 4]:
            print(f'++++++++ model={model} ++++++++')
            for label in candidates:
                print(f'======== label={label} ========')
                original_prob, intervention_prob, original_odds, intervention_odds, odds_r, oddr_l, original_fwp, intervention_fwp = extract_original_prob(full_results_alys[res], model, label)
                # print(original_prob)
                
                for i in range(len(candidates)):
                    print(f'------- c={candidates[i]} ---------')
                    # print(odds_r)
                    mean_odds_r = np.mean(odds_r)
                    mean_oddr = np.mean(oddr_l)
                    mean_difference = np.mean(intervention_prob[i] - original_prob[i])
                    # print(intervention_prob[i])
                    std_deviation_original = np.std(original_odds, ddof=1)
                    std_deviation_intervention = np.std(intervention_odds, ddof=1) 
                    std_deviation_difference = std_deviation_intervention - std_deviation_original
                    logical_cohen_d = cohen_d(original_prob[i], intervention_prob[i])
                    
                    # print(f'{i}: mean_difference: {mean_difference:.4f}')
                    # print(f'{i}: std_deviation_original: {std_deviation_original:.4f}')
                    # print(f'{i}: std_deviation_intervention: {std_deviation_intervention:.4f}')
                    # print(f'{i}: std_deviation_difference: {std_deviation_difference:.4f}')
                    # print(f'logical_cohen_d: {logical_cohen_d:.4f}')
                    
                    writer.writerow([model, label, candidates[i], 
                                     f'{mean_odds_r:.4f}', 
                                     f'{mean_oddr:.4f}',
                                     f'{mean_difference:.4f}', 
                                     f'{std_deviation_original:.4f}', 
                                     f'{std_deviation_intervention:.4f}', 
                                     f'{std_deviation_difference:.4f}', 
                                     f'{logical_cohen_d:.4f}', 
                                     f'{original_fwp:.4f}', 
                                     f'{intervention_fwp:.4f}'])


df_LI_ori_full_result
++++++++ model=1 ++++++++
intervention_fwp: {'add': 0.9861438274383545}
intervention_fwp: {'add': 0.3997058868408203}
intervention_fwp: {'add': 0.8797391057014465}
intervention_fwp: {'add': 0.9806388020515442, 'update': 0.010261326096951962}
intervention_fwp: {'add': 0.8170274496078491}
intervention_fwp: {'add': 0.6259328722953796}
intervention_fwp: {'add': 0.9494974613189697}
intervention_fwp: {'add': 0.39489635825157166}
intervention_fwp: {'insert': 0.37495097517967224}
original_fwp: {'cancel': 0.16407868266105652}
intervention_fwp: {'cancel': 0.15853558480739594}
intervention_fwp: {'add': 0.41234755516052246}
intervention_fwp: {'insert': 0.6947551369667053}
intervention_fwp: {'add': 0.8029362559318542}
intervention_fwp: {'add': 0.8797781467437744}
intervention_fwp: {'insert': 0.5921135544776917}
------- c=remove ---------
------- c=append ---------
original_fwp: {'add': 0.7041010856628418}
original_fwp: {'add': 0.9688224196434021}
intervention_fwp: {'add': 0.98

## P-Value

In [142]:
from scipy import stats 
from scipy.stats import norm

In [143]:
def calculate_log_odds(probability):
    return math.log(probability / (1 - probability))

In [144]:
def p_value(data1, data2):
    # Ensure the data is one-dimensional by flattening the arrays
    data1 = np.diff(data1).flatten()
    data2 = np.diff(data2).flatten()
    
    print(np.array2string(data1, precision=4))
    print(np.array2string(data2, precision=4))

    # Perform KS test
    ks_statistic, p_value = stats.ks_2samp(data1, data2)

    # Set significance level alpha
    alpha = 0.05

    # Output the results
    print(f"KS statistic: {ks_statistic:.4f}")
    print(f"p-value: {p_value:.4f}")

    if p_value < alpha:
        print("There is a significant difference in the distribution between the two datasets")
    else:
        print("There is no significant difference in the distribution between the two datasets")

In [145]:
def p_value_oddr(data1, data2):
    # Ensure the data is one-dimensional by flattening the arrays
    data1 = data1.flatten()
    data2 = data2.flatten()
    
    #print(np.array2string(data1, precision=4))
    #print(np.array2string(data2, precision=4))

    # Perform KS test
    ks_statistic, p_value = stats.ks_2samp(data1, data2)

    # Set significance level alpha
    alpha = 0.05

    # Output the results
    print(f"KS statistic: {ks_statistic:.4f}")
    print(f"p-value: {p_value:.4f}")

    if p_value < alpha:
        print("There is a significant difference in the distribution between the two datasets")
    else:
        print("There is no significant difference in the distribution between the two datasets")

In [146]:
def extract_prob(data, model):
    p = []
    if model == 4:
        
        for item in data:
            prob = []
            if item['label'] == "remove":
                prob.append(calculate_log_odds(item['original_prob']['remove']))
                prob.append(calculate_log_odds(item['intervention_prob']['remove']))
            
            elif item['label'] == "append":
                #print(type(item['original_prob']['remove']))
                prob.append(calculate_log_odds(item['original_prob']['append']))
                prob.append(calculate_log_odds(item['intervention_prob']['append']))
            
            prob = np.array(prob)
            #print(prob)
            p.append(prob)
        p = np.array(p)

    else: 
        for item in data:
            prob = []
            if item['model_name'] == model:
                if item['label'] == "remove":
                    prob.append(calculate_log_odds(item['original_prob']['remove']))
                    prob.append(calculate_log_odds(item['intervention_prob']['remove']))
              
                elif item['label'] == "append":
                    prob.append(calculate_log_odds(item['original_prob']['append']))
                    prob.append(calculate_log_odds(item['intervention_prob']['append']))
                
                prob = np.array(prob)
                p.append(prob)
        
        p = np.array(p)
        
    # print(np.array2string(p, precision=4))
    return p

In [147]:
def extract_oddr(data, model):
    p = []
    for item in data:
        if item['model_name'] == model:
            p.append(item['odds_r'])
    p = np.array(p)      
    # print(np.array2string(p, precision=4))
    return p

In [None]:
df_LI_ori_full_result
df_LI_ori_trct_result
df_LI_ori_full_no_comments_result
df_LI_ori_trct_no_comments_result
df_LI_pure_full_result
df_LI_pure_trct_result

In [149]:
p_0 = extract_oddr(df_LI_ori_trct_no_comments_result, 2)

p_1 = extract_oddr(df_LI_pure_trct_result, 2)

p_value_oddr(p_0, p_1)

KS statistic: 0.0771
p-value: 0.1454
There is no significant difference in the distribution between the two datasets


In [None]:
p_0 = extract_prob(merged_file, 1)

p_1 = extract_prob(merged_file, 4)

p_value(p_0, p_1)

In [None]:

p_0 = extract_prob(logical_result, 3)

p_1 = extract_prob(cl_logical_result, 4)

p_value(p_0, p_1)

# Backup Blocks

In [None]:
data = []

# Read the JSONL file line by line
file_path = 'data/CodeSearchNet/python_test_0.jsonl'
with open(file_path, 'r') as file:
    for line in file:
        example = json.loads(line)
        #print(example["code"])
        try:
            san_check, label = logical_consistency_check(example["code"])
            if if_logic_check(example["code"]) and san_check:
                exp = dict()
                exp["label"] = label
                exp["code"] = example["code"]
                #print(example["code"])
                #print("############################")
                data.append(exp)
        except SyntaxError:
            continue

df = pd.DataFrame(data)
df = df.loc[:, ~df.T.duplicated()]
print(len(df))
# Display the DataFrame
#

In [224]:
# Read the two JSON files
with open('result/df_LI_ori_full_no_comments_result.json', 'r') as f1, open('result/df_LI_ori_full_no_comments_result4.json', 'r') as f2:
    data1 = json.load(f1)
    data2 = json.load(f2)

# Assuming both JSON files have the same structure, merge their contents
# If the structure is a list, extend the list
if isinstance(data1, list) and isinstance(data2, list):
    data1.extend(data2)
# If the structure is a dictionary, update the dictionary
elif isinstance(data1, dict) and isinstance(data2, dict):
    data1.update(data2)

# Save the merged data to a new file
with open('result/df_LI_ori_full_no_comments_result.json', 'w') as f_out:
    json.dump(data1, f_out, indent=4)

print("JSON files have been successfully merged!")

JSON files have been successfully merged!


In [71]:
import re
from collections import Counter

# Replace this string with the actual content from your dataset
data = """
df_LI_ori_trct_result
++++++++ model=0 ++++++++
======== label=remove ========
model: 0, dataset: df_LI_ori_trct_result, original_fwp: 0.0621, intervention_fwp: 0.0467
------- c=remove ---------
------- c=append ---------
======== label=append ========
model: 0, dataset: df_LI_ori_trct_result, original_fwp: 0.1111, intervention_fwp: 0.1456
------- c=remove ---------
------- c=append ---------
++++++++ model=1 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9885977506637573, 'update': 0.004247460048645735}
intervention_fwp: {'insert': 0.7259172201156616}
intervention_fwp: {'add': 0.9337781071662903}
intervention_fwp: {'add': 0.9425973892211914, 'update': 0.02277393639087677}
intervention_fwp: {'insert': 0.5937519073486328}
intervention_fwp: {'insert': 0.520871102809906}
intervention_fwp: {'add': 0.4911496937274933}
intervention_fwp: {'add': 0.2946743667125702, 'discard': 0.4104529321193695}
intervention_fwp: {'insert': 0.7765082716941833}
original_fwp: {'pop': 0.562346875667572}
intervention_fwp: {'pop': 0.5792185068130493}
intervention_fwp: {'add': 0.6968184113502502}
intervention_fwp: {'add': 0.5917900800704956}
intervention_fwp: {'add': 0.9574002623558044}
intervention_fwp: {'insert': 0.8084030747413635}
intervention_fwp: {'add': 0.46077197790145874}
original_fwp: {'movable': 0.2772509455680847}
intervention_fwp: {'cancel': 0.09122718870639801, 'movable': 0.11140184849500656}
intervention_fwp: {'insert': 0.5286495685577393}
original_fwp: {'pop': 0.46981751918792725}
intervention_fwp: {'pop': 0.47088149189949036}
original_fwp: {'add': 0.503273606300354}
intervention_fwp: {'add': 0.7262123227119446, 'update': 0.08800885826349258, 'discard': 0.09735377132892609}
original_fwp: {'pop': 0.6205573678016663}
intervention_fwp: {'insert': 0.4445091187953949}
intervention_fwp: {'add': 0.6121832132339478}
intervention_fwp: {'add': 0.9683995246887207}
intervention_fwp: {'add': 0.08641311526298523, 'clear': 0.08511625975370407, 'discard': 0.5732493996620178}
intervention_fwp: {'insert': 0.5567502975463867}
model: 1, dataset: df_LI_ori_trct_result, original_fwp: 0.0023, intervention_fwp: 0.0145
------- c=remove ---------
------- c=append ---------
======== label=append ========
intervention_fwp: {'add': 0.7797855734825134}
original_fwp: {'extend': 0.5640265941619873}
original_fwp: {'add': 0.12974831461906433}
intervention_fwp: {'add': 0.11046813428401947, 'update': 0.11326395720243454}
original_fwp: {'add': 0.774141788482666}
original_fwp: {'add': 0.49932777881622314}
intervention_fwp: {'get': 0.10282234847545624, 'const': 0.08346079289913177, 'update': 0.15112105011940002, 'pop': 0.398102343082428}
intervention_fwp: {'pop': 0.4879702627658844}
original_fwp: {'add': 0.587414562702179}
intervention_fwp: {'extend': 0.6968972086906433}
original_fwp: {'update': 0.492929071187973}
intervention_fwp: {'update': 0.4919142723083496, 'pop': 0.32830867171287537}
intervention_fwp: {'pop': 0.7521874904632568}
intervention_fwp: {'index': 0.3237031400203705}
original_fwp: {'add': 0.5649189949035645}
intervention_fwp: {'get': 0.44158753752708435}
original_fwp: {'insert': 0.5369459390640259}
original_fwp: {'put': 0.60103440284729}
intervention_fwp: {'put': 0.4950496256351471}
model: 1, dataset: df_LI_ori_trct_result, original_fwp: 0.0041, intervention_fwp: 0.0068
------- c=remove ---------
------- c=append ---------
++++++++ model=2 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9954816102981567, 'update': 0.0025999490171670914}
intervention_fwp: {'add': 0.9686837196350098, 'update': 0.020064854994416237}
original_fwp: {'insert': 0.5117871761322021}
intervention_fwp: {'insert': 0.5443623065948486}
intervention_fwp: {'add': 0.9415323734283447, 'update': 0.03569142520427704}
intervention_fwp: {'insert': 0.7332772016525269}
intervention_fwp: {'add': 0.8058857917785645}
intervention_fwp: {'insert': 0.8830679059028625}
original_fwp: {'pop': 0.5736905336380005}
intervention_fwp: {'add': 0.1208975538611412, 'pop': 0.45209065079689026}
intervention_fwp: {'add': 0.9676408171653748}
intervention_fwp: {'add': 0.9043576121330261}
original_fwp: {'pop': 0.4756520986557007}
intervention_fwp: {'insert': 0.7384058237075806}
intervention_fwp: {'add': 0.4273505210876465}
original_fwp: {'m': 0.3931162357330322}
intervention_fwp: {'m': 0.39034977555274963}
intervention_fwp: {'insert': 0.7648029327392578}
intervention_fwp: {'add': 0.4361956715583801}
intervention_fwp: {'pop': 0.5973809957504272}
original_fwp: {'add': 0.5451529026031494, 'update': 0.30902206897735596}
intervention_fwp: {'add': 0.8920137882232666, 'update': 0.09194901585578918}
original_fwp: {'pop': 0.7147256731987}
intervention_fwp: {'pop': 0.5478392839431763}
intervention_fwp: {'insert': 0.4399441182613373}
intervention_fwp: {'add': 0.3647574782371521}
intervention_fwp: {'add': 0.9850239753723145}
intervention_fwp: {'disc': 0.8341965675354004}
intervention_fwp: {'add': 0.5201635956764221}
intervention_fwp: {'insert': 0.6119810342788696}
model: 2, dataset: df_LI_ori_trct_result, original_fwp: 0.0032, intervention_fwp: 0.0132
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.6163228154182434}
intervention_fwp: {'add': 0.7300710082054138}
original_fwp: {'add': 0.7554648518562317}
original_fwp: {'ext': 0.6377891898155212}
original_fwp: {'set': 0.06277170777320862, 'add': 0.19336965680122375, 'init': 0.04282814636826515, 'initial': 0.07850123941898346}
intervention_fwp: {'set': 0.05886644124984741, 'add': 0.20475517213344574, 'initial': 0.048087190836668015}
original_fwp: {'add': 0.5432972311973572}
intervention_fwp: {'pop': 0.26845332980155945}
intervention_fwp: {'pop': 0.499235063791275}
original_fwp: {'add': 0.10113874077796936, 'create': 0.12242551147937775, 'update': 0.47208088636398315}
intervention_fwp: {'pop': 0.4163692891597748, 'update': 0.10086539387702942}
intervention_fwp: {'pop': 0.5427217483520508}
original_fwp: {'insert': 0.9379494190216064}
original_fwp: {'insert': 0.9379494190216064}
original_fwp: {'insert': 0.9379494190216064}
original_fwp: {'insert': 0.9379494190216064}
original_fwp: {'add': 0.5322639346122742}
original_fwp: {'add': 0.6338354349136353}
intervention_fwp: {'add': 0.2635672688484192}
original_fwp: {'insert': 0.6018227338790894}
original_fwp: {'put': 0.5134071111679077}
intervention_fwp: {'put': 0.5408995747566223}
original_fwp: {'add': 0.24001339077949524, 'clear': 0.2445792406797409}
intervention_fwp: {'add': 0.5522896647453308}
original_fwp: {'add': 0.6940090656280518}
intervention_fwp: {'add': 0.7561600208282471}
original_fwp: {'add': 0.505304217338562}
model: 2, dataset: df_LI_ori_trct_result, original_fwp: 0.0104, intervention_fwp: 0.0059
------- c=remove ---------
------- c=append ---------
++++++++ model=3 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9841232895851135}
intervention_fwp: {'add': 0.862647533416748}
intervention_fwp: {'add': 0.988174557685852, 'update': 0.0067722853273153305}
intervention_fwp: {'add': 0.40779775381088257}
intervention_fwp: {'insert': 0.5669946074485779}
intervention_fwp: {'insert': 0.5769962668418884}
intervention_fwp: {'add': 0.8137428164482117}
original_fwp: {'pop': 0.5255576372146606}
intervention_fwp: {'pop': 0.4627396762371063}
intervention_fwp: {'insert': 0.8055028915405273}
intervention_fwp: {'pop': 0.38154372572898865}
intervention_fwp: {'add': 0.9859753251075745, 'update': 0.007503072265535593}
intervention_fwp: {'insert': 0.5649457573890686}
intervention_fwp: {'insert': 0.6451932191848755}
original_fwp: {'mov': 0.2968297004699707}
intervention_fwp: {'add': 0.4025750756263733}
intervention_fwp: {'pop': 0.7836820483207703}
original_fwp: {'add': 0.8457867503166199, 'update': 0.13473466038703918}
intervention_fwp: {'add': 0.9176396131515503, 'update': 0.07542162388563156}
original_fwp: {'pop': 0.6472800970077515}
intervention_fwp: {'insert': 0.6575207710266113}
intervention_fwp: {'insert': 0.47473573684692383}
intervention_fwp: {'add': 0.9917877316474915}
intervention_fwp: {'add': 0.13527268171310425, 'dis': 0.6124354600906372}
intervention_fwp: {'pop': 0.37607723474502563}
model: 3, dataset: df_LI_ori_trct_result, original_fwp: 0.0023, intervention_fwp: 0.0113
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.6217426657676697}
intervention_fwp: {'add': 0.6467124223709106}
original_fwp: {'extend': 0.6992896795272827}
intervention_fwp: {'add': 0.33340370655059814}
original_fwp: {'put': 0.16342276334762573, 'add': 0.2494419813156128}
intervention_fwp: {'put': 0.16339479386806488, 'add': 0.14800338447093964}
intervention_fwp: {'get': 0.3950515389442444}
original_fwp: {'add': 0.8851026296615601}
original_fwp: {'add': 0.5237538814544678}
original_fwp: {'insert': 0.5168802738189697}
intervention_fwp: {'pop': 0.742258608341217}
intervention_fwp: {'pop': 0.751979649066925}
intervention_fwp: {'add': 0.4719800353050232}
intervention_fwp: {'add': 0.2928810119628906}
intervention_fwp: {'extend': 0.82419753074646}
original_fwp: {'update': 0.344739705324173}
intervention_fwp: {'get': 0.135283425450325, 'pop': 0.6009194850921631}
original_fwp: {'add': 0.5156364440917969}
original_fwp: {'add': 0.4588679373264313}
intervention_fwp: {'pop': 0.846985936164856}
original_fwp: {'add': 0.6825919151306152}
intervention_fwp: {'add': 0.44227728247642517}
original_fwp: {'add': 0.621279239654541}
intervention_fwp: {'add': 0.4862836003303528}
intervention_fwp: {'add': 0.43774569034576416}
original_fwp: {'add': 0.5336500406265259}
intervention_fwp: {'add': 0.4593619108200073}
original_fwp: {'add': 0.5022145509719849}
model: 3, dataset: df_LI_ori_trct_result, original_fwp: 0.0063, intervention_fwp: 0.0077
------- c=remove ---------
------- c=append ---------
++++++++ model=4 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.987185537815094}
intervention_fwp: {'add': 0.9116663932800293}
intervention_fwp: {'insert': 0.8607230186462402}
intervention_fwp: {'add': 0.9873583912849426, 'update': 0.006095405668020248}
intervention_fwp: {'insert': 0.5610396862030029}
intervention_fwp: {'insert': 0.73902827501297}
intervention_fwp: {'insert': 0.5659794211387634}
intervention_fwp: {'add': 0.5936259031295776}
intervention_fwp: {'insert': 0.5669643878936768}
intervention_fwp: {'add': 0.37705621123313904}
intervention_fwp: {'add': 0.9666764140129089}
intervention_fwp: {'add': 0.9809781312942505, 'update': 0.006508470047265291}
intervention_fwp: {'add': 0.8721379041671753}
intervention_fwp: {'insert': 0.5659610629081726}
intervention_fwp: {'insert': 0.6790409684181213}
intervention_fwp: {'add': 0.6935329437255859}
intervention_fwp: {'insert': 0.7190003991127014}
intervention_fwp: {'insert': 0.7590340971946716}
intervention_fwp: {'insert': 0.6788584589958191}
intervention_fwp: {'insert': 0.6996474862098694}
intervention_fwp: {'insert': 0.5058002471923828}
original_fwp: {'add': 0.7693291306495667}
intervention_fwp: {'add': 0.9387349486351013}
original_fwp: {'pop': 0.5799311995506287}
intervention_fwp: {'insert': 0.7599472403526306}
intervention_fwp: {'add': 0.56437748670578}
intervention_fwp: {'add': 0.9824458360671997}
model: 4, dataset: df_LI_ori_trct_result, original_fwp: 0.0009, intervention_fwp: 0.0122
------- c=remove ---------
------- c=append ---------
======== label=append ========
intervention_fwp: {'add': 0.6241732835769653}
original_fwp: {'add': 0.7672460079193115}
intervention_fwp: {'add': 0.8704443573951721}
original_fwp: {'add': 0.5195450186729431}
original_fwp: {'extend': 0.6220223903656006}
original_fwp: {'name': 0.04846392571926117, 'add': 0.07781700789928436, 'state': 0.08871646225452423, 'src': 0.060943353921175, 'dst': 0.05307833477854729}
intervention_fwp: {'add': 0.10126032680273056, 'state': 0.07110821455717087, 'src': 0.06366322934627533, 'dst': 0.05264831706881523}
original_fwp: {'add': 0.6187714338302612}
intervention_fwp: {'pop': 0.7426562309265137}
intervention_fwp: {'pop': 0.5936304926872253}
original_fwp: {'add': 0.5433432459831238}
intervention_fwp: {'add': 0.6682712435722351}
original_fwp: {'update': 0.37791821360588074}
intervention_fwp: {'pop': 0.47630923986434937}
intervention_fwp: {'pop': 0.5248090624809265}
original_fwp: {'insert': 0.5078334808349609}
original_fwp: {'insert': 0.5078334808349609}
original_fwp: {'insert': 0.5078334808349609}
original_fwp: {'insert': 0.5078334808349609}
original_fwp: {'add': 0.6665483117103577}
intervention_fwp: {'add': 0.5628003478050232}
original_fwp: {'add': 0.6102060079574585}
intervention_fwp: {'get': 0.47486498951911926, 'set': 0.07029098272323608, 'pop': 0.06329969316720963, 'move': 0.11561238765716553}
original_fwp: {'put': 0.4700819253921509}
intervention_fwp: {'put': 0.41693201661109924}
original_fwp: {'add': 0.6183913946151733}
intervention_fwp: {'add': 0.5997216701507568}
original_fwp: {'add': 0.490450918674469}
intervention_fwp: {'add': 0.6549684405326843}
model: 4, dataset: df_LI_ori_trct_result, original_fwp: 0.0091, intervention_fwp: 0.0086
------- c=remove ---------
------- c=append ---------
df_LI_ori_trct_no_comments_result
++++++++ model=0 ++++++++
======== label=remove ========
model: 0, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0794, intervention_fwp: 0.0590
------- c=remove ---------
------- c=append ---------
======== label=append ========
model: 0, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0939, intervention_fwp: 0.1238
------- c=remove ---------
------- c=append ---------
++++++++ model=1 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9781916737556458}
intervention_fwp: {'insert': 0.536973774433136}
intervention_fwp: {'add': 0.9598625302314758}
intervention_fwp: {'add': 0.9404275417327881, 'update': 0.02231759764254093}
intervention_fwp: {'add': 0.36918264627456665, 'discard': 0.21330168843269348}
original_fwp: {'pop': 0.5350970029830933}
intervention_fwp: {'pop': 0.5363098382949829}
intervention_fwp: {'add': 0.6928135752677917}
intervention_fwp: {'add': 0.926997721195221}
intervention_fwp: {'add': 0.8896533250808716}
intervention_fwp: {'insert': 0.5335948467254639}
intervention_fwp: {'add': 0.3874327838420868}
original_fwp: {'movable': 0.2740425169467926}
original_fwp: {'pop': 0.4863077998161316}
intervention_fwp: {'pop': 0.5015342831611633}
original_fwp: {'add': 0.4695027768611908}
intervention_fwp: {'add': 0.742496132850647, 'update': 0.10380945354700089, 'discard': 0.07937394827604294}
intervention_fwp: {'insert': 0.6385295391082764}
intervention_fwp: {'add': 0.5297046899795532}
intervention_fwp: {'add': 0.9791655540466309}
intervention_fwp: {'add': 0.3605439364910126, 'discard': 0.3919135630130768}
intervention_fwp: {'insert': 0.5987640619277954}
model: 1, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0018, intervention_fwp: 0.0104
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.48272669315338135}
intervention_fwp: {'pop': 0.5723229050636292}
intervention_fwp: {'pop': 0.8337919116020203}
original_fwp: {'add': 0.6206165552139282}
intervention_fwp: {'extend': 0.5152082443237305}
original_fwp: {'update': 0.4409290552139282}
intervention_fwp: {'update': 0.20299795269966125, 'pop': 0.6434696316719055}
intervention_fwp: {'pop': 0.8443832993507385}
original_fwp: {'insert': 0.5335869789123535}
original_fwp: {'insert': 0.5335869789123535}
original_fwp: {'insert': 0.5335869789123535}
original_fwp: {'insert': 0.5335869789123535}
original_fwp: {'add': 0.49458974599838257}
intervention_fwp: {'get': 0.33138325810432434}
original_fwp: {'insert': 0.641633927822113}
original_fwp: {'put': 0.6088629364967346}
intervention_fwp: {'put': 0.5286192297935486}
model: 1, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0045, intervention_fwp: 0.0036
------- c=remove ---------
------- c=append ---------
++++++++ model=2 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9929429888725281, 'update': 0.004268424119800329}
intervention_fwp: {'add': 0.9619088768959045, 'update': 0.028713978826999664}
intervention_fwp: {'add': 0.9430352449417114, 'update': 0.0350077785551548}
intervention_fwp: {'add': 0.7240521907806396}
intervention_fwp: {'add': 0.42464184761047363}
intervention_fwp: {'add': 0.21383248269557953, 'pop': 0.2222590148448944}
intervention_fwp: {'add': 0.9478970170021057}
intervention_fwp: {'add': 0.6938250064849854}
intervention_fwp: {'add': 0.8501530289649963}
intervention_fwp: {'insert': 0.5484541654586792}
original_fwp: {'m': 0.4517790973186493}
intervention_fwp: {'m': 0.341178834438324}
intervention_fwp: {'pop': 0.5410934686660767}
original_fwp: {'add': 0.45880287885665894, 'update': 0.23125743865966797}
intervention_fwp: {'add': 0.7399349212646484, 'disc': 0.01112061645835638, 'update': 0.22166505455970764}
intervention_fwp: {'insert': 0.6551833748817444}
intervention_fwp: {'add': 0.9857221841812134}
intervention_fwp: {'disc': 0.8745322823524475}
original_fwp: {'insert': 0.35787081718444824}
intervention_fwp: {'insert': 0.6000126004219055}
model: 2, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0018, intervention_fwp: 0.0104
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.6111046671867371}
original_fwp: {'ext': 0.5682685375213623}
original_fwp: {'ep': 0.02207188494503498, 'set': 0.07760654389858246, 'add': 0.14930889010429382, 'state': 0.026485200971364975, 'init': 0.025198614224791527, 'initial': 0.029822900891304016}
intervention_fwp: {'ep': 0.026055702939629555, 'set': 0.07236598432064056, 'add': 0.1472635269165039, 'state': 0.026330677792429924, 'update': 0.02169971354305744}
intervention_fwp: {'pop': 0.3263740837574005}
intervention_fwp: {'add': 0.3855275809764862}
original_fwp: {'set': 0.08487926423549652, 'create': 0.23385010659694672, 'update': 0.462893009185791}
intervention_fwp: {'pop': 0.5738379955291748}
intervention_fwp: {'pop': 0.5380003452301025}
original_fwp: {'insert': 0.8660899996757507}
original_fwp: {'insert': 0.8660899996757507}
original_fwp: {'insert': 0.8660899996757507}
original_fwp: {'insert': 0.8660899996757507}
original_fwp: {'insert': 0.6497363448143005}
original_fwp: {'put': 0.536738932132721}
intervention_fwp: {'put': 0.5545938014984131}
original_fwp: {'add': 0.2235245704650879, 'clear': 0.23291872441768646}
intervention_fwp: {'add': 0.43927425146102905}
original_fwp: {'add': 0.5273163318634033}
intervention_fwp: {'add': 0.5701369047164917}
model: 2, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0091, intervention_fwp: 0.0054
------- c=remove ---------
------- c=append ---------
++++++++ model=3 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9901683330535889, 'update': 0.005502750631421804}
intervention_fwp: {'add': 0.7892836332321167}
intervention_fwp: {'add': 0.9490519762039185}
intervention_fwp: {'add': 0.8100200891494751}
intervention_fwp: {'add': 0.9004435539245605}
intervention_fwp: {'add': 0.9229769706726074}
intervention_fwp: {'pop': 0.19500529766082764, 'insert': 0.5501407980918884}
intervention_fwp: {'pop': 0.5952497720718384}
original_fwp: {'add': 0.8496349453926086, 'update': 0.06900501251220703}
intervention_fwp: {'add': 0.9264359474182129, 'update': 0.052676670253276825}
intervention_fwp: {'insert': 0.5804579854011536}
intervention_fwp: {'add': 0.9904534816741943, 'update': 0.004502595402300358}
intervention_fwp: {'dis': 0.6192796230316162}
intervention_fwp: {'add': 0.21834242343902588, 'pop': 0.3972841203212738}
model: 3, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0009, intervention_fwp: 0.0082
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.5000093579292297}
original_fwp: {'extend': 0.6680612564086914}
original_fwp: {'add': 0.2518676817417145}
intervention_fwp: {'add': 0.22698503732681274}
original_fwp: {'update': 0.48877671360969543}
original_fwp: {'add': 0.5008516907691956}
intervention_fwp: {'pop': 0.8156130313873291}
original_fwp: {'extend': 0.5595085024833679}
intervention_fwp: {'extend': 0.9250568747520447}
original_fwp: {'update': 0.5191511511802673}
intervention_fwp: {'pop': 0.762586772441864}
original_fwp: {'add': 0.5746309757232666}
original_fwp: {'insert': 0.5010316967964172}
intervention_fwp: {'put': 0.4758489429950714}
original_fwp: {'add': 0.3747273087501526}
model: 3, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0045, intervention_fwp: 0.0023
------- c=remove ---------
------- c=append ---------
++++++++ model=4 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9781936407089233}
intervention_fwp: {'add': 0.895851194858551}
intervention_fwp: {'insert': 0.5301320552825928}
intervention_fwp: {'</s>': 0.0015368044842034578, 'add': 0.9880821704864502, 'update': 0.006346762180328369}
intervention_fwp: {'insert': 0.5622511506080627}
intervention_fwp: {'insert': 0.7546125650405884}
intervention_fwp: {'add': 0.5848480463027954}
intervention_fwp: {'add': 0.3137357831001282}
intervention_fwp: {'add': 0.920865535736084}
intervention_fwp: {'add': 0.9759371280670166}
intervention_fwp: {'add': 0.7197677493095398}
intervention_fwp: {'insert': 0.6331783533096313}
intervention_fwp: {'add': 0.6879061460494995}
original_fwp: {'mov': 0.28922703862190247}
original_fwp: {'add': 0.6189495921134949}
intervention_fwp: {'add': 0.8671256899833679}
intervention_fwp: {'insert': 0.7028546333312988}
intervention_fwp: {'add': 0.9901345372200012, 'update': 0.003797160694375634}
intervention_fwp: {'add': 0.7200026512145996}
model: 4, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0009, intervention_fwp: 0.0091
------- c=remove ---------
------- c=append ---------
======== label=append ========
intervention_fwp: {'add': 0.4759240746498108}
original_fwp: {'extend': 0.7054625153541565}
original_fwp: {'name': 0.04596017301082611, 'add': 0.08833552151918411, 'state': 0.07008557766675949, 'epsilon': 0.0633317157626152}
intervention_fwp: {'add': 0.10739389806985855, 'state': 0.06004687026143074, 'epsilon': 0.060521576553583145}
intervention_fwp: {'pop': 0.6535822749137878}
intervention_fwp: {'pop': 0.6826566457748413}
original_fwp: {'add': 0.6281572580337524}
intervention_fwp: {'add': 0.43194612860679626}
intervention_fwp: {'put': 0.45659127831459045}
original_fwp: {'update': 0.37722715735435486}
intervention_fwp: {'pop': 0.530626118183136}
original_fwp: {'insert': 0.5167708992958069}
original_fwp: {'insert': 0.5167708992958069}
original_fwp: {'insert': 0.5167708992958069}
original_fwp: {'insert': 0.5167708992958069}
intervention_fwp: {'get': 0.3246886432170868}
original_fwp: {'insert': 0.5152035355567932}
original_fwp: {'put': 0.5360047221183777}
intervention_fwp: {'put': 0.5523877739906311}
intervention_fwp: {'add': 0.5797390341758728}
model: 4, dataset: df_LI_ori_trct_no_comments_result, original_fwp: 0.0059, intervention_fwp: 0.0054
------- c=remove ---------
------- c=append ---------
df_LI_pure_trct_result
++++++++ model=0 ++++++++
======== label=remove ========
model: 0, dataset: df_LI_pure_trct_result, original_fwp: 0.4689, intervention_fwp: 0.4798
------- c=remove ---------
------- c=append ---------
======== label=append ========
model: 0, dataset: df_LI_pure_trct_result, original_fwp: 2.2027, intervention_fwp: 2.5184
------- c=remove ---------
------- c=append ---------
++++++++ model=1 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.7307244539260864}
intervention_fwp: {'add': 0.7180950045585632}
original_fwp: {'close': 0.1956019103527069}
original_fwp: {'discard': 0.4486599266529083}
intervention_fwp: {'discard': 0.3367103636264801}
intervention_fwp: {'add': 0.6821617484092712}
original_fwp: {'add': 0.18533383309841156, 'update': 0.05980478599667549}
intervention_fwp: {'add': 0.44213995337486267}
intervention_fwp: {'add': 0.7303839921951294}
original_fwp: {'py': 0.3794572949409485}
original_fwp: {'iv': 0.015009230934083462, 'name': 0.013339915312826633, 'other': 0.017338106408715248, 'x': 0.02180517092347145}
intervention_fwp: {'iv': 0.026286141946911812, 'add': 0.02214815653860569, 'other': 0.0378478541970253}
original_fwp: {'replace': 0.24641455709934235}
original_fwp: {'add': 0.08509468287229538}
intervention_fwp: {'add': 0.7490432262420654}
intervention_fwp: {'add': 0.7619584202766418}
model: 1, dataset: df_LI_pure_trct_result, original_fwp: 0.0050, intervention_fwp: 0.0050
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'set': 0.09141473472118378, 'add': 0.05025344341993332, 'init': 0.030260827392339706}
intervention_fwp: {'set': 0.0870344415307045, 'add': 0.05231514945626259, 'start': 0.024521486833691597, 'init': 0.028115738183259964}
original_fwp: {'update': 0.30818861722946167}
intervention_fwp: {'pop': 0.36652660369873047}
intervention_fwp: {'children': 0.2433655709028244}
intervention_fwp: {'children': 0.2433655709028244}
original_fwp: {'write': 0.07658201456069946, 'insert': 0.05857652425765991, 'underline': 0.325810045003891}
intervention_fwp: {'\n': 0.05623655021190643, 'py': 0.06034728139638901, 'write': 0.04261636361479759, 'replace': 0.054793186485767365, 'underline': 0.20981952548027039}
original_fwp: {'set': 0.06387142091989517, 'add': 0.052326783537864685}
intervention_fwp: {'set': 0.052974995225667953}
intervention_fwp: {'students': 0.2629130780696869}
original_fwp: {'add': 0.5020744204521179}
intervention_fwp: {'add': 0.3399837613105774}
intervention_fwp: {'update': 0.2472241222858429, 'pop': 0.2413516640663147}
intervention_fwp: {'push': 0.5461854934692383}
original_fwp: {'add': 0.519639253616333}
intervention_fwp: {'add': 0.4143705666065216}
original_fwp: {'on': 0.014938630163669586, 'is': 0.010039886459708214, 'set': 0.023104945197701454, 'text': 0.012663556262850761, 'add': 0.013805543072521687, 'data': 0.009813075885176659, 'type': 0.007203651126474142, 'message': 0.01197498757392168, 'post': 0.00871934462338686, 'tag': 0.009552465751767159, 'content': 0.012959908694028854, 'style': 0.016047120094299316, 'child': 0.008739657700061798, 'author': 0.03915886953473091, 'update': 0.007081587333232164, 'children': 0.23787789046764374, 'tags': 0.009273412637412548, 'className': 0.00715386588126421}
intervention_fwp: {'on': 0.012395947240293026, 'is': 0.013577979989349842, 'set': 0.020454933866858482, 'text': 0.017386386170983315, 'add': 0.007883011363446712, 'data': 0.010163077153265476, 'type': 0.007485325913876295, 'message': 0.006013435311615467, 'value': 0.009057659655809402, 'title': 0.006600990891456604, 'post': 0.007627430837601423, 'tag': 0.00947960838675499, 'content': 0.015554051846265793, 'show': 0.0065304613672196865, 'parent': 0.007357142865657806, 'style': 0.019826961681246758, 'child': 0.010812267661094666, 'author': 0.07468017190694809, 'render': 0.005554691422730684, 'children': 0.21222525835037231, 'className': 0.006767331622540951, 'signature': 0.023902932181954384}
original_fwp: {'add': 0.6652109026908875}
intervention_fwp: {'get': 0.48839107155799866}
intervention_fwp: {'add': 0.43372631072998047}
original_fwp: {'id': 0.028056442737579346, 'message': 0.02051306888461113, 'error': 0.09331754595041275, 'body': 0.029308609664440155, 'code': 0.03576856851577759, 'status': 0.3117424249649048, 'json': 0.03994140401482582, 'write': 0.08338996767997742, 'render': 0.031225522980093956, 'send': 0.02707502990961075, 'headers': 0.011848164722323418, 'redirect': 0.01689162291586399}
intervention_fwp: {'id': 0.08822408318519592, 'body': 0.03806822746992111, 'status': 0.11175651103258133, 'write': 0.1804105043411255, 'render': 0.04708264395594597, 'send': 0.03888990357518196, 'redirect': 0.030644740909337997}
original_fwp: {'extend': 0.49307048320770264}
intervention_fwp: {'extend': 0.5012409687042236}
intervention_fwp: {'write': 0.6867082715034485}
model: 1, dataset: df_LI_pure_trct_result, original_fwp: 0.0195, intervention_fwp: 0.0236
------- c=remove ---------
------- c=append ---------
++++++++ model=2 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9284368753433228}
intervention_fwp: {'add': 0.8140295743942261}
original_fwp: {'add': 0.40118342638015747}
intervention_fwp: {'insert': 0.6820997595787048}
intervention_fwp: {'insert': 0.5142550468444824}
intervention_fwp: {'add': 0.84641033411026, 'update': 0.07544368505477905}
intervention_fwp: {'add': 0.916498601436615}
original_fwp: {'__': 0.8898828625679016}
intervention_fwp: {'__': 0.8850440979003906, 'add': 0.07967843115329742}
original_fwp: {'pop': 0.4579015076160431}
intervention_fwp: {'add': 0.796055018901825}
original_fwp: {'iv': 0.05508105084300041, '__': 0.05553677678108215, 'add': 0.055541861802339554}
intervention_fwp: {'iv': 0.03943067789077759, '__': 0.044303182512521744, 'other': 0.01978302001953125, 'set': 0.017826439812779427, 'add': 0.0549306683242321, 'log': 0.043820954859256744, 'assert': 0.03413951024413109, 'fail': 0.10553847253322601}
original_fwp: {'update': 0.4539138674736023}
intervention_fwp: {'update': 0.40000206232070923}
original_fwp: {'m': 0.34667590260505676}
intervention_fwp: {'m': 0.30691033601760864}
intervention_fwp: {'insert': 0.7289740443229675}
intervention_fwp: {'insert': 0.7289740443229675}
intervention_fwp: {'insert': 0.7289740443229675}
intervention_fwp: {'insert': 0.7289740443229675}
intervention_fwp: {'insert': 0.7289740443229675}
intervention_fwp: {'insert': 0.7289740443229675}
intervention_fwp: {'insert': 0.5142550468444824}
original_fwp: {'add': 0.5304600596427917, 'update': 0.2888906002044678}
intervention_fwp: {'add': 0.6857105493545532, 'update': 0.2012610137462616}
intervention_fwp: {'add': 0.8481905460357666}
intervention_fwp: {'add': 0.49164947867393494}
intervention_fwp: {'add': 0.4821379482746124}
original_fwp: {'replace': 0.5304045081138611}
intervention_fwp: {'add': 0.6555558443069458}
intervention_fwp: {'add': 0.5440947413444519}
intervention_fwp: {'add': 0.6555558443069458}
intervention_fwp: {'add': 0.5440947413444519}
intervention_fwp: {'add': 0.735165536403656}
intervention_fwp: {'add': 0.952078640460968}
intervention_fwp: {'add': 0.6114962100982666}
model: 2, dataset: df_LI_pure_trct_result, original_fwp: 0.0050, intervention_fwp: 0.0177
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'set': 0.06903577595949173, 'add': 0.10385660082101822}
intervention_fwp: {'set': 0.07849220931529999, 'add': 0.0817214846611023}
original_fwp: {'insert': 0.8205035924911499}
original_fwp: {'ext': 0.2936650514602661}
intervention_fwp: {'ext': 0.21477928757667542, 'Make': 0.25360432267189026}
intervention_fwp: {'pop': 0.334502249956131}
original_fwp: {'insert': 0.6129054427146912}
original_fwp: {'insert': 0.6129054427146912}
intervention_fwp: {'pop': 0.6352652907371521}
original_fwp: {'add': 0.5921764969825745}
original_fwp: {'set': 0.125126451253891, 'under': 0.08971015363931656, 'update': 0.4023844599723816}
intervention_fwp: {'set': 0.24364158511161804, 'pop': 0.16541427373886108, 'update': 0.150361105799675}
intervention_fwp: {'pop': 0.9512971639633179}
original_fwp: {'add': 0.16844120621681213, 'Add': 0.20597735047340393, 'update': 0.034897446632385254, 'decl': 0.07297876477241516}
intervention_fwp: {'re': 0.10972855240106583, 'set': 0.04881934076547623, 'add': 0.08585335314273834, 'Add': 0.06012307479977608, 'var': 0.047455791383981705, 'update': 0.05968068912625313, 'decl': 0.06611696630716324}
intervention_fwp: {'pop': 0.40568646788597107}
intervention_fwp: {'pop': 0.42333126068115234}
original_fwp: {'add': 0.4462141692638397}
intervention_fwp: {'add': 0.43148666620254517}
original_fwp: {'add': 0.6597766876220703}
intervention_fwp: {'pop': 0.6897398829460144}
original_fwp: {'insert': 0.7412012815475464}
original_fwp: {'insert': 0.7412012815475464}
original_fwp: {'insert': 0.7412012815475464}
original_fwp: {'insert': 0.7412012815475464}
original_fwp: {'add': 0.9417262077331543}
intervention_fwp: {'add': 0.4620085656642914}
original_fwp: {'add': 0.34793955087661743}
original_fwp: {'insert': 0.6820997595787048}
intervention_fwp: {'add': 0.45731306076049805}
original_fwp: {'add': 0.6540526151657104}
intervention_fwp: {'pop': 0.455486536026001, 'update': 0.31203046441078186}
original_fwp: {'insert': 0.6437975764274597}
original_fwp: {'add': 0.4821379482746124}
original_fwp: {'add': 0.554705023765564}
intervention_fwp: {'add': 0.6594126224517822}
original_fwp: {'update': 0.19381378591060638}
original_fwp: {'add': 0.48255589604377747}
model: 2, dataset: df_LI_pure_trct_result, original_fwp: 0.0132, intervention_fwp: 0.0118
------- c=remove ---------
------- c=append ---------
++++++++ model=3 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.940187931060791}
intervention_fwp: {'add': 0.9137312173843384}
original_fwp: {'add': 0.4122893512248993}
intervention_fwp: {'add': 0.7536669373512268}
original_fwp: {'pop': 0.5009499788284302}
intervention_fwp: {'insert': 0.4836854934692383}
original_fwp: {'add': 0.31184449791908264, 'update': 0.3019008934497833}
intervention_fwp: {'add': 0.6067508459091187, 'update': 0.2756552994251251}
intervention_fwp: {'add': 0.4662765562534332}
intervention_fwp: {'add': 0.8948740363121033}
original_fwp: {'add': 0.03468566760420799, '__': 0.8933820128440857}
intervention_fwp: {'add': 0.10904537886381149, '__': 0.8067973852157593}
intervention_fwp: {'add': 0.8142315745353699}
original_fwp: {'iv': 0.010372788645327091, 'set': 0.014888517558574677, 'log': 0.009714159183204174, 'add': 0.017429830506443977, 'other': 0.011672098189592361, 'data': 0.01703854463994503, 'assert': 0.20769155025482178, 'logger': 0.01015095692127943}
intervention_fwp: {'in': 0.005805280525237322, 'iv': 0.030186310410499573, 'set': 0.010248380713164806, 'log': 0.011497536674141884, 'add': 0.031494345515966415, 'other': 0.020895015448331833, 'data': 0.020700717344880104, 'error': 0.00515549024567008, 'write': 0.010785454884171486, 'state': 0.008299676701426506, 'graph': 0.009051620028913021, 'update': 0.006960161030292511, 'items': 0.009128042496740818, 'cache': 0.00604815362021327, 'assert': 0.10759284347295761, 'errors': 0.0054178498685359955, 'fail': 0.010801760479807854, 'logger': 0.012809120118618011, 'd': 0.0057617295533418655, 'p': 0.005458301864564419, 'v': 0.005902641918510199}
original_fwp: {'update': 0.6840522289276123}
intervention_fwp: {'update': 0.5525150895118713}
intervention_fwp: {'mov': 0.2539083659648895}
original_fwp: {'update': 0.16817772388458252, 'pop': 0.34646984934806824}
intervention_fwp: {'update': 0.5748257637023926}
intervention_fwp: {'pop': 0.5585528612136841}
original_fwp: {'add': 0.5957733392715454}
intervention_fwp: {'add': 0.7052223682403564}
original_fwp: {'pop': 0.6748833060264587}
intervention_fwp: {'add': 0.8553111553192139}
intervention_fwp: {'add': 0.6016659140586853}
original_fwp: {'replace': 0.7043213844299316}
intervention_fwp: {'add': 0.5901064276695251}
intervention_fwp: {'add': 0.6631190180778503}
intervention_fwp: {'add': 0.5901064276695251}
intervention_fwp: {'add': 0.6631190180778503}
intervention_fwp: {'add': 0.915987491607666}
intervention_fwp: {'add': 0.5189983248710632}
model: 3, dataset: df_LI_pure_trct_result, original_fwp: 0.0091, intervention_fwp: 0.0204
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.7104085683822632}
original_fwp: {'put': 0.07732243835926056, 'set': 0.06325382739305496, 'add': 0.11503443121910095}
intervention_fwp: {'put': 0.07283864170312881, 'set': 0.07446505129337311, 'add': 0.09992159157991409, 'start': 0.04037274420261383}
intervention_fwp: {'pop': 0.1346243917942047, 'move': 0.20849284529685974}
intervention_fwp: {'update': 0.35250210762023926, 'pop': 0.21258991956710815}
original_fwp: {'add': 0.274183452129364}
intervention_fwp: {'pop': 0.6192936301231384}
intervention_fwp: {'pop': 0.5009499788284302}
original_fwp: {'update': 0.5480363965034485}
intervention_fwp: {'replace': 0.5153729915618896}
original_fwp: {'up': 0.052005600184202194, 'update': 0.9400563836097717}
intervention_fwp: {'st': 0.012601735070347786, 'get': 0.1911633163690567, 'update': 0.07920563966035843, 'pop': 0.5046040415763855, 's': 0.0471598282456398}
intervention_fwp: {'pop': 0.357448548078537}
original_fwp: {'add': 0.7554248571395874}
original_fwp: {'add': 0.41776931285858154}
original_fwp: {'add': 0.8470052480697632}
original_fwp: {'update': 0.42128947377204895}
intervention_fwp: {'update': 0.12759795784950256, 'pop': 0.6418160796165466}
original_fwp: {'insert': 0.5769050717353821}
original_fwp: {'insert': 0.5769050717353821}
original_fwp: {'insert': 0.5769050717353821}
original_fwp: {'insert': 0.5769050717353821}
original_fwp: {'add': 0.965018093585968}
original_fwp: {'add': 0.6798717975616455}
intervention_fwp: {'add': 0.703541100025177}
original_fwp: {'add': 0.5468127131462097}
original_fwp: {'add': 0.9376652836799622}
intervention_fwp: {'pop': 0.37724658846855164}
intervention_fwp: {'pop': 0.6748833060264587}
original_fwp: {'insert': 0.5471838116645813}
original_fwp: {'put': 0.4452872574329376}
intervention_fwp: {'put': 0.4091254770755768}
original_fwp: {'extend': 0.4178018867969513}
intervention_fwp: {'extend': 0.4218408763408661}
original_fwp: {'update': 0.549670934677124}
intervention_fwp: {'pop': 0.4302120804786682}
original_fwp: {'add': 0.33044907450675964}
original_fwp: {'add': 0.4234217405319214}
intervention_fwp: {'add': 0.3563505709171295}
intervention_fwp: {'add': 0.3335952162742615}
model: 3, dataset: df_LI_pure_trct_result, original_fwp: 0.0118, intervention_fwp: 0.0122
------- c=remove ---------
------- c=append ---------
++++++++ model=4 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.7447202205657959}
original_fwp: {'auto': 0.4315970838069916, 'pop': 0.24709439277648926}
intervention_fwp: {'add': 0.7341050505638123}
original_fwp: {'pop': 0.5058456063270569}
intervention_fwp: {'add': 0.49942469596862793}
intervention_fwp: {'add': 0.429625928401947}
intervention_fwp: {'add': 0.8765988945960999}
intervention_fwp: {'add': 0.5221789479255676, '__': 0.21933947503566742, 'update': 0.059982266277074814}
intervention_fwp: {'add': 0.7895264625549316}
original_fwp: {'iv': 0.08110073208808899, 'set': 0.029924338683485985, 'add': 0.026292992755770683, 'other': 0.04706772044301033, 'assert': 0.02049216255545616}
intervention_fwp: {'iv': 0.045705005526542664, 'add': 0.04289843142032623, 'other': 0.06569023430347443}
original_fwp: {'update': 0.6606190204620361}
intervention_fwp: {'update': 0.6923558115959167}
original_fwp: {'mov': 0.2911413908004761}
intervention_fwp: {'mov': 0.23420555889606476}
intervention_fwp: {'insert': 0.6995643973350525}
intervention_fwp: {'insert': 0.6995643973350525}
intervention_fwp: {'insert': 0.6995643973350525}
intervention_fwp: {'insert': 0.6995643973350525}
intervention_fwp: {'insert': 0.6995643973350525}
intervention_fwp: {'insert': 0.6995643973350525}
intervention_fwp: {'add': 0.579572319984436}
intervention_fwp: {'add': 0.5642294883728027}
original_fwp: {'pop': 0.38552218675613403}
intervention_fwp: {'add': 0.49842017889022827}
intervention_fwp: {'add': 0.49842017889022827}
intervention_fwp: {'add': 0.5796363353729248}
intervention_fwp: {'add': 0.855521559715271}
intervention_fwp: {'add': 0.6390576362609863}
model: 4, dataset: df_LI_pure_trct_result, original_fwp: 0.0050, intervention_fwp: 0.0122
------- c=remove ---------
------- c=append ---------
======== label=append ========
intervention_fwp: {'get': 0.28496912121772766}
original_fwp: {'add': 0.5841529965400696}
intervention_fwp: {'add': 0.534379243850708}
original_fwp: {'set': 0.1138308197259903, 'add': 0.14487257599830627, 'eps': 0.1289530098438263}
intervention_fwp: {'set': 0.10592195391654968, 'add': 0.14684252440929413, 'eps': 0.11323673278093338}
original_fwp: {'add': 0.4213337302207947, 'update': 0.1918252557516098}
intervention_fwp: {'pop': 0.17673268914222717}
intervention_fwp: {'pop': 0.5476714372634888}
intervention_fwp: {'pop': 0.4571877121925354}
intervention_fwp: {'pop': 0.5663975477218628}
intervention_fwp: {'pop': 0.5058456063270569}
original_fwp: {'add': 0.6341819167137146}
intervention_fwp: {'add': 0.632655143737793}
intervention_fwp: {'pop': 0.5466986894607544}
original_fwp: {'set': 0.03138573467731476, 'add': 0.04101185128092766, 'new': 0.0332627147436142, 'under': 0.04188583791255951, 'update': 0.14990542829036713, 'merge': 0.018670760095119476, 'underline': 0.43113040924072266}
intervention_fwp: {'pop': 0.4722045063972473, 'underline': 0.12375050783157349}
original_fwp: {'set': 0.05650356411933899, 'add': 0.13568998873233795, 'new': 0.04421784356236458, 'var': 0.03722439706325531, 'update': 0.023657137528061867, 'push': 0.02222740650177002, 'dec': 0.05482349917292595, 'vars': 0.07017497718334198, 'variables': 0.026284392923116684}
intervention_fwp: {'set': 0.08021119982004166, 'add': 0.07030027359724045, 'new': 0.011007776483893394, 'var': 0.058541785925626755, 'check': 0.01504240371286869, 'write': 0.013564431108534336, 'update': 0.05335172638297081, 'push': 0.019262440502643585, 'dec': 0.05800417438149452, 'define': 0.014799593016505241, 'register': 0.021563654765486717, 'vars': 0.07507872581481934, 'variables': 0.022544313222169876}
original_fwp: {'update': 0.37441152334213257}
intervention_fwp: {'pop': 0.7290171384811401}
original_fwp: {'insert': 0.5058579444885254}
original_fwp: {'insert': 0.5058579444885254}
original_fwp: {'insert': 0.5058579444885254}
original_fwp: {'insert': 0.5058579444885254}
original_fwp: {'add': 0.9310867190361023}
original_fwp: {'add': 0.4504271149635315}
original_fwp: {'add': 0.5556736588478088}
original_fwp: {'add': 0.30024057626724243}
intervention_fwp: {'pop': 0.4497368037700653}
intervention_fwp: {'pop': 0.38552218675613403}
original_fwp: {'insert': 0.5399940609931946}
original_fwp: {'put': 0.46688178181648254}
intervention_fwp: {'put': 0.4404583275318146}
intervention_fwp: {'pop': 0.5333792567253113}
original_fwp: {'add': 0.5767186284065247}
model: 4, dataset: df_LI_pure_trct_result, original_fwp: 0.0159, intervention_fwp: 0.0145
------- c=remove ---------
------- c=append ---------
"""

# Define the words to exclude
exclude_words = {"remove", "append", "model", "dataset", "label","original_fwp", "intervention_fwp", 
                 "c","df_LI_ori_trct_no_comments_result", "df_LI_pure_trct_result", "df_LI_pure_trct_result"}

# Use regex to extract all words
pattern = r'\b\w+\b'
words = re.findall(pattern, data)

# Function to check if a word is a number
def is_number(word):
    try:
        float(word)  # Try to convert to a float
        return True
    except ValueError:
        return False

# Filter out excluded words and numbers
filtered_words = [word for word in words if word.lower() not in exclude_words and not is_number(word)]
word_counts = Counter(filtered_words)

# Get the most common word and the number of unique words
most_common_word = word_counts.most_common(1)
total_unique_words = len(word_counts)

# Display results
print(f"Most common word: {most_common_word}")
print(f"Total unique words: {total_unique_words}")
print(f"Top 10 most frequent words (excluding certain words): {word_counts.most_common(5)}")

Most common word: [('add', 309)]
Total unique words: 105
Top 10 most frequent words (excluding certain words): [('add', 309), ('insert', 114), ('pop', 95), ('update', 79), ('set', 28)]


In [77]:
import re
from collections import Counter

# Replace this string with the actual content from your dataset
data = """
df_LI_ori_full_result
++++++++ model=1 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9861438274383545}
intervention_fwp: {'add': 0.3997058868408203}
intervention_fwp: {'add': 0.8797391057014465}
intervention_fwp: {'add': 0.9806388020515442, 'update': 0.010261326096951962}
intervention_fwp: {'add': 0.8170274496078491}
intervention_fwp: {'add': 0.6259328722953796}
intervention_fwp: {'add': 0.9494974613189697}
intervention_fwp: {'add': 0.39489635825157166}
intervention_fwp: {'insert': 0.37495097517967224}
original_fwp: {'cancel': 0.16407868266105652}
intervention_fwp: {'cancel': 0.15853558480739594}
intervention_fwp: {'add': 0.41234755516052246}
intervention_fwp: {'insert': 0.6947551369667053}
intervention_fwp: {'add': 0.8029362559318542}
intervention_fwp: {'add': 0.8797781467437744}
intervention_fwp: {'insert': 0.5921135544776917}
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.7041010856628418}
original_fwp: {'add': 0.9688224196434021}
intervention_fwp: {'add': 0.9815187454223633}
original_fwp: {'add': 0.8174375891685486}
original_fwp: {'extend': 0.510227382183075}
original_fwp: {'add': 0.5977165699005127}
intervention_fwp: {'add': 0.581664502620697}
original_fwp: {'add': 0.8052302002906799}
original_fwp: {'add': 0.9353036284446716}
original_fwp: {'add': 0.7602394223213196}
original_fwp: {'add': 0.7516935467720032}
intervention_fwp: {'pop': 0.44526466727256775}
original_fwp: {'add': 0.8881365060806274}
intervention_fwp: {'add': 0.6659936308860779}
original_fwp: {'add': 0.6470773220062256}
intervention_fwp: {'add': 0.5326777696609497}
original_fwp: {'add': 0.5695221424102783}
original_fwp: {'add': 0.5077329874038696}
original_fwp: {'add': 0.6385462880134583}
original_fwp: {'add': 0.4278179109096527}
intervention_fwp: {'pop': 0.45631101727485657}
original_fwp: {'add': 0.4603258967399597}
intervention_fwp: {'add': 0.376981258392334}
original_fwp: {'add': 0.6067583560943604}
intervention_fwp: {'add': 0.5522392988204956}
original_fwp: {'add': 0.6315104365348816}
original_fwp: {'add': 0.25237101316452026, 'pop': 0.25291457772254944}
intervention_fwp: {'pop': 0.7389711141586304}
intervention_fwp: {'add': 0.5818426609039307}
original_fwp: {'add': 0.5605104565620422}
original_fwp: {'add': 0.6717709302902222}
intervention_fwp: {'add': 0.6946812272071838}
intervention_fwp: {'add': 0.5691129565238953}
original_fwp: {'add': 0.5491039752960205}
original_fwp: {'add': 0.9150079488754272}
original_fwp: {'add': 0.6551560759544373}
original_fwp: {'add': 0.8085636496543884}
intervention_fwp: {'add': 0.7262106537818909}
original_fwp: {'add': 0.7633610367774963}
intervention_fwp: {'add': 0.7112000584602356}
original_fwp: {'add': 0.5250813364982605}
intervention_fwp: {'add': 0.48215821385383606}
original_fwp: {'put': 0.45903894305229187}
intervention_fwp: {'put': 0.5242992043495178}
original_fwp: {'add': 0.6342307329177856}
intervention_fwp: {'add': 0.36022287607192993}
original_fwp: {'add': 0.6244564652442932}
original_fwp: {'clear': 0.5188520550727844}
original_fwp: {'add': 0.4499640464782715}
original_fwp: {'add': 0.6962037086486816}
intervention_fwp: {'add': 0.4107723534107208}
------- c=remove ---------
------- c=append ---------
++++++++ model=4 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9338160753250122}
intervention_fwp: {'insert': 0.5680399537086487}
intervention_fwp: {'add': 0.9097412824630737}
intervention_fwp: {'add': 0.984567403793335}
intervention_fwp: {'add': 0.394367516040802}
original_fwp: {'pop': 0.5548143982887268}
intervention_fwp: {'add': 0.9516754746437073}
intervention_fwp: {'insert': 0.5301755666732788}
intervention_fwp: {'insert': 0.5948013663291931}
intervention_fwp: {'insert': 0.6593859195709229}
intervention_fwp: {'insert': 0.5309407114982605}
original_fwp: {'add': 0.511887788772583}
intervention_fwp: {'add': 0.8363267183303833, 'dis': 0.07902999222278595}
original_fwp: {'pop': 0.600420355796814}
intervention_fwp: {'add': 0.4615928530693054}
intervention_fwp: {'insert': 0.5888818502426147}
intervention_fwp: {'add': 0.8996500372886658}
intervention_fwp: {'dis': 0.7159737348556519}
intervention_fwp: {'add': 0.8967579007148743}
------- c=remove ---------
------- c=append ---------
======== label=append ========
intervention_fwp: {'add': 0.5763678550720215}
original_fwp: {'add': 0.7764259576797485}
intervention_fwp: {'add': 0.8871493339538574}
intervention_fwp: {'add': 0.3415398895740509}
original_fwp: {'add': 0.49004533886909485}
intervention_fwp: {'pop': 0.7790586352348328}
intervention_fwp: {'pop': 0.5542710423469543}
intervention_fwp: {'add': 0.599001944065094}
original_fwp: {'update': 0.4732375144958496}
intervention_fwp: {'pop': 0.7891442179679871}
intervention_fwp: {'pop': 0.7236312627792358}
intervention_fwp: {'index': 0.5499401092529297}
original_fwp: {'add': 0.5558591485023499}
intervention_fwp: {'add': 0.5564286112785339}
original_fwp: {'add': 0.7186375856399536}
intervention_fwp: {'get': 0.511501669883728, 'set': 0.06214878708124161, 'update': 0.10231467336416245, 'pop': 0.09911677241325378}
original_fwp: {'put': 0.570513129234314}
intervention_fwp: {'put': 0.5346524119377136}
------- c=remove ---------
------- c=append ---------
df_LI_ori_full_no_comments_result
++++++++ model=1 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.9821317195892334}
intervention_fwp: {'add': 0.47119536995887756}
intervention_fwp: {'add': 0.9241858720779419}
intervention_fwp: {'add': 0.979176938533783, 'update': 0.013805538415908813, 'discard': 0.003177391365170479}
intervention_fwp: {'add': 0.3775925040245056}
intervention_fwp: {'add': 0.8572362661361694}
intervention_fwp: {'add': 0.4874364733695984}
intervention_fwp: {'add': 0.612821102142334}
intervention_fwp: {'add': 0.6403253078460693}
intervention_fwp: {'add': 0.9021339416503906}
intervention_fwp: {'add': 0.7981094121932983}
intervention_fwp: {'add': 0.5150665044784546}
intervention_fwp: {'add': 0.47012659907341003}
intervention_fwp: {'insert': 0.32042425870895386}
intervention_fwp: {'add': 0.5540226101875305}
intervention_fwp: {'add': 0.5174457430839539}
intervention_fwp: {'add': 0.3871724605560303}
intervention_fwp: {'insert': 0.5173981785774231}
intervention_fwp: {'add': 0.9155519008636475}
intervention_fwp: {'add': 0.3374543786048889}
intervention_fwp: {'add': 0.8862939476966858}
intervention_fwp: {'insert': 0.5954159498214722}
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.6891977787017822}
original_fwp: {'add': 0.8290351629257202}
intervention_fwp: {'add': 0.7542498111724854}
original_fwp: {'add': 0.5693124532699585}
original_fwp: {'add': 0.5724673271179199}
intervention_fwp: {'add': 0.5804336071014404}
original_fwp: {'add': 0.779615581035614}
original_fwp: {'add': 0.8375365734100342}
original_fwp: {'add': 0.7315745949745178}
original_fwp: {'add': 0.7222445607185364}
intervention_fwp: {'pop': 0.4731707274913788}
original_fwp: {'add': 0.9201357364654541}
original_fwp: {'add': 0.5952513217926025}
intervention_fwp: {'add': 0.3393113911151886}
original_fwp: {'add': 0.5195792317390442}
original_fwp: {'add': 0.6740632653236389}
original_fwp: {'add': 0.411783367395401}
intervention_fwp: {'pop': 0.46532952785491943}
intervention_fwp: {'add': 0.37953051924705505}
original_fwp: {'add': 0.6923221349716187}
original_fwp: {'add': 0.71134352684021}
intervention_fwp: {'add': 0.7527459859848022}
original_fwp: {'add': 0.6743127703666687}
original_fwp: {'add': 0.38108086585998535}
intervention_fwp: {'pop': 0.5168314576148987}
original_fwp: {'add': 0.5873278975486755}
intervention_fwp: {'add': 0.5845529437065125}
original_fwp: {'add': 0.6083191633224487}
original_fwp: {'add': 0.7599453330039978}
original_fwp: {'add': 0.6337910890579224}
original_fwp: {'add': 0.9070636630058289}
original_fwp: {'add': 0.5073329210281372}
original_fwp: {'add': 0.819183349609375}
original_fwp: {'add': 0.7406485080718994}
intervention_fwp: {'add': 0.45861169695854187}
original_fwp: {'add': 0.49437811970710754}
original_fwp: {'add': 0.5311265587806702}
intervention_fwp: {'add': 0.5861382484436035}
original_fwp: {'add': 0.708745539188385}
intervention_fwp: {'add': 0.5911656618118286}
original_fwp: {'add': 0.5230812430381775}
intervention_fwp: {'add': 0.4299483895301819}
original_fwp: {'add': 0.6171647310256958}
original_fwp: {'put': 0.5849779844284058}
intervention_fwp: {'put': 0.6572856903076172}
intervention_fwp: {'add': 0.44238460063934326}
original_fwp: {'add': 0.8230077624320984}
original_fwp: {'add': 0.7136648893356323}
original_fwp: {'clear': 0.4536531865596771}
original_fwp: {'add': 0.4578307867050171}
intervention_fwp: {'add': 0.4551982283592224}
original_fwp: {'add': 0.5415171384811401}
intervention_fwp: {'add': 0.3561682403087616}
------- c=remove ---------
------- c=append ---------
++++++++ model=4 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.7878040075302124}
intervention_fwp: {'add': 0.8767528533935547}
intervention_fwp: {'add': 0.9885901808738708, 'dis': 0.0017970354529097676, 'update': 0.005871654488146305}
original_fwp: {'pop': 0.5423898100852966}
intervention_fwp: {'add': 0.9406726956367493}
intervention_fwp: {'insert': 0.501537024974823}
intervention_fwp: {'insert': 0.5614913702011108}
intervention_fwp: {'insert': 0.5041330456733704}
original_fwp: {'add': 0.6290820240974426}
intervention_fwp: {'add': 0.8441379070281982, 'dis': 0.07770510017871857}
intervention_fwp: {'insert': 0.6322267651557922}
intervention_fwp: {'add': 0.31231966614723206}
intervention_fwp: {'insert': 0.6297119855880737}
intervention_fwp: {'add': 0.9613640308380127}
intervention_fwp: {'add': 0.16569481790065765, 'dis': 0.5158516764640808}
intervention_fwp: {'add': 0.9067719578742981}
------- c=remove ---------
------- c=append ---------
======== label=append ========
intervention_fwp: {'add': 0.4965062439441681}
original_fwp: {'add': 0.5013487935066223}
intervention_fwp: {'pop': 0.6575992107391357}
intervention_fwp: {'pop': 0.6617828011512756}
original_fwp: {'add': 0.5484610199928284}
original_fwp: {'update': 0.38444626331329346}
intervention_fwp: {'pop': 0.7648675441741943}
intervention_fwp: {'pop': 0.5893438458442688}
original_fwp: {'add': 0.6029345393180847}
intervention_fwp: {'get': 0.3903411030769348, 'pop': 0.2165026068687439}
original_fwp: {'put': 0.5723193883895874}
intervention_fwp: {'put': 0.5582693815231323}
------- c=remove ---------
------- c=append ---------
df_LI_pure_full_result
++++++++ model=1 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.8507484793663025}
intervention_fwp: {'add': 0.5167202949523926}
intervention_fwp: {'add': 0.9536956548690796}
original_fwp: {'__': 0.4059888422489166}
intervention_fwp: {'__': 0.3708905875682831, 'add': 0.2711113691329956}
intervention_fwp: {'add': 0.8548133969306946}
intervention_fwp: {'add': 0.4946174919605255}
original_fwp: {'add': 0.2927361726760864}
intervention_fwp: {'add': 0.3998708128929138}
intervention_fwp: {'add': 0.6937190890312195}
intervention_fwp: {'insert': 0.7178293466567993}
intervention_fwp: {'add': 0.9430505037307739}
intervention_fwp: {'add': 0.7866805195808411}
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.6078202724456787}
original_fwp: {'register': 0.35210925340652466}
original_fwp: {'add': 0.5652760863304138}
intervention_fwp: {'add': 0.7212516069412231}
original_fwp: {'add': 0.20944097638130188, 'attrib': 0.4107857346534729}
intervention_fwp: {'set': 0.06500419974327087, 'add': 0.09644156694412231, 'attrib': 0.5390746593475342}
original_fwp: {'set': 0.048063959926366806, 'var': 0.0409863255918026, 'add': 0.18529963493347168, 'vars': 0.10536649823188782, 'declare': 0.06984733045101166}
intervention_fwp: {'set': 0.07075310498476028, 'add': 0.060769569128751755, 'delete': 0.052278440445661545, 'vars': 0.09327677637338638}
intervention_fwp: {'import': 0.05069311708211899, 'add': 0.04470239579677582, 'load': 0.10900483280420303, 'delete': 0.0629376471042633, 'students': 0.12635041773319244}
original_fwp: {'add': 0.4883575141429901}
intervention_fwp: {'add': 0.5815098881721497}
original_fwp: {'add': 0.49203410744667053}
original_fwp: {'add': 0.6661430597305298}
intervention_fwp: {'add': 0.4974740147590637}
intervention_fwp: {'pop': 0.5451957583427429}
intervention_fwp: {'push': 0.4093502461910248}
original_fwp: {'set': 0.08577985316514969, '__': 0.030568255111575127, 'author': 0.05369563400745392, 'update': 0.04246249049901962, 'props': 0.03156868368387222, 'children': 0.18673965334892273, '_': 0.05546651780605316}
intervention_fwp: {'set': 0.0952235609292984, '__': 0.017604243010282516, 'style': 0.014032221399247646, 'author': 0.13513876497745514, 'update': 0.038737110793590546, 'props': 0.03086625225841999, 'children': 0.1385388970375061, 'signature': 0.04521645978093147, '_': 0.035664524883031845, '[': 0.014566619880497456}
original_fwp: {'add': 0.8937824964523315}
intervention_fwp: {'add': 0.36451080441474915}
original_fwp: {'add': 0.4493906497955322}
intervention_fwp: {'add': 0.2823263108730316}
original_fwp: {'put': 0.4910889267921448}
intervention_fwp: {'put': 0.520919919013977}
original_fwp: {'add': 0.613438606262207}
intervention_fwp: {'add': 0.5124375820159912}
intervention_fwp: {'add': 0.3524300754070282}
original_fwp: {'add': 0.36074724793434143}
original_fwp: {'id': 0.010631892830133438, 'set': 0.017553117126226425, 'add': 0.025757430121302605, 'data': 0.05909264087677002, 'message': 0.010768468491733074, 'body': 0.1629609614610672, 'status': 0.14878800511360168, 'json': 0.03366817161440849, 'write': 0.024345003068447113, 'response': 0.011789822019636631, 'update': 0.12466180324554443, 'output': 0.010851684026420116, 'meta': 0.008382844738662243, 'success': 0.008169686421751976, 'headers': 0.07088669389486313, 'payload': 0.010109095834195614, 'metadata': 0.013183338567614555, 'outputs': 0.014460849575698376}
intervention_fwp: {'id': 0.008749539963901043, 'return': 0.008573177270591259, 'set': 0.020191757008433342, 'add': 0.026795484125614166, 'data': 0.0667305663228035, 'body': 0.21725580096244812, 'status': 0.05585425719618797, 'json': 0.02449006587266922, 'write': 0.019067933782935143, 'response': 0.009552037343382835, 'update': 0.19749096035957336, 'params': 0.008392232470214367, 'meta': 0.009784827008843422, 'success': 0.009826570749282837, 'headers': 0.067415252327919, 'payload': 0.014324435964226723, 'metadata': 0.023290978744626045, 'outputs': 0.008497054688632488}
intervention_fwp: {'add': 0.3104475438594818}
intervention_fwp: {'print': 0.14665015041828156, 'write': 0.5498785376548767}
intervention_fwp: {'write': 0.36411118507385254}
intervention_fwp: {'add': 0.4049190282821655}
------- c=remove ---------
------- c=append ---------
++++++++ model=4 ++++++++
======== label=remove ========
intervention_fwp: {'add': 0.5293076038360596}
intervention_fwp: {'insert': 0.6167139410972595}
intervention_fwp: {'add': 0.7486050724983215}
intervention_fwp: {'add': 0.5568966865539551}
intervention_fwp: {'add': 0.3501109778881073}
intervention_fwp: {'add': 0.9139577746391296}
intervention_fwp: {'add': 0.7401097416877747}
original_fwp: {'update': 0.3266609311103821, 'pop': 0.29059311747550964}
intervention_fwp: {'update': 0.5503045320510864, 'pop': 0.1283714771270752}
intervention_fwp: {'_': 0.25362905859947205}
original_fwp: {'pop': 0.3847061097621918}
intervention_fwp: {'add': 0.6414976119995117}
intervention_fwp: {'add': 0.4332803785800934}
original_fwp: {'pop': 0.3827062249183655}
intervention_fwp: {'add': 0.6777617931365967}
intervention_fwp: {'add': 0.5232357978820801}
intervention_fwp: {'insert': 0.7219967842102051}
intervention_fwp: {'add': 0.8214640021324158}
intervention_fwp: {'add': 0.947385311126709}
original_fwp: {'pop': 0.42640364170074463}
------- c=remove ---------
------- c=append ---------
======== label=append ========
original_fwp: {'add': 0.6510722041130066}
intervention_fwp: {'add': 0.4997943341732025}
intervention_fwp: {'pop': 0.5142274498939514}
original_fwp: {'add': 0.5121105909347534}
intervention_fwp: {'pop': 0.43264779448509216}
intervention_fwp: {'pop': 0.6271057724952698}
intervention_fwp: {'pop': 0.3893094062805176}
intervention_fwp: {'pop': 0.544145941734314}
original_fwp: {'add': 0.5015624165534973}
intervention_fwp: {'add': 0.482611745595932}
intervention_fwp: {'pop': 0.6061204075813293}
original_fwp: {'set': 0.0879531055688858, 'add': 0.2048744261264801}
intervention_fwp: {'set': 0.1852160394191742, 'add': 0.12969449162483215}
intervention_fwp: {'pop': 0.5231583714485168}
intervention_fwp: {'pop': 0.6684147119522095}
original_fwp: {'update': 0.40982210636138916}
intervention_fwp: {'update': 0.4493100643157959}
original_fwp: {'add': 0.4795832335948944}
intervention_fwp: {'pop': 0.5166886448860168}
original_fwp: {'add': 0.5794475674629211}
original_fwp: {'get': 0.030893541872501373, 'add': 0.48724302649497986, 'load': 0.031744856387376785, 'create': 0.06437919288873672, 'write': 0.04812558367848396, 'update': 0.04243218153715134, 'insert': 0.11154747009277344, 'delete': 0.015090061351656914}
intervention_fwp: {'delete': 0.34252724051475525}
intervention_fwp: {'update': 0.16929872334003448, 'pop': 0.38373515009880066}
intervention_fwp: {'pop': 0.3827062249183655}
original_fwp: {'put': 0.5909109711647034}
intervention_fwp: {'put': 0.40962687134742737}
intervention_fwp: {'update': 0.45472902059555054}
intervention_fwp: {'pop': 0.5046800374984741}
original_fwp: {'add': 0.6587096452713013}
intervention_fwp: {'add': 0.67823725938797}
------- c=remove ---------
------- c=append ---------
"""

# Define the words to exclude
exclude_words = {"remove", "append", "model", "dataset", "label","original_fwp", "intervention_fwp", 
                 "c","df_LI_ori_trct_no_comments_result", "df_LI_pure_trct_result", "df_LI_pure_trct_result"}

# Use regex to extract all words
pattern = r'\b\w+\b'
words = re.findall(pattern, data)

# Function to check if a word is a number
def is_number(word):
    try:
        float(word)  # Try to convert to a float
        return True
    except ValueError:
        return False

# Filter out excluded words and numbers
filtered_words = [word for word in words if word.lower() not in exclude_words and not is_number(word)]
word_counts = Counter(filtered_words)

# Get the most common word and the number of unique words
most_common_word = word_counts.most_common(1)
total_unique_words = len(word_counts)

# Display results
print(f"Most common word: {most_common_word}")
print(f"Total unique words: {total_unique_words}")
print(f"Top 10 most frequent words (excluding certain words): {word_counts.most_common(5)}")

Most common word: [('add', 214)]
Total unique words: 52
Top 10 most frequent words (excluding certain words): [('add', 214), ('pop', 38), ('insert', 21), ('update', 17), ('put', 12)]
