In [1]:
import pandas as pd
import os
import difflib
import subprocess

In [2]:
APPROACH = "spinfer"

In [3]:
def collect_cocci_files(path: str, approach: str)->list:
    outputs = []
    for root, dirnames, files in os.walk(path):
        for f in files:
            if f.endswith(f".{approach}.cocci"):
                # f = f.split(".")[:-1]
                # f = ".".join(f)
                outputs.append(
                    {
                        "root": root,
                        "cocci_filename": f
                    }
                )
    return pd.DataFrame(outputs)

In [4]:
PATH = "."
df = collect_cocci_files(PATH, APPROACH)

In [5]:
df.cocci_filename.value_counts()

cocci_filename
final_response_0.spinfer.cocci    14
Name: count, dtype: int64

In [6]:
def collect_c_files(path: str)->list:
    outputs = []
    for root, dirnames, files in os.walk(path):
        for f in files:
            if f.endswith(f".res.c.sanitized.res.c") and "gpt" not in f and "spinfer" not in f and "deepseek" not in f and "claude" not in f:
                f = f.split(".")[0]
                outputs.append(
                    {
                        "root": root,
                        "c_filename": f
                    }
                )
    return pd.DataFrame(outputs)

In [7]:
df_c = collect_c_files(PATH)
df_c.root.value_counts()

root
./kees_timer1             3
./EXP0-7                  2
./snd_soc                 2
./tcf_block_get-61        2
./dasd_smalloc            2
./sock_poll_wait-84       1
./ttm_bo_init-60          1
./random_ether_addr-84    1
./perf_evlist__mmap-69    1
./uartlite                1
./early_memunmap          1
./dma_pool_alloc-52       1
./free_bootmem-77         1
./tcaction                1
Name: count, dtype: int64

In [8]:
df_c.c_filename.to_list()

['1533533124_2018-08-06_4fd786e6c3d6_qgroup___btrfs_qgroup_release_data',
 '1533533124_2018-08-06_4fd786e6c3d6_disk-io___setup_root',
 '1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr_init_qdisc',
 '1513791319_2017-12-20_8d1a77f974ca_sch_htb_htb_init',
 '1540294839_2018-10-23_89ab066d4229_tcp_tcp_poll',
 '1529689860_2018-06-22_6c1f0a1ffb7c_gemini_gemini_ethernet_port_probe',
 '1524629994_2018-04-25_e4b31b816c47_mt2701-afe-pcm_mt2701_dlm_fe_startup',
 '1524629994_2018-04-25_e4b31b816c47_mtk-afe-fe-dai_mtk_afe_fe_trigger',
 '1519311151_2018-02-22_724daa4fd65d_nouveau_bo_nouveau_bo_new',
 '1360605874_2013-02-11_6d53c3b71d32_uartlite_ulite_get_poll_char',
 '1512266437_2017-12-03_f74b9d3a1ac2_bpf_do_test',
 '1424769208_2015-02-24_8d4a40bc0651_setup_relocate_initrd',
 '1508184939_2017-10-16_b9eaf1872222_fas216_fas216_init',
 '1508184939_2017-10-16_b9eaf1872222_isdn_common_isdn_init',
 '1508184939_2017-10-16_b9eaf1872222_dasd_dasd_alloc_device',
 '1518632953_2018-02-14_ddd05979f89c_bdc_ep_ep_b

In [9]:
for i, row in df_c.sample(n=5, random_state=123).iterrows():
    print(row["root"], row["c_filename"])

./kees_timer1 1508184939_2017-10-16_b9eaf1872222_dasd_dasd_alloc_device
./random_ether_addr-84 1529689860_2018-06-22_6c1f0a1ffb7c_gemini_gemini_ethernet_port_probe
./sock_poll_wait-84 1540294839_2018-10-23_89ab066d4229_tcp_tcp_poll
./tcaction 1534706529_2018-08-19_244cd96adb5f_en_tc_parse_tc_fdb_actions
./ttm_bo_init-60 1519311151_2018-02-22_724daa4fd65d_nouveau_bo_nouveau_bo_new


In [10]:
for i, row in df.sample(n=5, random_state=123).iterrows():
    print(row["root"], row["cocci_filename"])

./perf_evlist__mmap-69 final_response_0.spinfer.cocci
./dma_pool_alloc-52 final_response_0.spinfer.cocci
./snd_soc final_response_0.spinfer.cocci
./EXP0-7 final_response_0.spinfer.cocci
./ttm_bo_init-60 final_response_0.spinfer.cocci


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   root            14 non-null     object
 1   cocci_filename  14 non-null     object
dtypes: object(2)
memory usage: 356.0+ bytes


In [12]:
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   root        20 non-null     object
 1   c_filename  20 non-null     object
dtypes: object(2)
memory usage: 452.0+ bytes


In [13]:
df = pd.merge(df, df_c, on='root', how='inner')
df.drop_duplicates(inplace=True)
df.shape

(20, 3)

In [14]:
df["pred"] = df["cocci_filename"].apply(lambda x: x.split(".")[0])
df["pred"] = df["pred"].apply(lambda x: x.split("_")[-1])
df.head(20)

Unnamed: 0,root,cocci_filename,c_filename,pred
0,./EXP0-7,final_response_0.spinfer.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,0
1,./EXP0-7,final_response_0.spinfer.cocci,1533533124_2018-08-06_4fd786e6c3d6_disk-io___s...,0
2,./tcf_block_get-61,final_response_0.spinfer.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr...,0
3,./tcf_block_get-61,final_response_0.spinfer.cocci,1513791319_2017-12-20_8d1a77f974ca_sch_htb_htb...,0
4,./sock_poll_wait-84,final_response_0.spinfer.cocci,1540294839_2018-10-23_89ab066d4229_tcp_tcp_poll,0
5,./random_ether_addr-84,final_response_0.spinfer.cocci,1529689860_2018-06-22_6c1f0a1ffb7c_gemini_gemi...,0
6,./snd_soc,final_response_0.spinfer.cocci,1524629994_2018-04-25_e4b31b816c47_mt2701-afe-...,0
7,./snd_soc,final_response_0.spinfer.cocci,1524629994_2018-04-25_e4b31b816c47_mtk-afe-fe-...,0
8,./ttm_bo_init-60,final_response_0.spinfer.cocci,1519311151_2018-02-22_724daa4fd65d_nouveau_bo_...,0
9,./uartlite,final_response_0.spinfer.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,0


# check syntax correctness

In [15]:
def check_syntax_correctness(filepath: str)->bool:
    '''
    Validates a Coccinelle semantic patch by parsing it, writes debug information, and returns a boolean indicating the validity.
    '''
    command = f'spatch --parse-cocci {filepath}'.split()

    try:
        result = subprocess.run(command, check=True, capture_output=True)
        return 1
    except Exception as e:
        print(e)
        return 0

In [18]:
df["is_cocci_valid"] = df.apply(lambda x: check_syntax_correctness(os.path.join(x.root, x.cocci_filename)), axis=1)

Command '['spatch', '--parse-cocci', './tcf_block_get-61/final_response_0.spinfer.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './tcf_block_get-61/final_response_0.spinfer.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './snd_soc/final_response_0.spinfer.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './snd_soc/final_response_0.spinfer.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './uartlite/final_response_0.spinfer.cocci']' returned non-zero exit status 255.


# check how many cocci is successfully applied

In [20]:
df["is_applied"] = df.apply(lambda x:os.path.exists(x.root + "/" + x.c_filename+f".{x.pred}.{APPROACH}.res.c.sanitized.res.c"), axis=1)
df.is_applied.value_counts()

is_applied
True    20
Name: count, dtype: int64

# compute precision and recall

In [21]:
def open_and_read_file_content(filepath: str)->str:
    """Open the input filepath, then read the content"""
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
        return content
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
    except IOError as e:
        print(f"An error occurred while reading the file: {e}")
    return ""

def remove_unmodified_lines(input_diff: list, is_merge: bool = True)->str:
    additions = []
    removals = []
    for line in input_diff:
        if line.startswith("+") and "+++" not in line:
            additions.append(line)
        elif line.startswith("-") and "---" not in line:
            removals.append(line)
    output_str = removals + additions
    if is_merge:
        return "\n".join(output_str)
    else:
        return removals + additions

def get_diff(file_a: str, file_b: str, n_context: int)->str:
    """Get the diff between file_a and file_b using difflib with n_context length"""
    file_a = open_and_read_file_content(file_a)
    if file_a == "" or "parse error :" in file_a or "init_defs_builtins : " in file_a:
        return ""
    file_b = open_and_read_file_content(file_b)
    if file_b == "" or "parse error :" in file_b or "init_defs_builtins : " in file_b:
        return ""

    lines_a = file_a.splitlines()
    lines_b = file_b.splitlines()        
    # Generate the unified diff
    diff = difflib.unified_diff(
        lines_a,
        lines_b,
        fromfile="initial",
        tofile="final",
        lineterm='',
        n=n_context
    )
    # diff = [x for x in diff if not x.startswith("+++") and not x.startswith("---") and not x.startswith("@@")]
    if n_context == 0:
        diff = remove_unmodified_lines(input_diff=diff, is_merge=False)
    # Convert the diff generator to a single string
    diff_text = '\n'.join(diff)
    return diff_text

In [29]:
df["diff_truth"] = df.apply(lambda x:get_diff(
    file_a = os.path.join(x.root, x.c_filename + ".c.sanitized.c"),
    file_b = os.path.join(x.root, x.c_filename + ".res.c.sanitized.res.c"),
    n_context = 0
), axis=1)

In [30]:
df

Unnamed: 0,root,cocci_filename,c_filename,pred,is_cocci_valid,is_applied,diff_truth,diff_pred,diff_5,cocci_fullpath
0,./EXP0-7,final_response_0.spinfer.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,0,1,True,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...,"--- initial\n+++ final\n@@ -13,11 +13,11 @@\n ...",./EXP0-7/final_response_0.spinfer.cocci
1,./EXP0-7,final_response_0.spinfer.cocci,1533533124_2018-08-06_4fd786e6c3d6_disk-io___s...,0,1,True,- root->objectid = objectid;\n+ root->root_k...,- root->objectid = objectid;\n+ root->root_k...,"--- initial\n+++ final\n@@ -3,11 +3,11 @@\n ...",./EXP0-7/final_response_0.spinfer.cocci
2,./tcf_block_get-61,final_response_0.spinfer.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr...,0,0,True,"- err = tcf_block_get(&q->block, &q->filter_l...",,,./tcf_block_get-61/final_response_0.spinfer.cocci
3,./tcf_block_get-61,final_response_0.spinfer.cocci,1513791319_2017-12-20_8d1a77f974ca_sch_htb_htb...,0,0,True,"- err = tcf_block_get(&q->block, &q->filter_l...",,,./tcf_block_get-61/final_response_0.spinfer.cocci
4,./sock_poll_wait-84,final_response_0.spinfer.cocci,1540294839_2018-10-23_89ab066d4229_tcp_tcp_poll,0,1,True,"- sock_poll_wait(file, wait);\n+ sock_poll_w...","- sock_poll_wait(file, wait);\n+ sock_poll_w...","--- initial\n+++ final\n@@ -2,11 +2,11 @@\n {\...",./sock_poll_wait-84/final_response_0.spinfer.c...
5,./random_ether_addr-84,final_response_0.spinfer.cocci,1529689860_2018-06-22_6c1f0a1ffb7c_gemini_gemi...,0,1,True,- random_ether_addr(netdev->dev_addr);\n+...,- random_ether_addr(netdev->dev_addr);\n+...,"--- initial\n+++ final\n@@ -104,11 +104,11 @@\...",./random_ether_addr-84/final_response_0.spinfe...
6,./snd_soc,final_response_0.spinfer.cocci,1524629994_2018-04-25_e4b31b816c47_mt2701-afe-...,0,0,True,- struct snd_soc_pcm_runtime *rtd = substream...,,,./snd_soc/final_response_0.spinfer.cocci
7,./snd_soc,final_response_0.spinfer.cocci,1524629994_2018-04-25_e4b31b816c47_mtk-afe-fe-...,0,0,True,- struct snd_soc_component *component = snd_s...,,,./snd_soc/final_response_0.spinfer.cocci
8,./ttm_bo_init-60,final_response_0.spinfer.cocci,1519311151_2018-02-22_724daa4fd65d_nouveau_bo_...,0,1,True,"- ret = ttm_bo_init(&drm->ttm.bdev, &nvbo->bo...","- ret = ttm_bo_init(&drm->ttm.bdev, &nvbo->bo...","--- initial\n+++ final\n@@ -82,11 +82,11 @@\n ...",./ttm_bo_init-60/final_response_0.spinfer.cocci
9,./uartlite,final_response_0.spinfer.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,0,0,True,- if (!(ioread32be(port->membase + ULITE_STAT...,,,./uartlite/final_response_0.spinfer.cocci


In [31]:
df["diff_pred"] = df.apply(lambda x:get_diff(
    file_a = os.path.join(x.root, x.c_filename + ".c.sanitized.c"),
    file_b = os.path.join(x.root, x.c_filename + f".{x.pred}.{APPROACH}.res.c.sanitized.res.c"),
    n_context = 0
), axis=1)

In [32]:
df["diff_5"] = df.apply(lambda x:get_diff(
    file_a = os.path.join(x.root, x.c_filename + ".c.sanitized.c"),
    file_b = os.path.join(x.root, x.c_filename + f".{x.pred}.{APPROACH}.res.c.sanitized.res.c"),
    n_context = 5
), axis=1)

In [33]:
df

Unnamed: 0,root,cocci_filename,c_filename,pred,is_cocci_valid,is_applied,diff_truth,diff_pred,diff_5,cocci_fullpath
0,./EXP0-7,final_response_0.spinfer.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,0,1,True,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...,"--- initial\n+++ final\n@@ -13,11 +13,11 @@\n ...",./EXP0-7/final_response_0.spinfer.cocci
1,./EXP0-7,final_response_0.spinfer.cocci,1533533124_2018-08-06_4fd786e6c3d6_disk-io___s...,0,1,True,- root->objectid = objectid;\n+ root->root_k...,- root->objectid = objectid;\n+ root->root_k...,"--- initial\n+++ final\n@@ -3,11 +3,11 @@\n ...",./EXP0-7/final_response_0.spinfer.cocci
2,./tcf_block_get-61,final_response_0.spinfer.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr...,0,0,True,"- err = tcf_block_get(&q->block, &q->filter_l...",,,./tcf_block_get-61/final_response_0.spinfer.cocci
3,./tcf_block_get-61,final_response_0.spinfer.cocci,1513791319_2017-12-20_8d1a77f974ca_sch_htb_htb...,0,0,True,"- err = tcf_block_get(&q->block, &q->filter_l...",,,./tcf_block_get-61/final_response_0.spinfer.cocci
4,./sock_poll_wait-84,final_response_0.spinfer.cocci,1540294839_2018-10-23_89ab066d4229_tcp_tcp_poll,0,1,True,"- sock_poll_wait(file, wait);\n+ sock_poll_w...","- sock_poll_wait(file, wait);\n+ sock_poll_w...","--- initial\n+++ final\n@@ -2,11 +2,11 @@\n {\...",./sock_poll_wait-84/final_response_0.spinfer.c...
5,./random_ether_addr-84,final_response_0.spinfer.cocci,1529689860_2018-06-22_6c1f0a1ffb7c_gemini_gemi...,0,1,True,- random_ether_addr(netdev->dev_addr);\n+...,- random_ether_addr(netdev->dev_addr);\n+...,"--- initial\n+++ final\n@@ -104,11 +104,11 @@\...",./random_ether_addr-84/final_response_0.spinfe...
6,./snd_soc,final_response_0.spinfer.cocci,1524629994_2018-04-25_e4b31b816c47_mt2701-afe-...,0,0,True,- struct snd_soc_pcm_runtime *rtd = substream...,,,./snd_soc/final_response_0.spinfer.cocci
7,./snd_soc,final_response_0.spinfer.cocci,1524629994_2018-04-25_e4b31b816c47_mtk-afe-fe-...,0,0,True,- struct snd_soc_component *component = snd_s...,,,./snd_soc/final_response_0.spinfer.cocci
8,./ttm_bo_init-60,final_response_0.spinfer.cocci,1519311151_2018-02-22_724daa4fd65d_nouveau_bo_...,0,1,True,"- ret = ttm_bo_init(&drm->ttm.bdev, &nvbo->bo...","- ret = ttm_bo_init(&drm->ttm.bdev, &nvbo->bo...","--- initial\n+++ final\n@@ -82,11 +82,11 @@\n ...",./ttm_bo_init-60/final_response_0.spinfer.cocci
9,./uartlite,final_response_0.spinfer.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,0,0,True,- if (!(ioread32be(port->membase + ULITE_STAT...,,,./uartlite/final_response_0.spinfer.cocci


In [34]:
def compute_precision(truths: list, results: list)->int:
    '''
    Computes the precision of the results compared to the truths, rounding the result to three decimal places.
    '''
    truths_cp = truths.copy()
    is_match = []
    for line in results:
        if line in truths_cp:
            truths_cp.remove(line)
            is_match.append(1)
        else:
            is_match.append(0)
    precision = sum(is_match)/len(is_match) if len(is_match) != 0 else 0
    return round(precision, 3)

In [35]:
df["precision"] = df.apply(lambda x: compute_precision(
    x.diff_truth.splitlines(),
    x.diff_pred.splitlines()
), axis=1)

In [36]:
def compute_recall(truths: list, results: list)->int:
    '''
    Computes the recall of the results compared to the truths, rounding the result to three decimal places.
    '''
    results_cp = results.copy()
    is_match = []
    for line in truths:
        if line in results_cp:
            results_cp.remove(line)
            is_match.append(1)
        else:
            is_match.append(0)
    recall = sum(is_match)/len(is_match) if len(is_match) != 0 else 0
    return round(recall, 3)

In [37]:
df["recall"] = df.apply(lambda x: compute_recall(
    x.diff_truth.splitlines(),
    x.diff_pred.splitlines()
), axis=1)

In [38]:
df["cocci_fullpath"] = df.apply(lambda x: os.path.join(x.root, x.cocci_filename), axis=1)

In [39]:
df.groupby(["root", "cocci_filename", "cocci_fullpath"])[["precision", "recall" ,"is_cocci_valid"]].mean().round(2).reset_index()

Unnamed: 0,root,cocci_filename,cocci_fullpath,precision,recall,is_cocci_valid
0,./EXP0-7,final_response_0.spinfer.cocci,./EXP0-7/final_response_0.spinfer.cocci,1.0,1.0,1.0
1,./dasd_smalloc,final_response_0.spinfer.cocci,./dasd_smalloc/final_response_0.spinfer.cocci,0.75,0.75,1.0
2,./dma_pool_alloc-52,final_response_0.spinfer.cocci,./dma_pool_alloc-52/final_response_0.spinfer.c...,1.0,1.0,1.0
3,./early_memunmap,final_response_0.spinfer.cocci,./early_memunmap/final_response_0.spinfer.cocci,1.0,1.0,1.0
4,./free_bootmem-77,final_response_0.spinfer.cocci,./free_bootmem-77/final_response_0.spinfer.cocci,1.0,1.0,1.0
5,./kees_timer1,final_response_0.spinfer.cocci,./kees_timer1/final_response_0.spinfer.cocci,1.0,1.0,1.0
6,./perf_evlist__mmap-69,final_response_0.spinfer.cocci,./perf_evlist__mmap-69/final_response_0.spinfe...,1.0,1.0,1.0
7,./random_ether_addr-84,final_response_0.spinfer.cocci,./random_ether_addr-84/final_response_0.spinfe...,1.0,1.0,1.0
8,./snd_soc,final_response_0.spinfer.cocci,./snd_soc/final_response_0.spinfer.cocci,0.0,0.0,0.0
9,./sock_poll_wait-84,final_response_0.spinfer.cocci,./sock_poll_wait-84/final_response_0.spinfer.c...,1.0,1.0,1.0


In [40]:
temp = df.groupby(["root", "cocci_filename", "cocci_fullpath"])[["precision", "recall" ,"is_cocci_valid"]].mean().round(2).reset_index()
for index, row in temp.iterrows():
    # Combine root path and filename
    file_path = os.path.join(row['root'], row['cocci_filename'])
    
    # Try to open and print the file contents
    try:
        with open(file_path, 'r') as file:
            print(f"\n--- Contents of file: {file_path} ---")
            print(file.read())
            print("--- End of file ---\n")
    except Exception as e:
        print(f"Error opening file {file_path}: {e}")


--- Contents of file: ./EXP0-7/final_response_0.spinfer.cocci ---
@@
expression E0, E1;
@@
- E0->objectid = E1; 
+ E0->root_key.objectid = E1; 
// Infered from: (./EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_disk-io___setup_root.{c.sanitized.c,res.c.sanitized.res.c}: __setup_root)
// Recall: 0.50, Precision: 1.00, Matching recall: 1.00

// ---------------------------------------------
@@
expression E0, E1;
@@
- btrfs_qgroup_free_refroot(BTRFS_I(E0)->root->fs_info,                           BTRFS_I(E0)->root->objectid, E1.bytes_changed,                           BTRFS_QGROUP_RSV_DATA); 
+ btrfs_qgroup_free_refroot(BTRFS_I(E0)->root->fs_info,                           BTRFS_I(E0)->root->root_key.objectid,                           E1.bytes_changed, BTRFS_QGROUP_RSV_DATA); 
// Infered from: (./EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_qgroup___btrfs_qgroup_release_data.{c.sanitized.c,res.c.sanitized.res.c}: __btrfs_qgroup_release_data)
// Recall: 0.50, Precision: 1.00, Matching recall: 1.0

# check final example coverage

- Here, we only care about the precision and recall
    - If tie, then choose the highest one


- the selection of the best patch in the previous step guarantee that the patch will also yield similar result as the choosing the highest precision and recall 
    - note that we also use precision and recall as one of the criterion when selecting the best patch
    - but, the selection process is more refined (i check the overfitting issue)

In [41]:
df.c_filename.nunique()

20

In [42]:
df['rank_score'] = (df['precision'] + df['recall'] + df["is_cocci_valid"]) / 3
best_preds_per_cluster = df.loc[df.groupby(['c_filename'])['rank_score'].idxmax()]
best_clusters = best_preds_per_cluster.loc[
    best_preds_per_cluster.groupby('c_filename')['rank_score'].idxmax()
]
best_clusters = best_clusters.drop(columns=['rank_score'])

In [43]:
best_clusters.shape

(20, 12)

In [44]:
best_clusters

Unnamed: 0,root,cocci_filename,c_filename,pred,is_cocci_valid,is_applied,diff_truth,diff_pred,diff_5,cocci_fullpath,precision,recall
9,./uartlite,final_response_0.spinfer.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,0,0,True,- if (!(ioread32be(port->membase + ULITE_STAT...,,,./uartlite/final_response_0.spinfer.cocci,0.0,0.0
11,./early_memunmap,final_response_0.spinfer.cocci,1424769208_2015-02-24_8d4a40bc0651_setup_reloc...,0,1,True,"- early_iounmap(p, clen + slop);\n+ ...","- early_iounmap(p, clen + slop);\n+ ...","--- initial\n+++ final\n@@ -24,11 +24,11 @@\n ...",./early_memunmap/final_response_0.spinfer.cocci,1.0,1.0
2,./tcf_block_get-61,final_response_0.spinfer.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr...,0,0,True,"- err = tcf_block_get(&q->block, &q->filter_l...",,,./tcf_block_get-61/final_response_0.spinfer.cocci,0.0,0.0
14,./kees_timer1,final_response_0.spinfer.cocci,1508184939_2017-10-16_b9eaf1872222_dasd_dasd_a...,0,1,True,- init_timer(&device->timer);\n- device->tim...,- init_timer(&device->timer);\n- device->tim...,"--- initial\n+++ final\n@@ -23,13 +23,11 @@\n ...",./kees_timer1/final_response_0.spinfer.cocci,1.0,1.0
12,./kees_timer1,final_response_0.spinfer.cocci,1508184939_2017-10-16_b9eaf1872222_fas216_fas2...,0,1,True,- init_timer(&info->eh_timer);\n- info->eh_t...,- init_timer(&info->eh_timer);\n- info->eh_t...,"--- initial\n+++ final\n@@ -9,13 +9,11 @@\n ...",./kees_timer1/final_response_0.spinfer.cocci,1.0,1.0
13,./kees_timer1,final_response_0.spinfer.cocci,1508184939_2017-10-16_b9eaf1872222_isdn_common...,0,1,True,- init_timer(&dev->timer);\n- dev->timer.fun...,- init_timer(&dev->timer);\n- dev->timer.fun...,"--- initial\n+++ final\n@@ -6,12 +6,11 @@\n ...",./kees_timer1/final_response_0.spinfer.cocci,1.0,1.0
10,./perf_evlist__mmap-69,final_response_0.spinfer.cocci,1512266437_2017-12-03_f74b9d3a1ac2_bpf_do_test,0,1,True,"- err = perf_evlist__mmap(evlist, opts.mmap_p...","- err = perf_evlist__mmap(evlist, opts.mmap_p...","--- initial\n+++ final\n@@ -49,11 +49,11 @@\n ...",./perf_evlist__mmap-69/final_response_0.spinfe...,1.0,1.0
3,./tcf_block_get-61,final_response_0.spinfer.cocci,1513791319_2017-12-20_8d1a77f974ca_sch_htb_htb...,0,0,True,"- err = tcf_block_get(&q->block, &q->filter_l...",,,./tcf_block_get-61/final_response_0.spinfer.cocci,0.0,0.0
15,./dma_pool_alloc-52,final_response_0.spinfer.cocci,1518632953_2018-02-14_ddd05979f89c_bdc_ep_ep_b...,0,1,True,- bd_table->start_bd = dma_pool_alloc(bdc...,- bd_table->start_bd = dma_pool_alloc(bdc...,"--- initial\n+++ final\n@@ -21,20 +21,19 @@\n ...",./dma_pool_alloc-52/final_response_0.spinfer.c...,1.0,1.0
8,./ttm_bo_init-60,final_response_0.spinfer.cocci,1519311151_2018-02-22_724daa4fd65d_nouveau_bo_...,0,1,True,"- ret = ttm_bo_init(&drm->ttm.bdev, &nvbo->bo...","- ret = ttm_bo_init(&drm->ttm.bdev, &nvbo->bo...","--- initial\n+++ final\n@@ -82,11 +82,11 @@\n ...",./ttm_bo_init-60/final_response_0.spinfer.cocci,1.0,1.0


In [50]:
round(best_clusters.precision.mean(), 2), round(best_clusters.recall.mean(), 2), round(best_clusters.drop_duplicates(subset=["root"]).is_cocci_valid.mean(), 2)

(np.float64(0.72), np.float64(0.72), np.float64(0.79))

In [51]:
best_clusters.drop_duplicates(subset=["root"]).shape

(14, 12)

In [52]:
mean_df = best_clusters.groupby(["root"])[["is_cocci_valid", "precision", "recall"]].mean().round(2)
mean_df

Unnamed: 0_level_0,is_cocci_valid,precision,recall
root,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
./EXP0-7,1.0,1.0,1.0
./dasd_smalloc,1.0,0.75,0.75
./dma_pool_alloc-52,1.0,1.0,1.0
./early_memunmap,1.0,1.0,1.0
./free_bootmem-77,1.0,1.0,1.0
./kees_timer1,1.0,1.0,1.0
./perf_evlist__mmap-69,1.0,1.0,1.0
./random_ether_addr-84,1.0,1.0,1.0
./snd_soc,0.0,0.0,0.0
./sock_poll_wait-84,1.0,1.0,1.0


In [53]:
# Step 1: Group the data by 'root' and 'cocci_filename'
grouped = best_clusters.groupby(['root'])

# Step 2: Compute total samples per group
group_counts = grouped.size().reset_index(name='num_samples')
temp_df = best_clusters.merge(group_counts, on=['root'])

total_samples = temp_df['num_samples'].sum()
weighted_precision = (temp_df['precision'] * temp_df['num_samples']).sum() / total_samples
weighted_recall = (temp_df['recall'] * temp_df['num_samples']).sum() / total_samples


weighted_precision, weighted_recall

(np.float64(0.7058823529411765), np.float64(0.7058823529411765))

In [54]:
mean_df.reset_index(inplace=True)
mean_df["root"] = mean_df["root"].apply(lambda x: x.split("/")[-1])
mean_df.to_csv("result_perdir_claude.csv", index=False)