In [1]:
import pandas as pd
import os
import difflib
import subprocess

In [2]:
APPROACH = "claude"

In [3]:
def collect_cocci_files(path: str, approach: str)->list:
    outputs = []
    for root, dirnames, files in os.walk(path):
        for f in files:
            if f.endswith(f".{approach}.cocci"):
                # f = f.split(".")[:-1]
                # f = ".".join(f)
                outputs.append(
                    {
                        "root": root,
                        "cocci_filename": f
                    }
                )
    return pd.DataFrame(outputs)

In [4]:
PATH = "."
df = collect_cocci_files(PATH, APPROACH)

In [5]:
df.cocci_filename.value_counts()

cocci_filename
final_response_2.claude.cocci    14
final_response_0.claude.cocci    14
final_response_1.claude.cocci    14
Name: count, dtype: int64

In [6]:
def collect_c_files(path: str)->list:
    outputs = []
    for root, dirnames, files in os.walk(path):
        for f in files:
            if f.endswith(f".res.c.sanitized.res.c") and "gpt" not in f and "spinfer" not in f and "deepseek" not in f and "claude" not in f:
                f = f.split(".")[0]
                outputs.append(
                    {
                        "root": root,
                        "c_filename": f
                    }
                )
    return pd.DataFrame(outputs)

In [7]:
df_c = collect_c_files(PATH)
df_c.root.value_counts()

root
./kees_timer1             9
./EXP0-7                  6
./snd_soc                 6
./tcf_block_get-61        6
./dasd_smalloc            6
./sock_poll_wait-84       3
./ttm_bo_init-60          3
./random_ether_addr-84    3
./perf_evlist__mmap-69    3
./uartlite                3
./early_memunmap          3
./dma_pool_alloc-52       3
./free_bootmem-77         3
./tcaction                3
Name: count, dtype: int64

In [8]:
df_c.c_filename.to_list()

['1533533124_2018-08-06_4fd786e6c3d6_qgroup___btrfs_qgroup_release_data',
 '1533533124_2018-08-06_4fd786e6c3d6_backref_btrfs_check_shared',
 '1533533124_2018-08-06_4fd786e6c3d6_export_btrfs_encode_fh',
 '1533533124_2018-08-06_4fd786e6c3d6_transaction_btrfs_clean_one_deleted_snapshot',
 '1533533124_2018-08-06_4fd786e6c3d6_qgroup_btrfs_qgroup_check_reserved_leak',
 '1533533124_2018-08-06_4fd786e6c3d6_disk-io___setup_root',
 '1507896057_2017-10-13_69d78ef25c7b_sch_atm_atm_tc_change',
 '1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr_init_qdisc',
 '1513791319_2017-12-20_8d1a77f974ca_sch_htb_htb_init',
 '1513791319_2017-12-20_8d1a77f974ca_sch_sfq_sfq_init',
 '1507896057_2017-10-13_69d78ef25c7b_sch_sfb_sfb_init',
 '1513791319_2017-12-20_8d1a77f974ca_sch_drr_drr_init_qdisc',
 '1540294839_2018-10-23_89ab066d4229_llcp_sock_llcp_sock_poll',
 '1540294839_2018-10-23_89ab066d4229_tcp_tcp_poll',
 '1540294839_2018-10-23_89ab066d4229_af_rxrpc_rxrpc_poll',
 '1529689860_2018-06-22_6c1f0a1ffb7c_rmnet_vnd_

In [9]:
for i, row in df_c.sample(n=5, random_state=123).iterrows():
    print(row["root"], row["c_filename"])

./ttm_bo_init-60 1519311151_2018-02-22_724daa4fd65d_qxl_object_qxl_bo_create
./kees_timer1 1508184939_2017-10-16_b9eaf1872222_isdn_common_isdn_init
./random_ether_addr-84 1529689860_2018-06-22_6c1f0a1ffb7c_rmnet_vnd_rmnet_vnd_setup
./perf_evlist__mmap-69 1512266437_2017-12-03_f74b9d3a1ac2_bpf_do_test
./kees_timer1 1508184939_2017-10-16_b9eaf1872222_n2100_n2100_request_gpios


In [10]:
for i, row in df.sample(n=5, random_state=123).iterrows():
    print(row["root"], row["cocci_filename"])

./kees_timer1 final_response_2.claude.cocci
./snd_soc final_response_2.claude.cocci
./random_ether_addr-84 final_response_1.claude.cocci
./free_bootmem-77 final_response_1.claude.cocci
./random_ether_addr-84 final_response_0.claude.cocci


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   root            42 non-null     object
 1   cocci_filename  42 non-null     object
dtypes: object(2)
memory usage: 804.0+ bytes


In [12]:
df_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   root        60 non-null     object
 1   c_filename  60 non-null     object
dtypes: object(2)
memory usage: 1.1+ KB


In [13]:
df = pd.merge(df, df_c, on='root', how='inner')
df.drop_duplicates(inplace=True)
df.shape

(180, 3)

In [14]:
df["pred"] = df["cocci_filename"].apply(lambda x: x.split(".")[0])
df["pred"] = df["pred"].apply(lambda x: x.split("_")[-1])
df.head(20)

Unnamed: 0,root,cocci_filename,c_filename,pred
0,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,2
1,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_backref_btr...,2
2,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_export_btrf...,2
3,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_transaction...,2
4,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup_btrf...,2
5,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_disk-io___s...,2
6,./EXP0-7,final_response_0.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,0
7,./EXP0-7,final_response_0.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_backref_btr...,0
8,./EXP0-7,final_response_0.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_export_btrf...,0
9,./EXP0-7,final_response_0.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_transaction...,0


# check syntax correctness

In [15]:
def check_syntax_correctness(filepath: str)->bool:
    '''
    Validates a Coccinelle semantic patch by parsing it, writes debug information, and returns a boolean indicating the validity.
    '''
    command = f'spatch --parse-cocci {filepath}'.split()

    try:
        result = subprocess.run(command, check=True, capture_output=True)
        return 1
    except Exception as e:
        print(e)
        return 0

In [16]:
df["is_cocci_valid"] = df.apply(lambda x: check_syntax_correctness(os.path.join(x.root, x.cocci_filename)), axis=1)

Command '['spatch', '--parse-cocci', './EXP0-7/final_response_1.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './EXP0-7/final_response_1.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './EXP0-7/final_response_1.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './EXP0-7/final_response_1.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './EXP0-7/final_response_1.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './EXP0-7/final_response_1.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './tcf_block_get-61/final_response_0.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './tcf_block_get-61/final_response_0.claude.cocci']' returned non-zero exit status 255.
Command '['spatch', '--parse-cocci', './tcf_block_get-61/fin

# check how many cocci is successfully applied

In [17]:
df["is_applied"] = df.apply(lambda x:os.path.exists(x.root + "/" + x.c_filename+f".{x.pred}.{APPROACH}.res.c.sanitized.res.c"), axis=1)
df.is_applied.value_counts()

is_applied
False    93
True     87
Name: count, dtype: int64

# compute precision and recall

In [18]:
def open_and_read_file_content(filepath: str)->str:
    """Open the input filepath, then read the content"""
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
        return content
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
    except IOError as e:
        print(f"An error occurred while reading the file: {e}")
    return ""

def remove_unmodified_lines(input_diff: list, is_merge: bool = True)->str:
    additions = []
    removals = []
    for line in input_diff:
        if line.startswith("+") and "+++" not in line:
            additions.append(line)
        elif line.startswith("-") and "---" not in line:
            removals.append(line)
    output_str = removals + additions
    if is_merge:
        return "\n".join(output_str)
    else:
        return removals + additions

def get_diff(file_a: str, file_b: str, n_context: int)->str:
    """Get the diff between file_a and file_b using difflib with n_context length"""
    file_a = open_and_read_file_content(file_a)
    if file_a == "" or "parse error :" in file_a or "init_defs_builtins : " in file_a:
        return ""
    file_b = open_and_read_file_content(file_b)
    if file_b == "" or "parse error :" in file_b or "init_defs_builtins : " in file_b:
        return ""

    lines_a = file_a.splitlines()
    lines_b = file_b.splitlines()        
    # Generate the unified diff
    diff = difflib.unified_diff(
        lines_a,
        lines_b,
        fromfile="initial",
        tofile="final",
        lineterm='',
        n=n_context
    )
    # diff = [x for x in diff if not x.startswith("+++") and not x.startswith("---") and not x.startswith("@@")]
    if n_context == 0:
        diff = remove_unmodified_lines(input_diff=diff, is_merge=False)
    # Convert the diff generator to a single string
    diff_text = '\n'.join(diff)
    return diff_text

In [19]:
df["diff_truth"] = df.apply(lambda x:get_diff(
    file_a = os.path.join(x.root, x.c_filename + ".c.sanitized.c"),
    file_b = os.path.join(x.root, x.c_filename + ".res.c.sanitized.res.c"),
    n_context = 0
), axis=1)

In [20]:
df

Unnamed: 0,root,cocci_filename,c_filename,pred,is_cocci_valid,is_applied,diff_truth
0,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,2,1,True,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...
1,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_backref_btr...,2,1,True,"- .root_objectid = root->objectid,\n+ ..."
2,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_export_btrf...,2,1,True,- fid->root_objectid = BTRFS_I(inode)->root->...
3,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_transaction...,2,1,True,"- btrfs_debug(fs_info, ""cleaner removing %llu..."
4,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup_btrf...,2,1,True,- btrfs_qgroup_free_refroot(BTRFS_I(inode...
...,...,...,...,...,...,...,...
175,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_fba_da...,1,0,False,"- cqr = dasd_smalloc_request(DASD_FBA_MAGIC, ..."
176,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_eckd_d...,1,0,False,"- cqr = dasd_smalloc_request(DASD_ECKD_MAGIC,..."
177,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_fba_da...,1,0,False,"- cqr = dasd_smalloc_request(DASD_FBA_MAGIC, ..."
178,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_eckd_d...,1,0,False,"- cqr = dasd_smalloc_request(DASD_ECKD_MAGIC,..."


In [21]:
df["diff_pred"] = df.apply(lambda x:get_diff(
    file_a = os.path.join(x.root, x.c_filename + ".c.sanitized.c"),
    file_b = os.path.join(x.root, x.c_filename + f".{x.pred}.{APPROACH}.res.c.sanitized.res.c"),
    n_context = 0
), axis=1)

Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_qgroup___btrfs_qgroup_release_data.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_backref_btrfs_check_shared.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_export_btrfs_encode_fh.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_transaction_btrfs_clean_one_deleted_snapshot.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_qgroup_btrfs_qgroup_check_reserved_leak.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_disk-io___setup_root.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './tcf_block_get-61/1507896057_2017-10-13_69d78ef25c7b_sch_atm_atm_tc_change.0.claude.res.c.sanitized.res.c' was not found.
Error: The file './tc

In [22]:
df["diff_5"] = df.apply(lambda x:get_diff(
    file_a = os.path.join(x.root, x.c_filename + ".c.sanitized.c"),
    file_b = os.path.join(x.root, x.c_filename + f".{x.pred}.{APPROACH}.res.c.sanitized.res.c"),
    n_context = 5
), axis=1)

Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_qgroup___btrfs_qgroup_release_data.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_backref_btrfs_check_shared.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_export_btrfs_encode_fh.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_transaction_btrfs_clean_one_deleted_snapshot.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_qgroup_btrfs_qgroup_check_reserved_leak.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './EXP0-7/1533533124_2018-08-06_4fd786e6c3d6_disk-io___setup_root.1.claude.res.c.sanitized.res.c' was not found.
Error: The file './tcf_block_get-61/1507896057_2017-10-13_69d78ef25c7b_sch_atm_atm_tc_change.0.claude.res.c.sanitized.res.c' was not found.
Error: The file './tc

In [23]:
df

Unnamed: 0,root,cocci_filename,c_filename,pred,is_cocci_valid,is_applied,diff_truth,diff_pred,diff_5
0,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup___bt...,2,1,True,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...,- btrfs_qgroup_free_refroot(BTRFS_I(inode)-...,"--- initial\n+++ final\n@@ -13,11 +13,11 @@\n ..."
1,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_backref_btr...,2,1,True,"- .root_objectid = root->objectid,\n+ ...","- .root_objectid = root->objectid,\n+ ...","--- initial\n+++ final\n@@ -7,11 +7,11 @@\n ..."
2,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_export_btrf...,2,1,True,- fid->root_objectid = BTRFS_I(inode)->root->...,- fid->objectid = btrfs_ino(BTRFS_I(inode));\...,"--- initial\n+++ final\n@@ -13,19 +13,19 @@\n ..."
3,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_transaction...,2,1,True,"- btrfs_debug(fs_info, ""cleaner removing %llu...","- btrfs_debug(fs_info, ""cleaner removing %llu...","--- initial\n+++ final\n@@ -9,11 +9,11 @@\n ..."
4,./EXP0-7,final_response_2.claude.cocci,1533533124_2018-08-06_4fd786e6c3d6_qgroup_btrf...,2,1,True,- btrfs_qgroup_free_refroot(BTRFS_I(inode...,- btrfs_qgroup_free_refroot(BTRFS_I(inode...,"--- initial\n+++ final\n@@ -12,9 +12,9 @@\n ..."
...,...,...,...,...,...,...,...,...,...
175,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_fba_da...,1,0,False,"- cqr = dasd_smalloc_request(DASD_FBA_MAGIC, ...",,
176,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_eckd_d...,1,0,False,"- cqr = dasd_smalloc_request(DASD_ECKD_MAGIC,...",,
177,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_fba_da...,1,0,False,"- cqr = dasd_smalloc_request(DASD_FBA_MAGIC, ...",,
178,./dasd_smalloc,final_response_1.claude.cocci,1528132059_2018-06-04_c5205f2ff2be_dasd_eckd_d...,1,0,False,"- cqr = dasd_smalloc_request(DASD_ECKD_MAGIC,...",,


In [24]:
def compute_precision(truths: list, results: list)->int:
    '''
    Computes the precision of the results compared to the truths, rounding the result to three decimal places.
    '''
    truths_cp = truths.copy()
    is_match = []
    for line in results:
        if line in truths_cp:
            truths_cp.remove(line)
            is_match.append(1)
        else:
            is_match.append(0)
    precision = sum(is_match)/len(is_match) if len(is_match) != 0 else 0
    return round(precision, 3)

In [25]:
df["precision"] = df.apply(lambda x: compute_precision(
    x.diff_truth.splitlines(),
    x.diff_pred.splitlines()
), axis=1)

In [26]:
def compute_recall(truths: list, results: list)->int:
    '''
    Computes the recall of the results compared to the truths, rounding the result to three decimal places.
    '''
    results_cp = results.copy()
    is_match = []
    for line in truths:
        if line in results_cp:
            results_cp.remove(line)
            is_match.append(1)
        else:
            is_match.append(0)
    recall = sum(is_match)/len(is_match) if len(is_match) != 0 else 0
    return round(recall, 3)

In [27]:
df["recall"] = df.apply(lambda x: compute_recall(
    x.diff_truth.splitlines(),
    x.diff_pred.splitlines()
), axis=1)

In [28]:
df["cocci_fullpath"] = df.apply(lambda x: os.path.join(x.root, x.cocci_filename), axis=1)

In [29]:
df.groupby(["root", "cocci_filename", "cocci_fullpath"])[["precision", "recall" ,"is_cocci_valid"]].mean().round(2).reset_index()

Unnamed: 0,root,cocci_filename,cocci_fullpath,precision,recall,is_cocci_valid
0,./EXP0-7,final_response_0.claude.cocci,./EXP0-7/final_response_0.claude.cocci,0.42,0.5,1.0
1,./EXP0-7,final_response_1.claude.cocci,./EXP0-7/final_response_1.claude.cocci,0.0,0.0,0.0
2,./EXP0-7,final_response_2.claude.cocci,./EXP0-7/final_response_2.claude.cocci,0.94,1.0,1.0
3,./dasd_smalloc,final_response_0.claude.cocci,./dasd_smalloc/final_response_0.claude.cocci,0.0,0.0,0.0
4,./dasd_smalloc,final_response_1.claude.cocci,./dasd_smalloc/final_response_1.claude.cocci,0.0,0.0,0.0
5,./dasd_smalloc,final_response_2.claude.cocci,./dasd_smalloc/final_response_2.claude.cocci,0.0,0.0,0.0
6,./dma_pool_alloc-52,final_response_0.claude.cocci,./dma_pool_alloc-52/final_response_0.claude.cocci,1.0,1.0,1.0
7,./dma_pool_alloc-52,final_response_1.claude.cocci,./dma_pool_alloc-52/final_response_1.claude.cocci,0.67,0.67,1.0
8,./dma_pool_alloc-52,final_response_2.claude.cocci,./dma_pool_alloc-52/final_response_2.claude.cocci,0.0,0.0,0.0
9,./early_memunmap,final_response_0.claude.cocci,./early_memunmap/final_response_0.claude.cocci,1.0,1.0,1.0


In [30]:
temp = df.groupby(["root", "cocci_filename", "cocci_fullpath"])[["precision", "recall" ,"is_cocci_valid"]].mean().round(2).reset_index()
for index, row in temp.iterrows():
    # Combine root path and filename
    file_path = os.path.join(row['root'], row['cocci_filename'])
    
    # Try to open and print the file contents
    try:
        with open(file_path, 'r') as file:
            print(f"\n--- Contents of file: {file_path} ---")
            print(file.read())
            print("--- End of file ---\n")
    except Exception as e:
        print(f"Error opening file {file_path}: {e}")


--- Contents of file: ./EXP0-7/final_response_0.claude.cocci ---
@@ expression E; @@
- E->objectid
+ E->root_key.objectid
@@ expression E; @@
- E.objectid
+ E.root_key.objectid
--- End of file ---


--- Contents of file: ./EXP0-7/final_response_1.claude.cocci ---
@@
expression E;
@@
(
- btrfs_qgroup_free_refroot(E->root->fs_info, E->root->objectid,
+ btrfs_qgroup_free_refroot(E->root->fs_info, E->root->root_key.objectid,
  ...)
|
- btrfs_debug(E, "cleaner removing %llu", root->objectid);
+ btrfs_debug(E, "cleaner removing %llu", root->root_key.objectid);
)
@@
identifier r;
expression E;
@@
(
- r->objectid
+ r->root_key.objectid
|
- r.objectid
+ r.root_key.objectid
)
@@
identifier i, r;
expression E;
@@
(
struct share_check E = {
-  .root_objectid = r->objectid,
+  .root_objectid = r->root_key.objectid,
   ...
};
|
- E->root_objectid = r->objectid;
+ E->root_objectid = r->root_key.objectid;
|
- E.root_objectid = r->objectid;
+ E.root_objectid = r->root_key.objectid;
)
@@
identifier r;


# inspect the best patch for each cluster then combine them

In [31]:
best_patch = [
f"./EXP0-7/final_response_2.{APPROACH}.cocci",
f"./dasd_smalloc/final_response_0.{APPROACH}.cocci",
f"./dma_pool_alloc-52/final_response_0.{APPROACH}.cocci",
f"./early_memunmap/final_response_0.{APPROACH}.cocci",
f"./free_bootmem-77/final_response_0.{APPROACH}.cocci",
f"./kees_timer1/final_response_2.{APPROACH}.cocci",
f"./perf_evlist__mmap-69/final_response_0.{APPROACH}.cocci",
f"./random_ether_addr-84/final_response_0.{APPROACH}.cocci",
f"./snd_soc/final_response_0.{APPROACH}.cocci",
f"./sock_poll_wait-84/final_response_0.{APPROACH}.cocci",
f"./tcaction/final_response_2.{APPROACH}.cocci",
f"./tcf_block_get-61/final_response_2.{APPROACH}.cocci",
f"./ttm_bo_init-60/final_response_1.{APPROACH}.cocci",
f"./uartlite/final_response_0.{APPROACH}.cocci"
]

In [32]:
len(best_patch)

14

In [33]:
df.root.unique()

array(['./EXP0-7', './tcf_block_get-61', './sock_poll_wait-84',
       './random_ether_addr-84', './snd_soc', './ttm_bo_init-60',
       './uartlite', './perf_evlist__mmap-69', './early_memunmap',
       './kees_timer1', './dma_pool_alloc-52', './free_bootmem-77',
       './tcaction', './dasd_smalloc'], dtype=object)

In [34]:
df[~df.root.isin(df[df.cocci_fullpath.isin(best_patch)].root.unique())].root.unique()

array([], dtype=object)

In [35]:
df[df.cocci_fullpath.isin(best_patch)].shape

(60, 12)

In [36]:
import re
# get multiple semantic patches into 1 list to be merged
folder_to_sp = {}
for path in best_patch:
    folder = path.split("/")[1]
    pattern = r"_c[0-9]+"

    if folder not in folder_to_sp:
        folder_to_sp[folder] = []
    
    with open(path, "r") as f:
        sp = f.read()
    
    folder_to_sp[folder].append(sp)

In [37]:
folder_to_sp

{'EXP0-7': ['@@\nexpression E;\n@@\n// Match and transform root->objectid to root->root_key.objectid\n(\n- E->objectid\n+ E->root_key.objectid\n)'],
 'dasd_smalloc': ['@@ \nexpression magic, count1, count2, device;\nexpression req;\n@@\n(\n// When we have a request parameter available in the context - use it\n- dasd_smalloc_request(magic, count1, count2, device)\n+ dasd_smalloc_request(magic, count1, count2, device, blk_mq_rq_to_pdu(req))\n|\n// For all other cases - use NULL\n- dasd_smalloc_request(magic, count1, count2, device)\n+ dasd_smalloc_request(magic, count1, count2, device, NULL)\n)'],
 'dma_pool_alloc-52': ['@@\nexpression E1, E2, E3;\nexpression ptr;\nexpression size;\n@@\n-ptr = dma_pool_alloc(E1, E2, E3);\n+ptr = dma_pool_zalloc(E1, E2, E3);\n...\nwhen != ptr = E1\n-memset(ptr, 0, size);'],
 'early_memunmap': ['@@\nexpression addr, size;\n@@\n- early_iounmap(addr, size)\n+ early_memunmap(addr, size)'],
 'free_bootmem-77': ['@@\nexpression addr, size;\n@@\n- free_bootmem(a

In [38]:
import shutil
import re

TEST_PATH = "../test/"
FILENAME = f"sp_out.final.{APPROACH}.cocci"
for directory, patches in folder_to_sp.items():
    combined_patch = ""
    for p in patches:
        combined_patch += p
        combined_patch += "\n\n\n"

    # source_filepath = os.path.join(".", directory, FILENAME)
    pattern = r"_c[0-9]+"
    matches = re.findall(pattern, directory)
    if matches:
        directory = directory.split("_")[:-1]
        directory = "_".join(directory)

    target_filepath = os.path.join(TEST_PATH, directory, FILENAME) 
    with open(target_filepath, "w") as f:
        f.write(combined_patch)
    print(f"Saved the final patch to: {target_filepath}")
    # shutil.copy(source_filepath, target_filepath)

Saved the final patch to: ../test/EXP0-7/sp_out.final.claude.cocci
Saved the final patch to: ../test/dasd_smalloc/sp_out.final.claude.cocci
Saved the final patch to: ../test/dma_pool_alloc-52/sp_out.final.claude.cocci
Saved the final patch to: ../test/early_memunmap/sp_out.final.claude.cocci
Saved the final patch to: ../test/free_bootmem-77/sp_out.final.claude.cocci
Saved the final patch to: ../test/kees_timer1/sp_out.final.claude.cocci
Saved the final patch to: ../test/perf_evlist__mmap-69/sp_out.final.claude.cocci
Saved the final patch to: ../test/random_ether_addr-84/sp_out.final.claude.cocci
Saved the final patch to: ../test/snd_soc/sp_out.final.claude.cocci
Saved the final patch to: ../test/sock_poll_wait-84/sp_out.final.claude.cocci
Saved the final patch to: ../test/tcaction/sp_out.final.claude.cocci
Saved the final patch to: ../test/tcf_block_get-61/sp_out.final.claude.cocci
Saved the final patch to: ../test/ttm_bo_init-60/sp_out.final.claude.cocci
Saved the final patch to: ../t

# check final example coverage

- Here, we only care about the precision and recall
    - If tie, then choose the highest one


- the selection of the best patch in the previous step guarantee that the patch will also yield similar result as the choosing the highest precision and recall 
    - note that we also use precision and recall as one of the criterion when selecting the best patch
    - but, the selection process is more refined (i check the overfitting issue)

In [39]:
df.c_filename.nunique()

60

In [40]:
df['rank_score'] = (df['precision'] + df['recall'] + df["is_cocci_valid"]) / 3
best_preds_per_cluster = df.loc[df.groupby(['c_filename'])['rank_score'].idxmax()]
best_clusters = best_preds_per_cluster.loc[
    best_preds_per_cluster.groupby('c_filename')['rank_score'].idxmax()
]
best_clusters = best_clusters.drop(columns=['rank_score'])

In [41]:
best_clusters.shape

(60, 12)

In [42]:
best_clusters

Unnamed: 0,root,cocci_filename,c_filename,pred,is_cocci_valid,is_applied,diff_truth,diff_pred,diff_5,precision,recall,cocci_fullpath
81,./uartlite,final_response_2.claude.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,2,1,True,- if (!(ioread32be(port->membase + ULITE_STAT...,- if (!(ioread32be(port->membase + ULITE_STAT...,"--- initial\n+++ final\n@@ -1,6 +1,6 @@\n stat...",1.0,1.0,./uartlite/final_response_2.claude.cocci
83,./uartlite,final_response_2.claude.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,2,1,True,- int stat = ioread32be(port->membase + U...,- int stat = ioread32be(port->membase + U...,"--- initial\n+++ final\n@@ -2,11 +2,11 @@\n {\...",1.0,1.0,./uartlite/final_response_2.claude.cocci
82,./uartlite,final_response_2.claude.cocci,1360605874_2013-02-11_6d53c3b71d32_uartlite_ul...,2,1,True,- ch = ioread32be(port->membase + ULITE_R...,- ch = ioread32be(port->membase + ULITE_R...,"--- initial\n+++ final\n@@ -7,11 +7,11 @@\n ...",1.0,1.0,./uartlite/final_response_2.claude.cocci
100,./early_memunmap,final_response_2.claude.cocci,1424769208_2015-02-24_8d4a40b_setup_relocate_i...,2,1,True,"- early_iounmap(p, clen + slop);\n+ ...","- early_iounmap(p, clen + slop);\n+ ...","--- initial\n+++ final\n@@ -24,11 +24,11 @@\n ...",1.0,1.0,./early_memunmap/final_response_2.claude.cocci
99,./early_memunmap,final_response_2.claude.cocci,1424769208_2015-02-24_8d4a40bc0651_e820_parse_...,2,1,True,"- early_iounmap(sdata, data_len);\n+ early_m...","- early_iounmap(sdata, data_len);\n+ early_m...","--- initial\n+++ final\n@@ -6,9 +6,9 @@\n sd...",1.0,1.0,./early_memunmap/final_response_2.claude.cocci
101,./early_memunmap,final_response_2.claude.cocci,1424769208_2015-02-24_8d4a40bc0651_setup_reloc...,2,1,True,"- early_iounmap(p, clen + slop);\n+ ...","- early_iounmap(p, clen + slop);\n+ ...","--- initial\n+++ final\n@@ -24,11 +24,11 @@\n ...",1.0,1.0,./early_memunmap/final_response_2.claude.cocci
18,./tcf_block_get-61,final_response_2.claude.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_atm_atm...,2,1,True,"- error = tcf_block_get(&flow->block, &flow->...","- error = tcf_block_get(&flow->block, &flow->...","--- initial\n+++ final\n@@ -87,11 +87,11 @@\n ...",0.5,0.5,./tcf_block_get-61/final_response_2.claude.cocci
19,./tcf_block_get-61,final_response_2.claude.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_drr_drr...,2,1,True,"- err = tcf_block_get(&q->block, &q->filter_l...","- err = tcf_block_get(&q->block, &q->filter_l...","--- initial\n+++ final\n@@ -1,10 +1,10 @@\n st...",0.5,0.5,./tcf_block_get-61/final_response_2.claude.cocci
22,./tcf_block_get-61,final_response_2.claude.cocci,1507896057_2017-10-13_69d78ef25c7b_sch_sfb_sfb...,2,1,True,"- err = tcf_block_get(&q->block, &q->filter_l...","- err = tcf_block_get(&q->block, &q->filter_l...","--- initial\n+++ final\n@@ -1,10 +1,10 @@\n st...",0.5,0.5,./tcf_block_get-61/final_response_2.claude.cocci
114,./kees_timer1,final_response_2.claude.cocci,1508184939_2017-10-16_b9eaf1872222_arcmsr_hba_...,2,1,True,- init_timer(&acb->eternal_timer);\n- acb->e...,,,0.0,0.0,./kees_timer1/final_response_2.claude.cocci


In [43]:
round(best_clusters.precision.mean(), 2), round(best_clusters.recall.mean(), 2), round(best_clusters.drop_duplicates(subset=["root"]).is_cocci_valid.mean(), 2)

(np.float64(0.67), np.float64(0.68), np.float64(0.79))

In [44]:
mean_df = best_clusters.groupby(["root"])[["is_cocci_valid", "precision", "recall"]].mean().round(2)
mean_df

Unnamed: 0_level_0,is_cocci_valid,precision,recall
root,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
./EXP0-7,1.0,0.94,1.0
./dasd_smalloc,0.0,0.0,0.0
./dma_pool_alloc-52,1.0,1.0,1.0
./early_memunmap,1.0,1.0,1.0
./free_bootmem-77,1.0,1.0,1.0
./kees_timer1,1.0,0.67,0.67
./perf_evlist__mmap-69,1.0,1.0,1.0
./random_ether_addr-84,1.0,1.0,1.0
./snd_soc,0.0,0.0,0.0
./sock_poll_wait-84,1.0,1.0,1.0


In [45]:
# Step 1: Group the data by 'root' and 'cocci_filename'
grouped = best_clusters.groupby(['root'])

# Step 2: Compute total samples per group
group_counts = grouped.size().reset_index(name='num_samples')
temp_df = best_clusters.merge(group_counts, on=['root'])

total_samples = temp_df['num_samples'].sum()
weighted_precision = (temp_df['precision'] * temp_df['num_samples']).sum() / total_samples
weighted_recall = (temp_df['recall'] * temp_df['num_samples']).sum() / total_samples


weighted_precision, weighted_recall

(np.float64(0.6111176470588235), np.float64(0.6176470588235294))

In [46]:
mean_df.reset_index(inplace=True)
mean_df["root"] = mean_df["root"].apply(lambda x: x.split("/")[-1])
mean_df.to_csv("result_perdir_claude.csv", index=False)