In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Parse the notebook settings from config.yaml in the current working directory.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# load_dotenv() runs before the directory change below. After that, the
# directory named by config["pythonpath"] becomes the working directory, so
# relative paths in later cells resolve from there.
dotenv.load_dotenv()
project_root = Path(config["pythonpath"]).expanduser()
os.chdir(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
VARIATION = "nocomments"
VERSION = config["version"]


def _filetype_label(extension):
    """Translate a raw ``filetype`` value into the label kept in the frame."""
    if extension == 'cry':
        return 'cryptol'
    if extension == 'saw':
        return 'saw'
    # Anything that is neither Cryptol nor SAW is treated as plain text.
    return 'text'


# Cleaned source files for this variation/version, one JSON record per line.
source_code_df = pd.read_json(f"data/clean_datasets/{VARIATION}_{VERSION}.jsonl", lines=True)
source_code_df['filetype'] = source_code_df['filetype'].apply(_filetype_label)

# Per-file MCC statistics for the same variation/version.
mcc_df = pd.read_json(f"data/clean_datasets/mcc_stats_{VARIATION}_{VERSION}.jsonl", lines=True)

# Only carry over the statistics columns that the source frame lacks, so the
# merge cannot produce suffixed duplicate columns.
add_columns = mcc_df.columns.difference(source_code_df.columns).tolist()
print(add_columns)

mcc_extra_columns = mcc_df[["filename", *add_columns]]
source_code_df = source_code_df.merge(
    mcc_extra_columns,
    how="left",
    on="filename",
    validate="one_to_one",  # raise if a filename repeats on either side
)
#source_code_df.head()


['avg_mcc', 'avg_mcc_difficulty_bucket', 'imports', 'imports_count', 'json_path', 'max_mcc', 'num_declarations', 'num_declarations_with_mcc', 'num_definitions', 'num_types', 'split', 'total_mcc']


In [4]:
from src.agent.generate import ModelConfig, TemplateBundle, iter_call_pydantic_ai, build_prompt_call_pydantic_ai

# JSONL cache file for this variation/version; its directory is created on
# first use so that later writes do not fail on a missing folder.
file_cache_path = Path(f"cache/alpaca_instruct_cache/SFT_{VARIATION}_source_code_{VERSION}.jsonl")
cache_dir = file_cache_path.parent
cache_dir.mkdir(parents=True, exist_ok=True)

# Model selection and retry count.
modelConfig = ModelConfig(model="openai:gpt-5.1", retries=2)

# Template file names for the system and user sides of the prompt.
template = TemplateBundle(spec_system="system_spec_simple_gen.j2", spec_user="user_spec_simple_gen.j2")


# TODO: do something with the masked result.

In [8]:
from src.data_s.util import mask_declaration_in_source
from src.data_s.mcc_tools import  load_json, get_hardest_definition





# Number of leading rows to preview. Printing the masked source of every file
# would flood the cell output.
PREVIEW_ROWS = 7

# Slice with head() instead of breaking on an index label: a label check only
# bounds the loop while the frame still carries its default 0..n-1 index, and
# would misfire after any filtering, sampling or re-sorting.
for idx, row in source_code_df.head(PREVIEW_ROWS).iterrows():
    # Per-file MCC data is stored separately and located through "json_path".
    mcc = load_json(row["json_path"])
    print(f"{row['filename']}\n{'=' * 80}")

    # Mask the definition that get_hardest_definition selects for this file.
    definition = get_hardest_definition(mcc)
    masked_result = mask_declaration_in_source(
        source=row["content"],
        name=definition["name"],
        params=definition.get("params", []),  # some entries may lack "params"
        hole_name="/* Finish this definition. */",
    )
    print(f"Masked Source:\n{masked_result.masked_source}")
    print(f"Definition:\n{masked_result.removed_definition}")

    print(f"{'=' * 80}")



AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry
Masked Source:
module AES where

import `Common::AES

type constraint ValidKey k m = (k == 128 + m * 64, 2 >= m)

type ExpandedKey m = KeySchedule m

encrypt : {k,m} ValidKey k m => [k] -> [128] -> [128]
encrypt = /* Finish this definition. */

expandKey : {k,m} ValidKey k m => [k] -> ExpandedKey m
expandKey = ExpandKey`{Mode = m}

encryptWithSchedule : {k,m} ValidKey k m => ExpandedKey m -> [128] -> [128]
encryptWithSchedule = aesEncryptWithSchedule`{Mode = m}

property test k pt = encrypt k pt == encryptWithSchedule (expandKey k) pt
Definition:
encrypt = aesEncrypt`{Mode = m}

AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry
Masked Source:
module AES128 where

import `Common::AES
import `Common::AES as AES
import AES as AES

type State         = AES::State 0  
type KeySchedule   = AES::KeySchedule 0
type RoundKey      = AES::RoundKey 0

sub_bytes128 : State -> State
sub_bytes128 = SubBytes`{0}

shift_rows128 : State -> State
shift_rows128

In [None]:
# Print the masked source held by the current `masked_result`.
# NOTE(review): `masked_result` is not defined in this cell. It is whatever the
# masking loop in an earlier cell assigned last, so this only works after that
# loop has run and reflects its final iteration.
print(masked_result.masked_source)

module AES where

import `Common::AES

type constraint ValidKey k m = (k == 128 + m * 64, 2 >= m)

type ExpandedKey m = KeySchedule m

encrypt : {k,m} ValidKey k m => [k] -> [128] -> [128]
encrypt = aesEncrypt`{Mode = m}

expandKey : {k,m} ValidKey k m => [k] -> ExpandedKey m
expandKey = ExpandKey`{Mode = m}

encryptWithSchedule : {k,m} ValidKey k m => ExpandedKey m -> [128] -> [128]
encryptWithSchedule = /* Finish this definition. */

property test k pt = encrypt k pt == encryptWithSchedule (expandKey k) pt


In [None]:
from src.data_s.mcc_tools import  *

# Number of leading rows to inspect; enough for a sanity check without
# dumping every file's record.
PREVIEW_ROWS = 7

# Bound the loop by row count via head(). Breaking on an index label stops at
# the right place only while the frame keeps its default 0..n-1 index.
for idx, row in source_code_df.head(PREVIEW_ROWS).iterrows():
    mcc = load_json(row["json_path"])
    print(f"{row['filename']}\n{'=' * 80}")
    # Raw record chosen by get_hardest_definition for this file.
    print(get_hardest_definition(mcc))
    print(f"{'=' * 80}")

AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry
{'kind': 'declaration', 'locals': [], 'mcc': {'edges': [{'from': 0, 'kind': 'control', 'label': None, 'to': 1}, {'from': 1, 'kind': 'control', 'label': None, 'to': 2}], 'entry': 0, 'exit': 2, 'nodes': [{'details': [], 'id': 0, 'kind': 'entry', 'label': 'encrypt'}, {'details': [], 'id': 1, 'kind': 'op', 'label': 'aesEncrypt'}, {'details': [], 'id': 2, 'kind': 'exit', 'label': 'exit'}]}, 'name': 'encrypt', 'params': [], 'references': [{'op': 'ValidKey'}, {'op': 'aesEncrypt'}, {'op': 'k'}, {'op': 'm'}], 'signature': '{k, m} (ValidKey k m) => [k] -> [128] -> [128]'}
AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry
{'kind': 'declaration', 'locals': [], 'mcc': {'edges': [{'from': 0, 'kind': 'control', 'label': None, 'to': 1}, {'from': 1, 'kind': 'control', 'label': None, 'to': 2}], 'entry': 0, 'exit': 2, 'nodes': [{'details': [], 'id': 0, 'kind': 'entry', 'label': 'aes_final_round128'}, {'details': [], 'id': 1, 'kind': 'import', 'label': 'AES::ae

[]

In [None]:
# Rich-display the first five rows of the merged frame as a quick sanity check.
source_code_df.head(n=5)

Unnamed: 0,filename,filetype,content,variant,n_imports_original,n_imports_final,avg_mcc,avg_mcc_difficulty_bucket,imports,imports_count,json_path,max_mcc,num_declarations,num_declarations_with_mcc,num_definitions,num_types,split,total_mcc
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,module AES where\n\nimport `Common::AES\n\ntyp...,without_comments,,,1.0,1,[`Common::AES],1,/Users/josh/Automated_Reasoning_for_Cryptograp...,1,4,4,5,1,train,4
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,module AES128 where\n\nimport `Common::AES\nim...,without_comments,,,1.0,1,"[`Common::AES, `Common::AES as AES, AES as AES]",3,/Users/josh/Automated_Reasoning_for_Cryptograp...,1,6,6,9,3,test,6
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,module AES256 where\n\nimport `Common::AES\nim...,without_comments,,,1.0,1,"[`Common::AES, `Common::AES as AES, AES as AES]",3,/Users/josh/Automated_Reasoning_for_Cryptograp...,1,6,6,9,3,train,6
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,type Nb = 4\ntype State = [4][Nb]...,without_comments,,,1.352941,4,[],0,/Users/josh/Automated_Reasoning_for_Cryptograp...,3,17,17,21,4,train,23
4,AES-GCM-SIV-proof/proof/cryptol-specs/Common/A...,cryptol,module Common::AES where\n\nparameter\n type ...,without_comments,,,1.358491,4,[],0,/Users/josh/Automated_Reasoning_for_Cryptograp...,3,53,53,61,8,train,72
