In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Parse the notebook configuration from config.yaml in the current directory.
config = yaml.safe_load(Path("config.yaml").read_text())

# Load environment variables through python-dotenv (default .env discovery).
# This runs before the directory change below, as in the original ordering.
dotenv.load_dotenv()

# Move the working directory to the configured root ("~" is expanded first);
# later cells read relative paths such as data/clean_datasets/... from there.
project_root = Path(config["pythonpath"]).expanduser()
os.chdir(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
# Dataset variant and version; together they select the JSONL files read below.
VARIATION = "nocomments"
VERSION = config["version"]


def _filetype_label(extension):
    """Translate a raw ``filetype`` value into the label kept in the frame.

    'cry' becomes 'cryptol', 'saw' stays 'saw', anything else becomes 'text'.
    """
    if extension == 'cry':
        return 'cryptol'
    if extension == 'saw':
        return 'saw'
    return 'text'


source_code_path = f"data/clean_datasets/{VARIATION}_{VERSION}.jsonl"
source_code_df = pd.read_json(source_code_path, lines=True)
source_code_df['filetype'] = source_code_df['filetype'].apply(_filetype_label)

mcc_stats_path = f"data/clean_datasets/mcc_stats_{VARIATION}_{VERSION}.jsonl"
mcc_df = pd.read_json(mcc_stats_path, lines=True)

# Columns that exist only in the MCC stats frame; printed so the reader can
# see exactly what the merge adds.
add_columns = mcc_df.columns.difference(source_code_df.columns).tolist()
print(add_columns)

# Left-join those columns by filename. validate="one_to_one" makes pandas
# raise if a filename is repeated in either frame.
mcc_extra = mcc_df[["filename"] + add_columns]
source_code_df = source_code_df.merge(
    mcc_extra, on="filename", how="left", validate="one_to_one"
)


['avg_mcc', 'avg_mcc_difficulty_bucket', 'imports', 'imports_count', 'json_path', 'max_mcc', 'num_declarations', 'num_declarations_with_mcc', 'num_definitions', 'num_types', 'split', 'total_mcc']


In [4]:
from src.data_s.util import build_masked_examples_df

# How many leading rows of source_code_df are turned into masked examples.
# NOTE(review): presumably kept small to bound downstream work -- raise it
# for a full run.
N_SOURCE_FILES = 25

masked_df = build_masked_examples_df(source_code_df=source_code_df.iloc[:N_SOURCE_FILES])

# Attach per-file MCC statistics. masked_df may hold more than one row per
# filename, so the join is many-to-one; validate makes pandas raise if a
# filename repeats in mcc_df instead of silently duplicating masked rows.
masked_df = masked_df.merge(
    mcc_df[["filename", "num_declarations", "total_mcc"]],
    on="filename",
    how="left",
    validate="many_to_one",
)

masked_df.head()

0: Processing AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry
1: Processing AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry
2: Processing AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry
3: Processing AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry
4: Processing AES-GCM-SIV-proof/proof/cryptol-specs/Common/AES.cry
5: Processing AES-GCM-SIV-proof/proof/asm/cryptol/Asm128.cry
6: Processing AES-GCM-SIV-proof/proof/asm/cryptol/AES128_GCM_SIV.cry
7: Processing AES-GCM-SIV-proof/proof/asm/cryptol/X86.cry
8: Processing aws-lc-verification/cryptol-specs/Common/Field.cry
9: Processing aws-lc-verification/cryptol-specs/Common/mod_arith.cry
10: Processing aws-lc-verification/cryptol-specs/Common/ModDivZ.cry
11: Processing aws-lc-verification/cryptol-specs/Common/mul_java.cry
12: Processing aws-lc-verification/cryptol-specs/Common/Set.cry
13: Processing aws-lc-verification/cryptol-specs/Common/Morphism.cry
14: Processing aws-lc-verification/cryptol-specs/Common/bv.cry
15: Processing aws-lc-verificat

Unnamed: 0,filename,filetype,json_path,def_name,def_params,masked_source,target_definition,hole_name,content,num_declarations,total_mcc
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,/Users/josh/Automated_Reasoning_for_Cryptograp...,encrypt,[],module AES where\n\nimport `Common::AES\n\ntyp...,encrypt = aesEncrypt`{Mode = m}\n,/* Finish this definition. */,module AES where\n\nimport `Common::AES\n\ntyp...,4,4
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,/Users/josh/Automated_Reasoning_for_Cryptograp...,aes_final_round128,[],module AES128 where\n\nimport `Common::AES\nim...,aes_final_round128 = AES::aes_final_round`{0}\n,/* Finish this definition. */,module AES128 where\n\nimport `Common::AES\nim...,6,6
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,/Users/josh/Automated_Reasoning_for_Cryptograp...,aes_final_round256,[],module AES256 where\n\nimport `Common::AES\nim...,aes_final_round256 = AES::aes_final_round`{2}\n,/* Finish this definition. */,module AES256 where\n\nimport `Common::AES\nim...,6,6
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,/Users/josh/Automated_Reasoning_for_Cryptograp...,gf28Pow,"[k, n]",type Nb = 4\ntype State = [4][Nb]...,gf28Pow n k = pow k\n where sq x = gf28Mul...,/* Finish this definition. */,type Nb = 4\ntype State = [4][Nb]...,17,23
4,AES-GCM-SIV-proof/proof/cryptol-specs/Common/A...,cryptol,/Users/josh/Automated_Reasoning_for_Cryptograp...,gf28Pow,"[k, n]",module Common::AES where\n\nparameter\n type ...,"gf28Pow (n, k) = pow k\n where sq x = gf28...",/* Finish this definition. */,module Common::AES where\n\nparameter\n type ...,53,72


In [None]:
from src.agent.generate import ModelConfig, TemplateBundle, iter_call_masked_instruction_pydantic_ai, build_prompt_call_pydantic_ai


# Path handed to the generator as file_cache_path; its parent directory is
# created up front if it does not exist yet.
file_cache_path = Path(f"cache/alpaca_instruct_cache/SFT_{VARIATION}_Masked_Instruction_{VERSION}.jsonl")
file_cache_path.parent.mkdir(exist_ok=True, parents=True)

# Model identifier and retry count passed to the generation helper.
modelConfig = ModelConfig(model="openai:gpt-5.1", retries=2)

# Template file names for the system and user prompts.
template = TemplateBundle(
    spec_system="system_spec_masked_instruction.j2",
    spec_user="user_spec_masked_instruction.j2",
)

# Run the generation helper over every row of masked_df and preview the result.
result = iter_call_masked_instruction_pydantic_ai(
    masked_df,
    model_cfg=modelConfig,
    spec_templates=template,
    input_mode="none",
    file_cache_path=file_cache_path,
)
result.head()

Processing row 0 / 23: AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry
File path: AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry

Relevant Cryptol knowledge (for your analysis only — do not quote verbatim and do not include any code in the instruction):
-----8<-----
SOURCE: /Users/josh/Automated_Reasoning_for_Cryptography/DataPreprocess/text/Cryptol-Reference-Manual-MD/Foreign-Function-Interface.md
### Quick reference


| Cryptol type (or kind) | C argument type(s) | C return type | C output argument type(s) |
|----|----|----|----|
| `#` | `size_t` | N/A | N/A |
| `Bit` | `uint8_t` | `uint8_t` | `uint8_t*` |
| `[K]Bit` where `0  <= K <= 8` | `uint8_t` | `uint8_t` | `uint8_t*` |
| `[K]Bit` where `8  <  K <= 16` | `uint16_t` | `uint16_t` | `uint16_t*` |
| `[K]Bit` where `16 <  K <= 32` | `uint32_t` | `uint32_t` | `uint32_t*` |
| `[K]Bit` where `32 <  K <= 64` | `uint64_t` | `uint64_t` | `uint64_t*` |
| `Float32` | `float` | `float` | `float*` |
| `Float64` | `double` | `double` | `double*` 

Unnamed: 0,filename,filetype,split,avg_mcc_difficulty_bucket,instruction,input,output,masked_source,target_definition
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,,,In the given Cryptol module implementing AES u...,,,module AES where\n\nimport `Common::AES\n\ntyp...,encrypt = aesEncrypt`{Mode = m}\n
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,,,"In the given AES128 Cryptol module, define the...",,,module AES128 where\n\nimport `Common::AES\nim...,aes_final_round128 = AES::aes_final_round`{0}\n
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,,,In the provided Cryptol module implementing AE...,,,module AES256 where\n\nimport `Common::AES\nim...,aes_final_round256 = AES::aes_final_round`{2}\n
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,,,In the provided Cryptol AES-related specificat...,,,type Nb = 4\ntype State = [4][Nb]...,gf28Pow n k = pow k\n where sq x = gf28Mul...
4,AES-GCM-SIV-proof/proof/cryptol-specs/Common/A...,cryptol,,,In the Cryptol module implementing AES over GF...,,,module Common::AES where\n\nparameter\n type ...,"gf28Pow (n, k) = pow k\n where sq x = gf28..."


In [None]:

import textwrap

WRAP = 80         # column width for wrapped instruction text and the "=" rules
SAMPLE_SIZE = 20  # maximum number of examples to print

# Cap the request at the frame length: DataFrame.sample raises ValueError when
# asked for more rows than exist (sampling without replacement).
result_sample = (
    result
    .sample(min(SAMPLE_SIZE, len(result)), random_state=42)
    .reset_index(drop=True)
)

for idx, row in result_sample.iterrows():
    print(f"=== Example {idx}: {row['filename']} ===")

    # A missing instruction (None or NaN) is shown as empty text. NaN is
    # truthy, so a plain `value or ""` fallback would print the string "nan".
    raw_instr = row["instruction"]
    instr = "" if pd.isna(raw_instr) else str(raw_instr)

    # Wrap each line of the instruction separately and join the wrapped
    # lines with a blank line between them.
    wrapped_instr = "\n\n".join(
        textwrap.fill(p, width=WRAP, break_long_words=False, break_on_hyphens=False)
        for p in instr.splitlines()
    )
    print(f"Instruction:\n{wrapped_instr}\n")
    print(f"Source Code:\n{'=' * WRAP}\n{row['masked_source']}\n{'=' * WRAP}")



=== Example 0: AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry ===
Instruction:
In the given Cryptol module implementing AES using Common::AES, define the
encrypt function for keys of length k and 128-bit blocks, using the appropriate
Common::AES encryption primitive parameterized by Mode = m, consistent with
expandKey and encryptWithSchedule. Provide only the encrypt definition and
briefly explain how it relates to key expansion and scheduling.

Source Code:
module AES where

import `Common::AES

type constraint ValidKey k m = (k == 128 + m * 64, 2 >= m)

type ExpandedKey m = KeySchedule m

encrypt : {k,m} ValidKey k m => [k] -> [128] -> [128]
encrypt = /* Finish this definition. */

expandKey : {k,m} ValidKey k m => [k] -> ExpandedKey m
expandKey = ExpandKey`{Mode = m}

encryptWithSchedule : {k,m} ValidKey k m => ExpandedKey m -> [128] -> [128]
encryptWithSchedule = aesEncryptWithSchedule`{Mode = m}

property test k pt = encrypt k pt == encryptWithSchedule (expandKey k) pt
=== Example 

In [7]:
from src.data_s.mcc_tools import  *

# Number of leading files whose hardest definition is printed.
N_PREVIEW = 7

# head() bounds the loop by position, so stopping does not depend on a
# particular index label being present in source_code_df.
for _, row in source_code_df.head(N_PREVIEW).iterrows():
    # Load the JSON referenced by this row and print what
    # get_hardest_definition returns for it, framed by "=" rules.
    mcc = load_json(row["json_path"])
    print(f"{row['filename']}\n{'=' * 80}")
    print(get_hardest_definition(mcc))
    print(f"{'=' * 80}")

AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry
{'kind': 'declaration', 'locals': [], 'mcc': {'edges': [{'from': 0, 'kind': 'control', 'label': None, 'to': 1}, {'from': 1, 'kind': 'control', 'label': None, 'to': 2}], 'entry': 0, 'exit': 2, 'nodes': [{'details': [], 'id': 0, 'kind': 'entry', 'label': 'encrypt'}, {'details': [], 'id': 1, 'kind': 'op', 'label': 'aesEncrypt'}, {'details': [], 'id': 2, 'kind': 'exit', 'label': 'exit'}]}, 'name': 'encrypt', 'params': [], 'references': [{'op': 'ValidKey'}, {'op': 'aesEncrypt'}, {'op': 'k'}, {'op': 'm'}], 'signature': '{k, m} (ValidKey k m) => [k] -> [128] -> [128]'}
AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry
{'kind': 'declaration', 'locals': [], 'mcc': {'edges': [{'from': 0, 'kind': 'control', 'label': None, 'to': 1}, {'from': 1, 'kind': 'control', 'label': None, 'to': 2}], 'entry': 0, 'exit': 2, 'nodes': [{'details': [], 'id': 0, 'kind': 'entry', 'label': 'aes_final_round128'}, {'details': [], 'id': 1, 'kind': 'import', 'label': 'AES::ae

[]