In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Parse the YAML settings from config.yaml (resolved against the current
# working directory) into the notebook-wide `config` mapping.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Load environment variables via python-dotenv before anything reads them.
dotenv.load_dotenv()

# Switch the working directory to the location stored under "pythonpath"
# ("~" is expanded first), so later relative paths resolve against it.
project_root = Path(config["pythonpath"]).expanduser()
os.chdir(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
# Dataset variant and version; together they select which JSONL files are read.
VARIATION = "nocomments"
VERSION = config["version"]


def _filetype_label(extension):
    """Translate a raw 'filetype' value into the label kept in the table.

    'cry' becomes 'cryptol', 'saw' stays 'saw', and every other value is
    labelled 'text'.
    """
    if extension == 'cry':
        return 'cryptol'
    if extension == 'saw':
        return 'saw'
    return 'text'


# One record per line (JSON Lines) in both input files.
source_code_df = pd.read_json(
    f"data/clean_datasets/{VARIATION}_{VERSION}.jsonl",
    lines=True,
)
source_code_df['filetype'] = source_code_df['filetype'].apply(_filetype_label)

mcc_df = pd.read_json(
    f"data/clean_datasets/mcc_stats_{VARIATION}_{VERSION}.jsonl",
    lines=True,
)

# Columns present in the stats table but absent from the source table;
# printed so the reader can see exactly what the merge adds.
add_columns = mcc_df.columns.difference(source_code_df.columns).tolist()
print(add_columns)

# Left join on "filename" keeps every source row; validate="one_to_one" makes
# pandas raise if "filename" is duplicated on either side.
mcc_extra_df = mcc_df[["filename", *add_columns]]
source_code_df = source_code_df.merge(
    mcc_extra_df,
    how="left",
    on="filename",
    validate="one_to_one",
)


['avg_mcc', 'imports', 'imports_count', 'json_path', 'max_mcc', 'num_declarations', 'num_declarations_with_mcc', 'num_definitions', 'num_types', 'total_mcc']


In [4]:
from src.agent.generate import ModelConfig, TemplateBundle, iter_call_pydantic_ai, build_prompt_call_pydantic_ai

# Cache file handed to iter_call_pydantic_ai below; its parent directories are
# created up front (no error if they already exist).
file_cache_path = Path(f"cache/alpaca_instruct_cache/SFT_{VARIATION}_source_code_{VERSION}.jsonl")
file_cache_path.parent.mkdir(parents=True, exist_ok=True)

# Tip: for a quick trial run, pass a small sample instead of the full table,
# e.g. source_code_df.sample(10, random_state=42).reset_index(drop=True).

# Model identifier and retry count for the generation calls.
modelConfig = ModelConfig(model="openai:gpt-5.1", retries=2)

# Template file names for the two prompt parts
# (presumably the system and user messages -- confirm in src.agent.generate).
template = TemplateBundle(
    spec_user="user_spec_simple_gen.j2",
    spec_system="system_spec_simple_gen.j2",
)

# Run the generation over the whole source table.
result = iter_call_pydantic_ai(
    source_code_df,
    file_cache_path=file_cache_path,
    input_mode="none",
    spec_templates=template,
    model_cfg=modelConfig,
)
result.head()

Processing row 0 / 832: AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry
Processing row 10 / 832: aws-lc-verification/cryptol-specs/Common/ModDivZ.cry
Processing row 20 / 832: aws-lc-verification/cryptol-specs/Primitive/Symmetric/Cipher/Block/Cipher.cry
Processing row 30 / 832: aws-lc-verification/cryptol-specs/Primitive/Symmetric/Cipher/Block/Modes/CBC.cry
Processing row 40 / 832: aws-lc-verification/cryptol-specs/Primitive/Symmetric/Cipher/Block/AES/Round.cry
Processing row 50 / 832: aws-lc-verification/cryptol-specs/Common/EC/p384_field.cry
Processing row 60 / 832: aws-lc-verification/cryptol-specs/McEliece_KEM/spec/Key_Generation.cry
Processing row 70 / 832: aws-lc-verification/SAW/spec/AES_KW/X86.cry
Processing row 80 / 832: aws-lc-verification/cryptol-specs-aes-gcm/Primitive/Symmetric/Cipher/Block/Modes/TDES_CBC.cry
Processing row 90 / 832: aws-lc-verification/cryptol-specs-aes-gcm/Primitive/Keyless/Hash/SHA3/SHA3_224.cry
Processing row 100 / 832: aws-lc-verification/cryptol-specs-a

Unnamed: 0,filename,filetype,set,instruction,input,output,content
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cryptol,,Write a Cryptol module named AES that defines ...,,,module AES where\n\nimport `Common::AES\n\ntyp...
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cryptol,,Write a Cryptol module named AES128 that expos...,,,module AES128 where\n\nimport `Common::AES\nim...
2,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cryptol,,Write a Cryptol module named AES256 that defin...,,,module AES256 where\n\nimport `Common::AES\nim...
3,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cryptol,,Write a detailed Cryptol specification that ca...,,,type Nb = 4\ntype State = [4][Nb]...
4,AES-GCM-SIV-proof/proof/cryptol-specs/Common/A...,cryptol,,Write a Cryptol module that defines a paramete...,,,module Common::AES where\n\nparameter\n type ...


In [5]:

import textwrap

WRAP = 80          # maximum width of the wrapped instruction text and of the rulers
SAMPLE_SIZE = 10   # number of generated examples to print for manual review
SAMPLE_SEED = 42   # fixed seed so the same rows are shown on every run

# Clamp the sample size: DataFrame.sample raises ValueError when asked for more
# rows than the frame holds (e.g. after a small trial run).
result_sample = result.sample(
    n=min(SAMPLE_SIZE, len(result)),
    random_state=SAMPLE_SEED,
).reset_index(drop=True)

for idx, row in result_sample.iterrows():
    print(f"=== Example {idx}: {row['filename']} ===")

    # A missing instruction can arrive as None or as NaN. NaN is truthy, so the
    # common `value or ""` guard would let it through and print the text "nan";
    # pd.isna covers None, NaN and pd.NA alike.
    raw_instr = row["instruction"]
    instr = "" if pd.isna(raw_instr) else str(raw_instr)

    # Wrap each original line on its own and separate the pieces with a blank
    # line; long words and hyphenated terms are never split.
    wrapped_instr = "\n\n".join(
        textwrap.fill(p, width=WRAP, break_long_words=False, break_on_hyphens=False)
        for p in instr.splitlines()
    )
    print(f"Instruction:\n{wrapped_instr}\n")
    print(f"Source Code:\n{'=' * WRAP}\n{row['content']}\n{'=' * WRAP}")


=== Example 0: cryptol_slices/cryptol/examples/funstuff/FoxChickenCorn/010_stuffOnlyMovedWithFarmer.cry ===
Instruction:
Write a detailed Cryptol specification for a state-transition predicate
stuffOnlyMovedWithFarmer in the classic fox–chicken–corn river crossing puzzle.
Represent one river bank as a 4-bit word where each bit indicates the presence
of farmer, fox, chicken, and corn. A BankState should be a record with left and
right banks. The predicate takes two BankState values (before and after a move)
and returns true exactly when: the farmer changes banks, at most one non-farmer
object moves with the farmer, and only items on the farmer’s bank may move.
Include any helper predicates (e.g., farmerHere, moveFollowsRules, popCount) and
appropriate polymorphic typing constraints.

Source Code:
type OneBank = [4]
type BankState = {left : OneBank, right : OneBank}

farmer = 0x1

stuffOnlyMovedWithFarmer : BankState -> BankState -> Bit
stuffOnlyMovedWithFarmer b b' =
  if farmerHere b.l