In [1]:
from pathlib import Path 
import datetime
import os, dotenv
# Load variables from a local .env file (python-dotenv) into the process environment.
dotenv.load_dotenv()

# Later cells use paths relative to the project root ("data/...", "cache/..."),
# so the working directory is switched to the directory named by PYTHONPATH.
project_root = os.getenv("PYTHONPATH")
if project_root is None:
    # Path(None) would otherwise fail with an opaque
    # "expected str, bytes or os.PathLike object, not NoneType" TypeError.
    raise RuntimeError(
        "PYTHONPATH is not set; define it in the environment or in the .env file "
        "so the notebook can change into the project root."
    )
os.chdir(Path(project_root).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from dotenv import load_dotenv

In [3]:
# Dataset selectors: both are interpolated into the input/output file names below.
VARIANT = "nomods"
VERSION = "v2-0"


def _label_filetype(extension):
    """Translate a raw file extension into the label used in this notebook.

    'cry' becomes 'cryptol', 'saw' stays 'saw', and anything else becomes 'text'.
    """
    if extension == 'cry':
        return 'cryptol'
    if extension == 'saw':
        return 'saw'
    return 'text'


# Source files, one JSON record per line; every row is tagged as 'supervised'.
source_files_df = pd.read_json(
    f"data/training_datasets/some_verified_{VARIANT}_{VERSION}.jsonl", lines=True
)
source_files_df['set'] = 'supervised'

# Per-file metrics; only the 'num_tokens_model' column is joined in (by filename).
token_count_df = pd.read_csv(f"data/{VARIANT}_file_metrics_{VERSION}.csv")
token_columns = token_count_df[['filename', 'num_tokens_model']]

# Join the token counts, relabel the file types, and drop everything labelled 'text'.
preformat_data_df = (
    source_files_df
    .merge(token_columns, on='filename', how='left')
    .assign(filetype=lambda frame: frame['filetype'].apply(_label_filetype))
    .loc[lambda frame: frame["filetype"] != "text"]
    .reset_index(drop=True)
)

preformat_data_df.head()


Unnamed: 0,filename,filetype,content,n_imports_original,n_imports_final,set,num_tokens_model
0,cryptol/examples/MiniLock/prim/SHA256.cry,cryptol,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ...",,,supervised,2683
1,cryptol/examples/MiniLock/prim/Blake2s.cry,cryptol,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ...",,,supervised,21842
2,cryptol/examples/MiniLock/prim/TestHMAC.cry,cryptol,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ...",,,supervised,254
3,cryptol/examples/MiniLock/prim/bv.cry,cryptol,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ...",,,supervised,605
4,cryptol_slices/cryptol-specs/Primitive/Symmetr...,cryptol,"shift : {d} (fin d, d >= 1) => [d] -> Bit -> [...",0.0,0.0,supervised,56


In [4]:
from src.preprocessing.sft_instruct_preprocess import (
    alpaca_df_to_qwen_messages,
    build_prompt_call_openai_structured,
    iter_call_openai_structured,
)

# Path handed to iter_call_openai_structured as its cache file (presumably where
# generated rows are stored -- confirm against the helper). Its folder is created
# up front if missing.
file_cache_path = f"cache/alpaca_instruct_cache/SFT_{VARIANT}_source_code_{VERSION}.jsonl"
out_path = Path(file_cache_path)
out_path.parent.mkdir(parents=True, exist_ok=True)

# Seeded one-row sample, drawn before the Cryptol-only filter below is applied.
test_df = preformat_data_df.sample(n=1, random_state=42).reset_index(drop=True)
input_mode = "full"

# Keep only the rows labelled 'cryptol'.
is_cryptol = preformat_data_df["filetype"] == "cryptol"
preformat_data_df = preformat_data_df[is_cryptol].reset_index(drop=True)

result = iter_call_openai_structured(
    preformat_data_df,
    "gpt-5.1",
    input_mode,
    file_cache_path,
)
result.head()

Key  not found in CRYPTOL_VECTOR_STORE_ID.


Response:

ParsedResponse[AlpacaRow](id='resp_09cecf61a8f92b280069483f2bd6c881a1a83baeca4d9f3871', created_at=1766342443.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-5.1-2025-11-13', object='response', output=[ParsedResponseOutputMessage[AlpacaRow](id='msg_09cecf61a8f92b280069483f2d5c7881a19f8a0834f52b5c0b', content=[ParsedResponseOutputText[AlpacaRow](annotations=[], text='{"instruction":"Define a Cryptol module implementing the SHA-256 hash function (including preprocessing, message schedule, compression, constants, and initial value) with top-level function `SHA256 : {a} (fin a, 64 >= width (8*a)) => [a][8] -> [256]` and a property `katsPass` that checks this implementation against standard SHA-256 known-answer test vectors.","input":"","output":""}', type='output_text', logprobs=[], parsed=AlpacaRow(instruction='Define a Cryptol module implementing the SHA-256 hash function (including preprocessing, message schedule, compression, constants, and

Unnamed: 0,filename,filetype,set,instruction,input,output,content
0,cryptol/examples/MiniLock/prim/SHA256.cry,cryptol,supervised,Define a Cryptol module implementing the SHA-2...,,,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ..."
1,cryptol/examples/MiniLock/prim/Blake2s.cry,cryptol,supervised,Write a Cryptol module implementing the BLAKE2...,,,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ..."
2,cryptol/examples/MiniLock/prim/TestHMAC.cry,cryptol,supervised,Write a Cryptol module `TestHMAC` that imports...,,,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ..."
3,cryptol/examples/MiniLock/prim/bv.cry,cryptol,supervised,Write a Cryptol specification for the `bv` mod...,,,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ..."
4,cryptol_slices/cryptol-specs/Primitive/Symmetr...,cryptol,supervised,"Define the function `shift : {d} (fin d, d >= ...",,,"shift : {d} (fin d, d >= 1) => [d] -> Bit -> [..."


In [6]:
# Options for converting the Alpaca-style frame into chat messages.
# No system_prompt is passed, so whatever the helper uses by default applies.
qwen_message_options = {
    "output": "content",
    "drop_input": True,
    "include_filename_in_user": False,
}

# NOTE: `result` is overwritten with the converted frame, so re-running this cell
# on its own feeds the already-converted frame back into the helper.
result = alpaca_df_to_qwen_messages(result, **qwen_message_options)

result.head()

Unnamed: 0,messages,filename,filetype,set
0,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/MiniLock/prim/SHA256.cry,cryptol,supervised
1,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/MiniLock/prim/Blake2s.cry,cryptol,supervised
2,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/MiniLock/prim/TestHMAC.cry,cryptol,supervised
3,"[{'role': 'system', 'content': 'Return exactly...",cryptol/examples/MiniLock/prim/bv.cry,cryptol,supervised
4,"[{'role': 'system', 'content': 'Return exactly...",cryptol_slices/cryptol-specs/Primitive/Symmetr...,cryptol,supervised


In [7]:

# Put the identifying columns first and the chat messages last.
# No merge with preformat_data_df is needed here: `result` already carries
# filename/filetype/set, and a left merge on the key column alone would add no
# columns while multiplying rows for any filename that occurs more than once
# on both sides.
reorder_df = (
    result[["filename", "filetype", "set", "messages"]]
    .reset_index(drop=True)
)
reorder_df.head()


Unnamed: 0,filename,filetype,set,messages
0,cryptol/examples/MiniLock/prim/SHA256.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
1,cryptol/examples/MiniLock/prim/Blake2s.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
2,cryptol/examples/MiniLock/prim/TestHMAC.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
3,cryptol/examples/MiniLock/prim/bv.cry,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."
4,cryptol_slices/cryptol-specs/Primitive/Symmetr...,cryptol,supervised,"[{'role': 'system', 'content': 'Return exactly..."


In [None]:
# Print every example in a readable form for a manual spot check.
for _, row in reorder_df.iterrows():
    print("Filename:")
    print(row['filename'])
    messages = row['messages']
    print(f"System: {messages[0]['content']}")
    print(f"User: \n{messages[1]['content']}")
    print("Output:")
    # reorder_df has no 'output' column (only filename, filetype, set, messages),
    # so row['output'] would raise KeyError. The turns after system/user are
    # printed instead -- assumed to be the assistant target; TODO confirm.
    for turn in messages[2:]:
        print(turn['content'])
    print("="*50)

In [8]:
# Write the complete message-format frame ('set' column included) as JSON Lines.
sft_messages_path = f"data/training_datasets/SFT_message_format_{VARIANT}_{VERSION}.jsonl"
reorder_df.to_json(sft_messages_path, orient="records", lines=True)

In [9]:
# Training split: every row whose 'set' is not 'holdout', without the 'set' column.
not_holdout = reorder_df["set"] != "holdout"
training_df = (
    reorder_df[not_holdout]
    .reset_index(drop=True)
    .drop(columns=['set'])
)
training_df.head()

Unnamed: 0,filename,filetype,messages
0,cryptol/examples/MiniLock/prim/SHA256.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
1,cryptol/examples/MiniLock/prim/Blake2s.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
2,cryptol/examples/MiniLock/prim/TestHMAC.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
3,cryptol/examples/MiniLock/prim/bv.cry,cryptol,"[{'role': 'system', 'content': 'Return exactly..."
4,cryptol_slices/cryptol-specs/Primitive/Symmetr...,cryptol,"[{'role': 'system', 'content': 'Return exactly..."


In [10]:
# Write the training split (filename, filetype, messages) as JSON Lines.
training_messages_path = f"data/training_datasets/{VARIANT}_message_format_{VERSION}.jsonl"
training_df.to_json(training_messages_path, orient="records", lines=True)