In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Parse the notebook settings from the YAML file in the current directory.
config = yaml.safe_load(Path("config.yaml").read_text())

# Environment variables are loaded before the working directory changes,
# so the .env lookup still starts from the notebook's original location.
dotenv.load_dotenv()

# Switch to the configured project root so the relative "data/..." and
# "src...." references used further down resolve from there.
project_root = Path(config["pythonpath"]).expanduser()
os.chdir(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.comment_process import normalize_blanklines, strip_cryptol_comments_all

In [3]:
# Dataset version tag taken from the notebook configuration.
VERSION = config["version"]

# Inputs: the full verified sources and the verified slices for this version.
JSONL_PATH_ORIGINAL = f"data/all_sources_verified_{VERSION}.jsonl"
JSONL_PATH_SLICES = f"data/some_slices_verified_{VERSION}.jsonl"

original_df = pd.read_json(JSONL_PATH_ORIGINAL, lines=True)
slices_df = pd.read_json(JSONL_PATH_SLICES, lines=True)

# Only these three columns of the slices are appended to the originals;
# the combined frame gets a fresh 0..n-1 index.
slice_columns = ["filename", "filetype", "content"]
nomod_df = pd.concat(
    [original_df, slices_df[slice_columns]],
    ignore_index=True,
)

In [4]:
# Preview the first rows of the combined, unmodified dataset.
nomod_df.head()

Unnamed: 0,filename,filetype,content
0,cryptol/examples/AES.cry,cry,// Cryptol AES Implementation\n// Copyright (c...
1,cryptol/examples/width.cry,cry,x : [8]\nx = length (252 : [8])\n
2,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([..."
3,cryptol/examples/AE.cry,cry,// WORK IN PROGRESS\n\n/*\nImplementation of t...
4,cryptol/examples/Cipher.cry,cry,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ..."


In [5]:
# Derive the comment-free variant: each file's content goes through
# strip_cryptol_comments_all first and normalize_blanklines second.
# `assign` returns a new frame, so nomod_df itself is left unchanged.
nocomment_df = nomod_df.assign(
    content=(
        nomod_df["content"]
        .apply(strip_cryptol_comments_all)
        .apply(normalize_blanklines)
    )
)

In [6]:
# Spot-check the first 20 files after comment stripping.
nocomment_df.head(20)

Unnamed: 0,filename,filetype,content
0,cryptol/examples/AES.cry,cry,module AES where\n\ntype AES128 = 4\ntype AES1...
1,cryptol/examples/width.cry,cry,x : [8]\nx = length (252 : [8])\n
2,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([..."
3,cryptol/examples/AE.cry,cry,module AE where\n\nparameter\n type A : * ...
4,cryptol/examples/Cipher.cry,cry,module Cipher where\n\ntype Cipher KeySize Blo...
5,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...
6,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...
7,cryptol/examples/DES.cry,cry,module DES where\n\nimport Cipher\n\nDES : Cip...
8,cryptol/examples/builtin_lifting.cry,cry,"x = [True,False]\ny = [False,True]\n\nproperty..."
9,cryptol/examples/inflist.cry,cry,"a = [1 ... ]\nb = [1,2 ... ]\nc = [1 .. 5]\nd ..."


In [7]:
# Spot-check the last 20 files after comment stripping.
nocomment_df.tail(20)

Unnamed: 0,filename,filetype,content
15555,cryptol/cryptol-remote-api/python/tests/crypto...,cry,q : Rational\nq = ratio 5 4\n
15556,cryptol/cryptol-remote-api/python/tests/crypto...,cry,"t : (Bit, Integer)\nt = (False, 7)\n"
15557,cryptol/cryptol-remote-api/python/tests/crypto...,cry,id : {n} (fin n) => [n] -> [n]\nid a = a\n
15558,cryptol/cryptol-remote-api/python/tests/crypto...,cry,w : [16]\nw = 42\n
15559,cryptol/cryptol-remote-api/python/tests/crypto...,cry,"r : {xCoord : [32], yCoord : [32]}\nr = {xCoor..."
15560,cryptol/cryptol-remote-api/python/tests/crypto...,cry,m : Z 12\nm = 6\n
15561,cryptol/cryptol-remote-api/python/tests/crypto...,cry,z : Integer\nz = 420000\n
15562,cryptol/cryptol-remote-api/python/tests/crypto...,cry,type Q = Rational\n\nfortyTwo : Q\nfortyTwo = ...
15563,cryptol/cryptol-remote-api/python/tests/crypto...,cry,id : {n} (fin n) => [n] -> [n]\nid a = a\n
15564,cryptol/cryptol-remote-api/python/tests/crypto...,cry,f : [8] -> [2][8]\nf x = repeat x\n


In [8]:
# Label every row of the comment-free dataset with its variant name,
# then show the first rows including the new column.
nocomment_df = nocomment_df.assign(variant="without_comments")
nocomment_df.head()

Unnamed: 0,filename,filetype,content,variant
0,cryptol/examples/AES.cry,cry,module AES where\n\ntype AES128 = 4\ntype AES1...,without_comments
1,cryptol/examples/width.cry,cry,x : [8]\nx = length (252 : [8])\n,without_comments
2,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([...",without_comments
3,cryptol/examples/AE.cry,cry,module AE where\n\nparameter\n type A : * ...,without_comments
4,cryptol/examples/Cipher.cry,cry,module Cipher where\n\ntype Cipher KeySize Blo...,without_comments


In [9]:

# Destination folder for the exported datasets; created on first use.
OUTPUT_DIR = Path("data/good_syntax")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# One JSON object per line; non-ASCII characters are written as-is
# rather than as \uXXXX escapes.
nocomments_path = OUTPUT_DIR / f"verified_nocomments_{VERSION}.jsonl"
nocomment_df.to_json(nocomments_path, orient="records", lines=True, force_ascii=False)


In [None]:
from src.preprocessing.comment_extractor import extract_strip_cry_comments

# Create Hybrid Dataset
# Every file in nomod_df is passed through extract_strip_cry_comments, which
# returns (a) a list of per-comment records and (b) a file record holding the
# rewritten content. The comment records are collected into comment_df and the
# file records, tagged variant="hybrid", into hybrid_df.
# NOTE(review): presumably the helper asks the LLM which comments to keep and
# stores its decisions in the cache file -- confirm in comment_extractor.

# Single definition of the cache location: the directory that gets created and
# the path handed to the helper can no longer drift apart.
DECISION_CACHE_PATH = "cache/GPT_comment_decisions_cache.jsonl"
LLM_MODEL_NAME = "gpt-oss:20b"
HYBRID_COLUMNS = ["filename", "filetype", "content", "variant"]

out_path = Path(DECISION_CACHE_PATH)
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing

comment_rows = []
dataset_rows = []
# itertuples avoids building a Series per row and drops the unused index.
for row in nomod_df.itertuples(index=False):
    comments, file_record_ = extract_strip_cry_comments(
        filename=row.filename,
        content=row.content,
        llm_model_name=LLM_MODEL_NAME,
        decision_cache_path=DECISION_CACHE_PATH,
    )
    comment_rows.extend(comments)
    dataset_rows.append(
        {
            "filename": file_record_["filename"],
            # filetype is carried over from the input row, not from the helper.
            "filetype": row.filetype,
            "content": file_record_["content"],
            "variant": "hybrid",
        }
    )

comment_df = pd.DataFrame(comment_rows)
# Pinning the columns keeps the schema stable even when no rows were produced.
hybrid_df = pd.DataFrame(dataset_rows, columns=HYBRID_COLUMNS)



In [None]:
# Preview the per-comment records collected while building the hybrid dataset.
comment_df.head()

Unnamed: 0,filename,sha1,comment,keep,snippet
0,AES-GCM-SIV-proof/proof/ref-128/common.saw,c3d48b34470014b7f5ed9e9d4748d1e788e93552,// To little endian byte representation,True,toBytes : {n} (fin n) => [8 * n] -> [n][8]\nto...
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,ceef442edab01d684b21f326c9c65fd1eb9d980c,"// A litle endian bytes, to a bit vector.",True,fromBytes : {n} (fin n) => [n][8] -> [8 * n]\n...
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,27352c3f8e8a5db2e229750ff9c73666cb5e0c1e,"// Our algorithm is grouped by rows, but the C...",True,toState : [4][32] -> State\ntoState cols = tra...
3,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,3f2371e7338a838474a148d835d1291aaddb4fda,// Our state to the C represenation of the state.,True,fromState : State -> [4][32]\nfromState rows =...
4,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,aefdf86c5134797994323a3d6a54641ba572ced3,// Key schedule representation,True,flatSchedule : KeySchedule -> [44][32]\nflatSc...


In [None]:
# Preview the first rows of the hybrid dataset.
hybrid_df.head()

Unnamed: 0,filename,filetype,content,variant
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,saw,let {{\nmul_result : [64] -> [64] -> [2][64]\n...,hybrid
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,saw,let allocArg name ty = crucible_fresh_var name...,hybrid
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,saw,"let {{\n// Our algorithm is grouped by rows, b...",hybrid
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,saw,"include ""common.saw"";\n\n// C code that needs ...",hybrid
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...,hybrid


In [None]:
# Persist the hybrid dataset next to the other exported variants and the
# per-comment records under data/.
HYBRID_NAME = f"verified_hybrid_{VERSION}.jsonl"
COMMENT_STAT_PATH = f"data/GPTcomment_stats_{VERSION}.jsonl"

# force_ascii=False matches the no-comments export: non-ASCII characters in
# retained comments are written verbatim instead of as \uXXXX escapes, so all
# dataset variants share the same on-disk encoding.
hybrid_df.to_json(
    OUTPUT_DIR / HYBRID_NAME,
    orient="records",
    lines=True,
    force_ascii=False,
)
comment_df.to_json(
    COMMENT_STAT_PATH,
    orient="records",
    lines=True,
    force_ascii=False,
)

