In [1]:
from pathlib import Path 
import os, dotenv
# Load variables from a local .env file (if any) into the process environment.
dotenv.load_dotenv()

# Later cells use relative paths such as "data/..." and "cache/...", so the
# working directory is switched to the location stored in PYTHONPATH
# (presumably the repository root -- confirm against the project's .env).
# Fail with an explicit message when the variable is missing instead of the
# opaque TypeError that Path(None) would raise.
_workdir = os.getenv("PYTHONPATH")
if _workdir is None:
    raise RuntimeError(
        "PYTHONPATH is not set; define it in the environment or in a .env file "
        "so the notebook can change to its working directory."
    )
os.chdir(Path(_workdir).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.comment_process import normalize_blanklines, strip_cryptol_comments_all

In [3]:
# Location of the input dataset, stored as JSON Lines (one record per line).
jsonl_path = "data/training_datasets/verified_nomods.jsonl"

# Parse the file record-by-record into a DataFrame.
nomod_df = pd.read_json(path_or_buf=jsonl_path, lines=True)

In [4]:
# Preview the first rows of the frame loaded from verified_nomods.jsonl.
nomod_df.head()

Unnamed: 0,filename,filetype,content
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,saw,let {{\nmul_result : [64] -> [64] -> [2][64]\n...
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,saw,let allocArg name ty = crucible_fresh_var name...
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,saw,"\nlet {{\n// Our algorithm is grouped by rows,..."
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,saw,"// Conveninet utilities\ninclude ""common.saw"";..."
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...


In [5]:
# Build the comment-free variant as a new frame, leaving nomod_df untouched:
# run strip_cryptol_comments_all over every "content" value, then
# normalize_blanklines over the result, and store that as the "content" column.
nocomment_df = nomod_df.assign(
    content=nomod_df["content"]
    .apply(strip_cryptol_comments_all)
    .apply(normalize_blanklines)
)

In [6]:
# Inspect up to 20 leading rows of the processed frame to spot-check the result.
nocomment_df.head(20)

Unnamed: 0,filename,filetype,content
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,saw,let {{\nmul_result : [64] -> [64] -> [2][64]\n...
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,saw,let allocArg name ty = crucible_fresh_var name...
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,saw,let {{\ntoState : [4][32] -> State\ntoState co...
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,saw,"include ""common.saw"";\n\nc_code <- llvm_load_m..."
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...
5,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cry,module AES128 where\n\nimport `Common::AES\nim...
6,AES-GCM-SIV-proof/proof/cryptol-specs/intrinsi...,cry,module Intrinsics where\n\nimport `Common::AES...
7,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cry,module AES256 where\n\nimport `Common::AES\nim...
8,AES-GCM-SIV-proof/proof/cryptol-specs/TBox.cry,cry,type Nb = 4\ntype State = [4][Nb]...
9,AES-GCM-SIV-proof/proof/ref-256/GCM_SIV_c.saw,saw,AES_256_Key_Expansion <-\n proof\n proving...


In [7]:
# Inspect up to 20 trailing rows of the processed frame as well.
nocomment_df.tail(20)

Unnamed: 0,filename,filetype,content
1310,saw-script/doc/rust-verification-with-saw/code...,saw,"enable_experimental;\n\nm <- mir_load_module ""..."
1311,saw-script/doc/rust-verification-with-saw/code...,saw,"enable_experimental;\n\nm <- mir_load_module ""..."
1312,saw-script/doc/rust-verification-with-saw/code...,saw,"enable_experimental;\n\nm <- mir_load_module ""..."
1313,saw-script/doc/rust-verification-with-saw/code...,saw,enable_experimental;\n\nlet times_two_spec = d...
1314,saw-script/doc/rust-verification-with-saw/code...,saw,"enable_experimental;\n\nm <- mir_load_module ""..."
1315,saw-script/doc/rust-verification-with-saw/code...,saw,"enable_experimental;\n\nm <- mir_load_module ""..."
1316,saw-script/doc/rust-verification-with-saw/code...,saw,enable_experimental;\n\nlet times_two_spec = d...
1317,saw-script/doc/rust-verification-with-saw/code...,saw,enable_experimental;\n\nlet flip_spec = do {\n...
1318,saw-script/doc/rust-verification-with-saw/code...,saw,enable_experimental;\n\nlet index_fail_spec = ...
1319,saw-script/doc/rust-verification-with-saw/code...,saw,enable_experimental;\n\nlet read_ref_spec = do...


In [8]:
# Tag every row with the name of this dataset variant. This adds the column to
# nocomment_df in place, so the frame differs from what earlier cells displayed.
nocomment_df["variant"] = "without_comments"
nocomment_df.head()

Unnamed: 0,filename,filetype,content,variant
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,saw,let {{\nmul_result : [64] -> [64] -> [2][64]\n...,without_comments
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,saw,let allocArg name ty = crucible_fresh_var name...,without_comments
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,saw,let {{\ntoState : [4][32] -> State\ntoState co...,without_comments
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,saw,"include ""common.saw"";\n\nc_code <- llvm_load_m...",without_comments
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...,without_comments


In [9]:
# Persist the comment-free variant as JSON Lines (one record per line),
# writing non-ASCII characters as-is rather than as \uXXXX escapes.
nocomment_df.to_json(
    "data/training_datasets/verified_nocomments.jsonl",
    orient="records",
    lines=True,
    force_ascii=False,
)


In [10]:
from src.preprocessing.comment_extractor import extract_strip_cry_comments

# Create Hybrid Dataset
# Single source of truth for the decision-cache location; out_path is the Path
# view of the same file and is only used to make sure its directory exists.
DECISION_CACHE_PATH = "cache/GPT_comment_decisions_cache.jsonl"
out_path = Path(DECISION_CACHE_PATH)
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing

# Model identifier forwarded to extract_strip_cry_comments for every file.
LLM_MODEL_NAME = "gpt-oss:20b"

comment_rows = []  # comment records returned by the extractor, across all files
dataset_rows = []  # one record per input file, tagged variant="hybrid"

# itertuples avoids constructing a Series per row (as iterrows does) and the
# row index is not needed here.
for row in nomod_df.itertuples(index=False):
    comments, file_record_ = extract_strip_cry_comments(
        filename=row.filename,
        content=row.content,
        llm_model_name=LLM_MODEL_NAME,
        decision_cache_path=DECISION_CACHE_PATH,
    )
    comment_rows.extend(comments)
    # filename/content come from the extractor's record; filetype is carried
    # over unchanged from the input row.
    dataset_rows.append(
        {
            "filename": file_record_["filename"],
            "filetype": row.filetype,
            "content": file_record_["content"],
            "variant": "hybrid",
        }
    )

comment_df = pd.DataFrame(comment_rows)
hybrid_df = pd.DataFrame(dataset_rows)



In [11]:
# Preview the comment records collected from extract_strip_cry_comments.
comment_df.head()

Unnamed: 0,filename,sha1,comment,keep,snippet
0,AES-GCM-SIV-proof/proof/ref-128/common.saw,c3d48b34470014b7f5ed9e9d4748d1e788e93552,// To little endian byte representation,True,toBytes : {n} (fin n) => [8 * n] -> [n][8]\nto...
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,ceef442edab01d684b21f326c9c65fd1eb9d980c,"// A litle endian bytes, to a bit vector.",True,fromBytes : {n} (fin n) => [n][8] -> [8 * n]\n...
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,27352c3f8e8a5db2e229750ff9c73666cb5e0c1e,"// Our algorithm is grouped by rows, but the C...",True,toState : [4][32] -> State\ntoState cols = tra...
3,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,3f2371e7338a838474a148d835d1291aaddb4fda,// Our state to the C represenation of the state.,True,fromState : State -> [4][32]\nfromState rows =...
4,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,aefdf86c5134797994323a3d6a54641ba572ced3,// Key schedule representation,True,flatSchedule : KeySchedule -> [44][32]\nflatSc...


In [12]:
# Preview the per-file records tagged as the "hybrid" variant.
hybrid_df.head()

Unnamed: 0,filename,filetype,content,variant
0,AES-GCM-SIV-proof/proof/ref-128/clmul_emulator...,saw,let {{\nmul_result : [64] -> [64] -> [2][64]\n...,hybrid
1,AES-GCM-SIV-proof/proof/ref-128/common.saw,saw,let allocArg name ty = crucible_fresh_var name...,hybrid
2,AES-GCM-SIV-proof/proof/ref-128/aes_emulation.saw,saw,"let {{\n// Our algorithm is grouped by rows, b...",hybrid
3,AES-GCM-SIV-proof/proof/ref-128/proof.saw,saw,"include ""common.saw"";\n\n// C code that needs ...",hybrid
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...,hybrid


In [13]:
# Output locations for the hybrid dataset and the per-comment records.
HYBRID_PATH = "data/training_datasets/verified_hybrid.jsonl"
COMMENT_STAT_PATH = "data/GPTcomment_stats.jsonl"

# Both frames are written as JSON Lines (one record per line).
# force_ascii=False writes non-ASCII characters as-is instead of \uXXXX escapes,
# matching how verified_nocomments.jsonl is written so that all dataset
# variants serialize text the same way.
hybrid_df.to_json(HYBRID_PATH, orient="records", lines=True, force_ascii=False)
comment_df.to_json(COMMENT_STAT_PATH, orient="records", lines=True, force_ascii=False)

