In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Read the notebook settings from config.yaml. The path is relative, so it is
# resolved against whatever the working directory is when this cell runs.
with open("config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

# Load dotenv settings first, then switch the working directory to the
# configured project root so the relative "data/..." paths used later resolve.
# NOTE(review): because of the chdir, re-running this cell looks for
# config.yaml in the new directory — restart the kernel before re-running.
dotenv.load_dotenv()
project_root = Path(config["pythonpath"]).expanduser()
os.chdir(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [3]:
# Dataset version tag; it selects which raw dump is read and is reused in the
# names of the output files written further down.
VERSION = config["version"]

# Raw sources for this version, stored as JSON Lines (one record per line).
jsonl_path = f"data/all_sources_raw_{VERSION}.jsonl"

df = pd.read_json(path_or_buf=jsonl_path, lines=True)


In [4]:
# Preview the first five raw records (filename, filetype, content).
df.head(n=5)

Unnamed: 0,filename,filetype,content
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,cry,module AES where\n\nimport `Common::AES\n\ntyp...
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,cry,// 128-bit AES\nmodule AES128 where\n\nimport ...
2,AES-GCM-SIV-proof/proof/cryptol-specs/GCM_SIV_...,cry,module GCM_SIV_128 = Common::AES_GCM_SIV where...
3,AES-GCM-SIV-proof/proof/cryptol-specs/intrinsi...,cry,module Intrinsics where\n\nimport `Common::AES...
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,cry,// 256-bit AES\nmodule AES256 where\n\nimport ...


In [5]:
from src.preprocessing.interpreter_process import verify_df_row_with_cryptol
from src.preprocessing.saw_subprocess import run_saw_script, load_saw_results

# Syntax-check every Cryptol (.cry) source by loading it through the Cryptol
# server, collect one result row per file, and save the results as JSON Lines.
saw_results = []  # kept for compatibility; nothing in this cell populates it

# Host directory handed to the verifier as its mount point, and the server URL.
MOUNT_DIR = Path(config["mount_dir"]).expanduser()
MOUNT_DIR.mkdir(parents=True, exist_ok=True)
SERVER_URL = config["cryptol_server_url"]

out_path = Path(f"data/syntax_check_results/cry_syntax_verification_results_{VERSION}.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing

# Read the two columns the loop needs once, instead of materialising a full
# row Series with df.iloc[i] (up to twice) on every iteration.
filenames = df["filename"].tolist()
filetypes = df["filetype"].tolist()
total_files = len(df)

rows = []
for i in range(total_files):
    if i % 10 == 0 and i > 0:
        print(f"Processed {i} of {total_files} files")
    if filetypes[i] != "cry":
        # Only Cryptol sources are verified in this pass; any other file type
        # (e.g. SAW scripts) is skipped and gets no result row.
        continue
    repl_result = verify_df_row_with_cryptol(
        df, i, host_mount_dir=MOUNT_DIR, server_url=SERVER_URL)
    # "load_info" is unpacked into the row next to the filename; in the saved
    # results it provides the load_ok / file / error / file_deps columns.
    rows.append({"filename": filenames[i], **repl_result["load_info"]})

results_df = pd.DataFrame(rows)
results_df.to_json(out_path, lines=True, orient="records")

Processed 10 of 1366 files
Processed 20 of 1366 files
Processed 30 of 1366 files
Processed 40 of 1366 files
Processed 50 of 1366 files
Processed 60 of 1366 files
Processed 70 of 1366 files
Processed 80 of 1366 files
Processed 90 of 1366 files
Processed 100 of 1366 files
Processed 110 of 1366 files
Processed 120 of 1366 files
Processed 130 of 1366 files
Processed 140 of 1366 files
Processed 150 of 1366 files
Processed 160 of 1366 files
Processed 170 of 1366 files
Processed 180 of 1366 files
Processed 190 of 1366 files
Processed 200 of 1366 files
Processed 210 of 1366 files
Processed 220 of 1366 files
Processed 230 of 1366 files
Processed 240 of 1366 files
Processed 250 of 1366 files
Processed 260 of 1366 files
Processed 270 of 1366 files
Processed 280 of 1366 files
Processed 290 of 1366 files
Processed 300 of 1366 files
Processed 310 of 1366 files
Processed 320 of 1366 files
Processed 330 of 1366 files
Processed 340 of 1366 files
Processed 350 of 1366 files
Processed 360 of 1366 files
P

In [6]:
# Derive the originating repository of each result from its file path and
# store it in results_df["repo"].
#
# Backslashes are converted to "/" BEFORE the surrounding slashes are stripped;
# doing it the other way round leaves a leading "/" on paths that start with a
# backslash, which would make the first segment an empty string.
normalized = (
    results_df["filename"]
    .astype("string")
    .str.replace("\\", "/", regex=False)
    .str.strip("/")
)
segments = normalized.str.split("/")

# default: first path segment
repo = segments.str[0]

# if it starts with cryptol_slices/, take second segment
# (na=False keeps the mask strictly boolean even if a filename is missing)
in_slices = normalized.str.startswith("cryptol_slices/", na=False)
repo.loc[in_slices] = segments.loc[in_slices].str[1]

results_df["repo"] = repo

results_df.head()

Unnamed: 0,filename,load_ok,file,error,file_deps,repo
0,AES-GCM-SIV-proof/proof/cryptol-specs/AES.cry,True,files/AES-GCM-SIV-proof/proof/cryptol-specs/AE...,,"[Common::AES, Cryptol]",AES-GCM-SIV-proof
1,AES-GCM-SIV-proof/proof/cryptol-specs/AES128.cry,True,files/AES-GCM-SIV-proof/proof/cryptol-specs/AE...,,"[AES, Common::AES, Cryptol]",AES-GCM-SIV-proof
2,AES-GCM-SIV-proof/proof/cryptol-specs/GCM_SIV_...,True,files/AES-GCM-SIV-proof/proof/cryptol-specs/GC...,,"[Common::AES_GCM_SIV, Cryptol, GCM_SIV_128__wh...",AES-GCM-SIV-proof
3,AES-GCM-SIV-proof/proof/cryptol-specs/intrinsi...,False,files/AES-GCM-SIV-proof/proof/cryptol-specs/in...,Parse error at files/AES-GCM-SIV-proof/proof/c...,,AES-GCM-SIV-proof
4,AES-GCM-SIV-proof/proof/cryptol-specs/AES256.cry,True,files/AES-GCM-SIV-proof/proof/cryptol-specs/AE...,,"[AES, Common::AES, Cryptol]",AES-GCM-SIV-proof


In [7]:
# Attach each result's source file type, then report how many files loaded
# successfully per repository.
#
# The frame is built from results_df rather than rebuilt from `rows`, so the
# repo labels are the ones already derived on results_df (including the
# cryptol_slices/ rule). Re-deriving them here with a plain first-segment split
# would label every such file "cryptol_slices" and disagree with that column.
assert "repo" in results_df.columns, "run the repo-derivation cell first"

repl_results_df = results_df.join(
    df[["filename", "filetype"]].set_index("filename"),
    on="filename",
    how="left",
    rsuffix="_orig",
)

passed = repl_results_df[repl_results_df["load_ok"] == True]
for repo in passed["repo"].unique():
    print(f"Repo: {repo}, Files: {len(passed[passed['repo'] == repo])}")

Repo: AES-GCM-SIV-proof, Files: 13
Repo: aws-lc-verification, Files: 217
Repo: BLST-Verification, Files: 21
Repo: ckzg-eip-4844-verification, Files: 4
Repo: cryptol, Files: 415
Repo: cryptol-specs, Files: 255
Repo: formal-verso, Files: 8
Repo: saw-script, Files: 83


In [8]:
# Report how many files failed to load per repository, in order of first
# appearance of each repository among the failures.
failed = repl_results_df.loc[repl_results_df["load_ok"] == False]
failed_repos = failed["repo"]
for repo in failed_repos.unique():
    n_failed = len(failed[failed_repos == repo])
    print(f"Repo: {repo}, Files: {n_failed}")

Repo: AES-GCM-SIV-proof, Files: 2
Repo: aws-lc-verification, Files: 127
Repo: BLST-Verification, Files: 40
Repo: ckzg-eip-4844-verification, Files: 16
Repo: cryptol, Files: 127
Repo: formal-verso, Files: 3
Repo: saw-script, Files: 35


In [9]:
# Collect the filename of every failed file (coerced to str) and join them
# into a single newline-separated string.
lines = [str(name) for name in failed["filename"]]
filenames_str = "\n".join(lines)

# Save the list as UTF-8 text in the current working directory.
Path("syntax_verification_failures.txt").write_text(filenames_str, encoding="utf-8")


In [10]:
# Keep only the raw source rows whose filename is among the files that loaded
# successfully, then summarise the resulting frame.
loaded_ok = df["filename"].isin(passed["filename"])
pass_df = df[loaded_ok]
pass_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1016 entries, 0 to 1365
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  1016 non-null   object
 1   filetype  1016 non-null   object
 2   content   1016 non-null   object
dtypes: object(3)
memory usage: 31.8+ KB


In [11]:
# Persist the successfully loaded sources as the verified dataset for this
# VERSION, one JSON record per line.
verified_path = f"data/all_sources_verified_{VERSION}.jsonl"
pass_df.to_json(verified_path, orient="records", lines=True)

In [12]:
# Count Cryptol (.cry) results by load outcome: loaded first, then failed.
is_cry = repl_results_df["filetype"] == "cry"
for outcome in (True, False):
    has_outcome = repl_results_df["load_ok"] == outcome
    print(len(repl_results_df[has_outcome & is_cry]))

1016
350


In [13]:
# Count all results by load outcome, regardless of file type: loaded first,
# then failed.
load_ok = repl_results_df["load_ok"]
n_loaded = len(repl_results_df[load_ok == True])
n_failed = len(repl_results_df[load_ok == False])
print(n_loaded)
print(n_failed)

1016
350
