In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Load the notebook settings. The file is opened in binary mode so that PyYAML
# decodes it itself (UTF-8 unless a BOM says otherwise) instead of depending on
# the platform's default text encoding.
# Note: "config.yaml" is resolved against the current working directory, which
# the chdir below changes -- re-running this cell afterwards looks for the file
# in the new directory.
with open("config.yaml", "rb") as f:
    config = yaml.safe_load(f)

# Load variables from a .env file into the environment (python-dotenv).
# Called before the chdir below.
dotenv.load_dotenv()

# Later cells use relative "data/..." paths; they resolve against this directory.
os.chdir(Path(config["pythonpath"]).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [3]:
# Version tag from the loaded config; it selects which raw dump is read below.
VERSION = config["version"]

# Input for this notebook: the raw sources for this version, one JSON record
# per line.
jsonl_path = f"data/all_sources_raw_{VERSION}.jsonl"

df = pd.read_json(path_or_buf=jsonl_path, lines=True)
# For a quick trial run, slice the frame here, e.g. df = df.iloc[:300]

In [4]:
# Preview the first five records (cell output).
df.head(n=5)

Unnamed: 0,filename,filetype,content
0,cryptol/examples/AES.cry,cry,// Cryptol AES Implementation\n// Copyright (c...
1,cryptol/examples/width.cry,cry,x : [8]\nx = length (252 : [8])\n
2,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([..."
3,cryptol/examples/AE.cry,cry,// WORK IN PROGRESS\n\n/*\nImplementation of t...
4,cryptol/examples/Cipher.cry,cry,"/*\n * Copyright (c) 2013-2016 Galois, Inc.\n ..."


In [5]:
from src.preprocessing.interpreter_process import verify_df_row_with_cryptol
from src.preprocessing.saw_subprocess import run_saw_script, load_saw_results

# Not appended to in this cell; it belonged to a SAW branch that is disabled.
saw_results = []

# Directory handed to the verifier as `host_mount_dir`; make sure it exists.
MOUNT_DIR = config["mount_dir"]
dir_path = Path(MOUNT_DIR).expanduser()
dir_path.mkdir(parents=True, exist_ok=True)

SERVER_URL = config["cryptol_server_url"]

# Only the parent directory of this path is used here (created if missing).
# NOTE(review): the results themselves are written to a different, unversioned
# file at the end of this cell -- confirm which path is intended.
out_path = Path(f"data/training_datasets/syntax_check/cry_syntax_verification_results_{VERSION}.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)

# One result record per .cry file: the filename plus the verifier's "load_info".
rows = []
total_files = len(df)
for idx in range(total_files):
    # Progress line every ten rows (skipping row 0).
    if idx > 0 and idx % 10 == 0:
        print(f"Processed {idx} of {total_files} files")

    record = df.iloc[idx]
    if record["filetype"] != 'cry':
        # Rows of any other file type are skipped; the SAW handling that once
        # covered them is disabled.
        continue

    repl_result = verify_df_row_with_cryptol(
        df, idx, host_mount_dir=MOUNT_DIR, server_url=SERVER_URL)
    rows.append({"filename": record["filename"], **repl_result["load_info"]})

# Persist the collected records as JSON Lines.
results_df = pd.DataFrame(rows)
results_df.to_json(
    "data/training_datasets/syntax_check/cry_saw_syntax_verification_results.jsonl",
    lines=True,
    orient="records",
)

Processed 10 of 797 files
Processed 20 of 797 files
Processed 30 of 797 files
Processed 40 of 797 files
Processed 50 of 797 files
Processed 60 of 797 files
Processed 70 of 797 files
Processed 80 of 797 files
Processed 90 of 797 files
Processed 100 of 797 files
Processed 110 of 797 files
Processed 120 of 797 files
Processed 130 of 797 files
Processed 140 of 797 files
Processed 150 of 797 files
Processed 160 of 797 files
Processed 170 of 797 files
Processed 180 of 797 files
Processed 190 of 797 files
Processed 200 of 797 files
Processed 210 of 797 files
Processed 220 of 797 files
Processed 230 of 797 files
Processed 240 of 797 files
Processed 250 of 797 files
Processed 260 of 797 files
Processed 270 of 797 files
Processed 280 of 797 files
Processed 290 of 797 files
Processed 300 of 797 files
Processed 310 of 797 files
Processed 320 of 797 files
Processed 330 of 797 files
Processed 340 of 797 files
Processed 350 of 797 files
Processed 360 of 797 files
Processed 370 of 797 files
Processed 

In [6]:
# Normalised paths: string dtype, outer "/" stripped, backslashes turned into
# forward slashes.
s = (
    results_df["filename"]
    .astype("string")
    .str.strip("/")
    .str.replace("\\", "/", regex=False)
)

# Split once; only the first two segments are ever needed.
parts = s.str.split("/", n=2)

# Default rule: the repo is the first path segment.
repo = parts.str[0]

# Exception: under "cryptol_slices/" the repo is the second segment.
mask = s.str.startswith("cryptol_slices/")
repo.loc[mask] = parts.loc[mask].str[1]

results_df["repo"] = repo

# Preview (cell output).
results_df.head()

Unnamed: 0,filename,load_ok,file,error,file_deps,repo
0,cryptol/examples/AES.cry,True,files/cryptol/examples/AES_nduy6_h1.cry,,[Cryptol],cryptol
1,cryptol/examples/width.cry,True,files/cryptol/examples/width_4sezczew.cry,,[Cryptol],cryptol
2,cryptol/examples/splitAt.cry,True,files/cryptol/examples/splitAt_lvtxbsed.cry,,[Cryptol],cryptol
3,cryptol/examples/AE.cry,True,files/cryptol/examples/AE_9ghr2v_2.cry,,"[AE__parameter, Cryptol]",cryptol
4,cryptol/examples/Cipher.cry,True,files/cryptol/examples/Cipher_34957ly2.cry,,[Cryptol],cryptol


In [7]:
# Rebuild the result table from the collected rows and attach each file's
# type from the source frame (left join on filename).
filetype_lookup = df[["filename", "filetype"]].set_index('filename')
repl_results_df = pd.DataFrame(rows).join(
    filetype_lookup, on='filename', how='left', rsuffix='_orig'
)

# Repo here is simply the first path segment of the filename.
repl_results_df["repo"] = repl_results_df["filename"].map(
    lambda path: path.split('/')[0]
)

# Files that loaded successfully, counted per repo.
loaded_ok = repl_results_df["load_ok"] == True
passed = repl_results_df[loaded_ok]
for repo in passed['repo'].unique():
    n_files = len(passed[passed['repo'] == repo])
    print(f"Repo: {repo}, Files: {n_files}")

Repo: cryptol, Files: 415
Repo: cryptol-specs, Files: 255


In [8]:
# Files that did not load, counted per repo.
load_failed = repl_results_df["load_ok"] == False
failed = repl_results_df[load_failed]
for repo in failed['repo'].unique():
    n_files = len(failed[failed['repo'] == repo])
    print(f"Repo: {repo}, Files: {n_files}")

Repo: cryptol, Files: 127


In [9]:
# Names of all files that failed to load, coerced to str.
lines = [str(name) for name in failed["filename"]]

# One filename per line.
filenames_str = "\n".join(lines)

# Save the list to a text file in the current working directory.
with open("syntax_verification_failures.txt", "w", encoding="utf-8") as fh:
    fh.write(filenames_str)


In [10]:
# Keep only the source records whose filename is among the files that loaded.
is_verified = df['filename'].isin(passed['filename'])
pass_df = df[is_verified]

# Summary of the filtered frame (printed by info()).
pass_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 670 entries, 0 to 796
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  670 non-null    object
 1   filetype  670 non-null    object
 2   content   670 non-null    object
dtypes: object(3)
memory usage: 20.9+ KB


In [11]:
# Write the verified subset as JSON Lines, tagged with the dataset version.
verified_path = f"data/all_sources_verified_{VERSION}.jsonl"
pass_df.to_json(verified_path, lines=True, orient="records")

In [12]:
# Counts restricted to .cry files: loaded first, then failed.
is_cry = repl_results_df["filetype"] == "cry"
cry_loaded = repl_results_df[(repl_results_df["load_ok"] == True) & is_cry]
cry_failed = repl_results_df[(repl_results_df["load_ok"] == False) & is_cry]
print(len(cry_loaded))
print(len(cry_failed))

670
127


In [13]:
# Overall counts across all result rows: loaded first, then failed.
all_loaded = repl_results_df[repl_results_df["load_ok"] == True]
all_failed = repl_results_df[repl_results_df["load_ok"] == False]
print(len(all_loaded))
print(len(all_failed))

670
127
