# Create Meta Data for Cryptol training examples

## McCabe Cyclomatic Complexity
The JSON files used to compute the MCC have the following format:

```json
{
    "definitions": [
        {
            "kind": "",
            "locals": [],
            "mcc": {} | null,
            "name": "",
            "params": [],
            "references": [],
            "signature": null
        }
    ],
    "imports": [],
    "filename": ""
}
```
This notebook generates the JSON above for each training example, then builds a DataFrame that stores the filename, JSON path, and MCC summary statistics for each example.

In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Parse the notebook settings; the relative path is resolved against the
# current working directory, so this must run before the chdir below.
with Path("config.yaml").open("r") as f:
    config = yaml.safe_load(f)

dotenv.load_dotenv()

# Move to the configured project directory; the relative data path and the
# `src.*` import used further down are written relative to it.
project_root = Path(config["pythonpath"]).expanduser()
os.chdir(project_root)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [3]:
import subprocess
import pandas as pd

# Directory that `cabal run` is executed from (passed as the subprocess cwd).
CABAL_PROJECT_ROOT = "toy-cryptol-ast"

# Values read from the loaded config mapping.
VERSION = config["version"]
REPO_ROOT_DIR = Path(config["repo_root"]).expanduser()
MCC_ROOT = config["mcc_root"]

# The generated JSON files are written below this directory; create it up front.
MCC_ROOT_DIR = Path(MCC_ROOT).expanduser()
MCC_ROOT_DIR.mkdir(parents=True, exist_ok=True)

# The dataset is JSON Lines: one record per line.
clean_data_df = pd.read_json("data/clean_datasets/verified_nocomments_v1-0.jsonl", lines=True)

clean_data_df.head()

Unnamed: 0,filename,filetype,content,variant
0,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([...",without_comments
1,cryptol/examples/AE.cry,cry,module AE where\n\nparameter\n type A : * ...,without_comments
2,cryptol/examples/Cipher.cry,cry,module Cipher where\n\ntype Cipher KeySize Blo...,without_comments
3,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...,without_comments
4,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...,without_comments


In [4]:
def run_haskell_mcc_generator(cry_path: Path, preview_chars=500) -> subprocess.CompletedProcess:
    """
    Run the `cryptol-meta` cabal executable on a single Cryptol file.

    Call: cabal run cryptol-meta -- <file>
    from within CABAL_PROJECT_ROOT.

    Parameters
    ----------
    cry_path : Path
        Cryptol source file, passed to the executable as its only argument.
    preview_chars : int or None, default 500
        Maximum number of STDOUT characters echoed after a successful run, so
        a large JSON document does not flood the notebook output. Pass None to
        echo STDOUT in full. Failed runs always echo STDOUT and STDERR in full.

    Returns
    -------
    subprocess.CompletedProcess
        The finished process with text `stdout`/`stderr`. The captured output
        is never truncated, only the printed preview is. A non-zero exit code
        does not raise; callers must check `returncode` themselves.
    """

    cmd = [
        "cabal",
        "run",
        "cryptol-meta",
        "--",
        str(cry_path),
    ]

    print("Running:", " ".join(cmd))
    result = subprocess.run(
        cmd,
        cwd=str(CABAL_PROJECT_ROOT),  # run inside the cabal project
        capture_output=True,
        text=True,
    )

    if result.returncode != 0:
        # Keep the full diagnostics on failure: they are what the reader needs.
        print(f"❌ Error ({result.returncode}) on {cry_path}")
        if result.stdout.strip():
            print("STDOUT:\n", result.stdout)
        if result.stderr.strip():
            print("STDERR:\n", result.stderr)
    else:
        print(f"✅ OK: {cry_path}")
        if result.stdout.strip():
            shown = result.stdout
            if preview_chars is not None and len(shown) > preview_chars:
                hidden = len(shown) - preview_chars
                shown = f"{shown[:preview_chars]}... [{hidden} more characters]"
            print("STDOUT:\n", shown)
    return result


In [5]:
from src.preprocessing.mcc_tools import summarize_file_obj

# Only the last N_EXAMPLES rows are processed. The slice gets its own name so
# clean_data_df keeps the full dataset shown earlier in the notebook.
N_EXAMPLES = 10
examples_df = clean_data_df.iloc[-N_EXAMPLES:]

meta_rows = []
failed_files = []  # filenames for which no usable JSON was produced

for _, row in examples_df.iterrows():
    cry_path = Path(row["filename"])
    # Mirror the source layout under MCC_ROOT_DIR, with a .json extension.
    json_path = cry_path.parent / f"{cry_path.stem}.json"
    output_file = MCC_ROOT_DIR / json_path

    res = run_haskell_mcc_generator(REPO_ROOT_DIR / cry_path)
    if res.returncode != 0:
        # The helper already printed the diagnostics; stdout cannot be trusted
        # to hold JSON here, so skip this file instead of aborting the batch.
        failed_files.append(row["filename"])
        continue
    try:
        mcc = json.loads(res.stdout)
    except json.JSONDecodeError as err:
        print(f"❌ Could not parse generator output for {cry_path}: {err}")
        failed_files.append(row["filename"])
        continue

    # Create the target directory only once there is something to write.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    output_file.write_text(
        json.dumps(mcc, indent=2),
        encoding="utf-8"
        )
    summary = summarize_file_obj(mcc)
    meta_rows.append(
        {
            "filename": row["filename"],
            # Stored relative to MCC_ROOT_DIR, not as an absolute path.
            "json_path": str(json_path),
            **summary,
        }
    )

meta_df = pd.DataFrame(meta_rows)

if failed_files:
    print(f"Skipped {len(failed_files)} file(s) without usable MCC JSON: {failed_files}")

Running: cabal run cryptol-meta -- /Users/josh/Automated_Reasoning_for_Cryptography/cryptol_slices/cryptol/lib/PrimeEC/006_ec_affinify.cry
✅ OK: /Users/josh/Automated_Reasoning_for_Cryptography/cryptol_slices/cryptol/lib/PrimeEC/006_ec_affinify.cry
STDOUT:
 {"definitions":[{"kind":"type","locals":[],"mcc":null,"name":"AffinePoint","params":[],"references":[],"signature":null},{"kind":"type","locals":[],"mcc":null,"name":"ProjectivePoint","params":[],"references":[],"signature":null},{"kind":"declaration","locals":["R","lambda"],"mcc":{"edges":[{"from":1,"kind":"decl","label":"declares","to":2},{"from":1,"kind":"decl","label":"declares","to":3},{"from":4,"kind":"control","label":null,"to":5},{"from":5,"kind":"control","label":null,"to":6},{"from":6,"kind":"control","label":null,"to":7},{"from":8,"kind":"control","label":null,"to":9},{"from":9,"kind":"control","label":null,"to":10},{"from":7,"kind":"branch","label":"then","to":8},{"from":7,"kind":"branch","label":"else","to":11},{"from":

In [6]:
# Preview the first rows of the per-file summary table built in the previous cell.
meta_df.head()

Unnamed: 0,filename,json_path,imports,imports_count,num_definitions,num_declarations,num_types,num_declarations_with_mcc,total_mcc,max_mcc,avg_mcc
0,cryptol_slices/cryptol/lib/PrimeEC/006_ec_affi...,cryptol_slices/cryptol/lib/PrimeEC/006_ec_affi...,[],0,5,3,2,3,4,2,1.333333
1,cryptol_slices/cryptol/lib/PrimeEC/005_ec_proj...,cryptol_slices/cryptol/lib/PrimeEC/005_ec_proj...,[],0,3,1,2,1,1,1,1.0
2,cryptol_slices/cryptol/lib/PrimeEC/001_ec_is_p...,cryptol_slices/cryptol/lib/PrimeEC/001_ec_is_p...,[],0,2,1,1,1,1,1,1.0
3,cryptol_slices/cryptol/lib/PrimeEC/004_ec_equa...,cryptol_slices/cryptol/lib/PrimeEC/004_ec_equa...,[],0,6,4,2,4,5,2,1.25
4,cryptol_slices/cryptol/lib/SuiteB/030_sha2bloc...,cryptol_slices/cryptol/lib/SuiteB/030_sha2bloc...,[],0,5,2,3,2,3,2,1.5
