In [None]:
from pathlib import Path 
import os, dotenv, yaml

# Load the notebook configuration.
# The file is opened in binary mode so PyYAML detects the encoding itself
# (UTF-8 by default, UTF-16 via BOM) instead of depending on the platform's
# locale encoding, which is not UTF-8 everywhere.
with open("config.yaml", "rb") as f:
    config = yaml.safe_load(f)

# Load variables from a .env file (if one is found) into the process environment.
dotenv.load_dotenv()

# Switch the working directory to the path configured under "pythonpath"
# (with "~" expanded). Later cells use relative paths such as "data/..." and
# "cache/...", which resolve against this directory.
# NOTE: after this call, re-running this cell looks for "config.yaml" in the
# new working directory, not in the directory the notebook was started from.
os.chdir(Path(config["pythonpath"]).expanduser())

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.comment_process import normalize_blanklines, strip_cryptol_comments_all

In [None]:
# Version tag from the loaded config; it selects which dataset files are used.
VERSION = config["version"]

JSONL_PATH_ORIGINAL = Path(f"data/all_sources_verified_{VERSION}.jsonl")
JSONL_PATH_SLICES = Path(f"data/some_slices_verified_{VERSION}.jsonl")

# The main dataset is always loaded; the slices dataset is optional.
original_df = pd.read_json(JSONL_PATH_ORIGINAL, lines=True)
dfs = [original_df]

if JSONL_PATH_SLICES.exists():
    slices_df = pd.read_json(JSONL_PATH_SLICES, lines=True)

    # Put every slice filename under "cryptol_slices/": names that already
    # start with that prefix are kept as-is, all others get the prefix added
    # after their leading slashes are dropped.
    prefix = "cryptol_slices"
    slice_names = slices_df["filename"].astype(str)
    has_prefix = slice_names.str.startswith(prefix + "/")
    prefixed_names = prefix + "/" + slice_names.str.lstrip("/")
    slices_df["filename"] = slice_names.where(has_prefix, prefixed_names)

    # Only these three columns of the slices frame are carried forward.
    dfs.append(slices_df[["filename", "filetype", "content"]].copy())

# Single frame with a fresh 0..n-1 index holding all unmodified sources.
nomod_df = pd.concat(dfs, ignore_index=True)

In [None]:
# Preview the first rows of the combined, unmodified dataset.
nomod_df.head()

In [None]:
# Build the comment-free variant from the unmodified dataset. Each file's
# content goes through strip_cryptol_comments_all and then normalize_blanklines
# (presumably comment removal followed by blank-line cleanup; see
# src.preprocessing.comment_process to confirm). nomod_df itself is untouched.
content_without_comments = nomod_df["content"].apply(strip_cryptol_comments_all)
nocomment_df = nomod_df.copy()
nocomment_df["content"] = content_without_comments.apply(normalize_blanklines)

In [None]:
# Inspect the first 20 rows of the comment-stripped dataset.
nocomment_df.head(20)

In [None]:
# Inspect the last 20 rows of the comment-stripped dataset.
nocomment_df.tail(20)

In [None]:
# Tag every row of the comment-stripped dataset with its variant label.
# This modifies nocomment_df in place; re-running the cell gives the same result.
nocomment_df["variant"] = "without_comments"
nocomment_df.head()

In [None]:

# Output directory for the generated dataset variants; created if missing.
OUTPUT_DIR = Path("data/good_syntax")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Write the comment-stripped dataset as JSON Lines (one record per line).
# force_ascii=False keeps non-ASCII characters as-is instead of \u-escaping them.
nocomment_path = OUTPUT_DIR / f"verified_nocomments_{VERSION}.jsonl"
nocomment_df.to_json(nocomment_path, orient="records", lines=True, force_ascii=False)


In [None]:
from src.preprocessing.comment_extractor import extract_strip_cry_comments
# Create Hybrid Dataset
# Every file of the unmodified dataset is passed to extract_strip_cry_comments,
# which returns a pair: a collection of comment records and a file record
# exposing "filename" and "content". The call takes an LLM model name and a
# decision-cache path (presumably it caches per-comment keep/strip decisions;
# see src.preprocessing.comment_extractor to confirm).
DECISION_CACHE_PATH = "cache/GPT_comment_decisions_cache.jsonl"
out_path = Path(DECISION_CACHE_PATH)
out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing

comment_rows, dataset_rows = [], []
for row in nomod_df.itertuples(index=False):
    comments, file_record_ = extract_strip_cry_comments(
        filename=row.filename,
        content=row.content,
        llm_model_name="gpt-oss:20b",
        decision_cache_path=DECISION_CACHE_PATH,
    )
    comment_rows += comments
    # filename and content come from the extractor's file record;
    # filetype is carried over from the input row.
    dataset_rows.append(
        {
            "filename": file_record_["filename"],
            "filetype": row.filetype,
            "content": file_record_["content"],
            "variant": "hybrid",
        }
    )

# Build both frames once from the accumulated records.
comment_df = pd.DataFrame(comment_rows)
hybrid_df = pd.DataFrame(dataset_rows)



In [None]:
# Preview the comment records collected while building the hybrid dataset.
comment_df.head()

In [None]:
# Preview the first rows of the hybrid dataset.
hybrid_df.head()

In [None]:
# Output names for the hybrid dataset and the per-comment records.
HYBRID_NAME = f"verified_hybrid_{VERSION}.jsonl"
COMMENT_STAT_PATH = f"data/GPTcomment_stats_{VERSION}.jsonl"

# Write both frames as JSON Lines (one record per line).
# force_ascii=False matches how the no-comments dataset is written: non-ASCII
# characters are stored as-is rather than as \uXXXX escapes, so all dataset
# files use the same encoding style. The decoded JSON content is unchanged.
hybrid_df.to_json(
    OUTPUT_DIR / HYBRID_NAME,
    orient="records",
    lines=True,
    force_ascii=False,
)
comment_df.to_json(
    COMMENT_STAT_PATH,
    orient="records",
    lines=True,
    force_ascii=False,
)

