In [1]:
from pathlib import Path 
import os, dotenv, yaml

# Read the notebook configuration. The path is relative to the directory the
# kernel is currently in, so this has to happen before the os.chdir() below.
# The encoding is stated explicitly (as for the JSONL read later in the
# notebook) instead of depending on the platform's default locale encoding.
with open("config.yaml", "r", encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

# Load variables from a .env file, if one is found, into the process environment.
dotenv.load_dotenv()

# Switch the working directory to the configured location; later cells use
# paths relative to it (e.g. "data/...").
os.chdir(Path(config["pythonpath"]).expanduser())

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
from src.preprocessing.quality_process import compute_file_metrics

In [3]:
import json
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer

VERSION = config["version"]
VARIATION = "nocomments"

# JSONL input: one JSON object per line. The loop below reads the "filename"
# and "content" fields of each record.
DATA = f"data/good_syntax/verified_{VARIATION}_{VERSION}.jsonl"

MODEL_ID = config["model_id"]

codellama_tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    use_fast=True,
    trust_remote_code=False,
)

# Fall back to the EOS token when the tokenizer ships without a pad token.
if codellama_tokenizer.pad_token is None:
    codellama_tokenizer.pad_token = codellama_tokenizer.eos_token

# Score every record. Blank lines (e.g. an extra newline at the end of the
# file) are skipped rather than handed to json.loads, which would raise on them.
results = []
with open(DATA, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Scoring rows"):
        if not line.strip():
            continue
        row = json.loads(line)
        results.append(
            compute_file_metrics(
                row["filename"],
                row["content"],
                model_tokenizer=codellama_tokenizer,
            )
        )

# One entry per scored record.
df = pd.DataFrame(results)


  from .autonotebook import tqdm as notebook_tqdm
Scoring rows: 0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2969709 > 32768). Running this sequence through the model will result in indexing errors
Scoring rows: 15575it [02:22, 109.15it/s] 


In [4]:
# Preview the first rows of the per-file metrics table.
df.head()

Unnamed: 0,filename,sha1,bytes,lines,avg_line_len,max_line_len,non_ascii_ratio,binary_like,enc_total_matched,enc_max_run,enc_fraction,enc_hits_base64,enc_hits_hexbytes,enc_hits_unicode,num_tokens_lang,k_shingle,num_shingles,hexnum_ratio,num_tokens_model,junk_path
0,cryptol/examples/AES.cry,7377951555236bde3d2efa553b85d881a12edbe0,7555,210,34.98,76,0.0,False,0,0,0.0,0,0,0,1969,5,1965,0.003047,3705,False
1,cryptol/examples/width.cry,6fcd12515c7bdf141d925f4c06dfa311b96c353d,31,3,9.67,22,0.0,False,0,0,0.0,0,0,0,15,5,11,0.0,16,False
2,cryptol/examples/splitAt.cry,ceff327d84ba2bf46417e0a18cccc22b0ab11452,107,9,11.0,33,0.0,False,0,0,0.0,0,0,0,68,5,64,0.0,69,False
3,cryptol/examples/AE.cry,a9d7ec433d684f239b3565d4d97aedaea4615add,1442,52,26.75,108,0.0,False,0,0,0.0,0,0,0,446,5,442,0.0,490,False
4,cryptol/examples/Cipher.cry,2e04dc8c6f7b59d8929247ffc9b72ee3982a68fd,165,7,22.71,53,0.0,False,0,0,0.0,0,0,0,37,5,33,0.0,45,False


In [5]:
# Column dtypes and non-null counts of the metrics table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15575 entries, 0 to 15574
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   filename           15575 non-null  object 
 1   sha1               15575 non-null  object 
 2   bytes              15575 non-null  int64  
 3   lines              15575 non-null  int64  
 4   avg_line_len       15575 non-null  float64
 5   max_line_len       15575 non-null  int64  
 6   non_ascii_ratio    15575 non-null  float64
 7   binary_like        15575 non-null  bool   
 8   enc_total_matched  15575 non-null  int64  
 9   enc_max_run        15575 non-null  int64  
 10  enc_fraction       15575 non-null  float64
 11  enc_hits_base64    15575 non-null  int64  
 12  enc_hits_hexbytes  15575 non-null  int64  
 13  enc_hits_unicode   15575 non-null  int64  
 14  num_tokens_lang    15575 non-null  int64  
 15  k_shingle          15575 non-null  int64  
 16  num_shingles       155

In [6]:
# StarCoder-like thresholds (tune if needed)
MAX_BYTES         = 200_000
MAX_NONASCII      = 0.20
ENC_MAX_RUN_CHARS = 1024
ENC_MAX_FRACTION  = 0.50
MAX_LINES_TOTAL   = 100_000
MAX_LINE_AVG_LEN  = 100
MAX_LINE_MAX_LEN  = 1_000
MIN_TOKENS_LANG   = 40      # language-token gate (Cryptol tokenizer); used when num_tokens_model is absent
MAX_TOKENS_LANG   = 10_000  # optional upper bound; used when num_tokens_model is absent
MIN_TOKENS_MODEL  = 40      # only if you've populated num_tokens_model
MAX_TOKENS_MODEL  = 1600
MAX_HEXNUM_RATIO  = 0.20


# Every mask below is True for rows that should be DROPPED.

# --- exact dedup (keep first occurrence of each sha1) ---
dup_mask = df.duplicated(subset=["sha1"], keep="first")

# --- encoded data (StarCoder) ---
enc_mask = (df["enc_max_run"] > ENC_MAX_RUN_CHARS) | (df["enc_fraction"] > ENC_MAX_FRACTION)

# --- long-line filters (StarCoder) ---
longline_mask = (
    (df["lines"] > MAX_LINES_TOTAL) |
    (df["avg_line_len"] > MAX_LINE_AVG_LEN) |
    (df["max_line_len"] > MAX_LINE_MAX_LEN)
)

# --- binary-like content ---
binary_mask = df["binary_like"].fillna(False)

# --- non-ascii density ---
nonascii_mask = df["non_ascii_ratio"].fillna(0) > MAX_NONASCII

# --- size guardrail (bytes) ---
bytes_mask = df["bytes"].fillna(0) > MAX_BYTES

# --- shingles exist (needed for Jaccard) ---
no_shingles_mask = df["num_shingles"].fillna(0) <= 0

# --- numeric/hex blob concentration ---
hexnum_mask = df["hexnum_ratio"].fillna(0) > MAX_HEXNUM_RATIO

# --- token-count bounds ---
# NOTE(review): despite their names, lang_small_mask / lang_large_mask are
# computed from the MODEL tokenizer's counts whenever num_tokens_model exists
# (MAX_TOKENS_MODEL is applied nowhere else). That behaviour is kept as-is;
# confirm it is intended, and consider renaming the masks and their reasons.
# The column check now guards every access to num_tokens_model; previously the
# column was indexed unconditionally before the check, so the fallback branch
# could never run.
if "num_tokens_model" in df.columns:
    model_tokens = df["num_tokens_model"]
    # Missing counts are treated as 0 here, so such rows fail the lower bound.
    lang_small_mask = model_tokens.fillna(0) < MIN_TOKENS_MODEL
    lang_large_mask = model_tokens.fillna(0) > MAX_TOKENS_MODEL
    # Missing counts are treated as +inf here, so they never trip this gate.
    model_small_mask = model_tokens.fillna(np.inf) < MIN_TOKENS_MODEL
else:
    # No model-token counts: bound the language-token counts instead and
    # disable the model-token gate.
    lang_tokens = df["num_tokens_lang"].fillna(0)
    lang_small_mask = lang_tokens < MIN_TOKENS_LANG
    lang_large_mask = lang_tokens > MAX_TOKENS_LANG
    model_small_mask = pd.Series(False, index=df.index)


In [7]:
# Hard-drop rules in priority order. Each entry pairs a boolean mask (True means
# "drop this row") with the label reported for it. When a row trips several
# rules, the one listed first becomes its fail reason. Both drop_mask and the
# labels are derived from this one table so they cannot drift apart.
DROP_RULES = [
    (dup_mask,         "exact_duplicate"),
    (enc_mask,         "encoded_data"),
    (longline_mask,    "long_lines"),
    (binary_mask,      "binary_like"),
    (nonascii_mask,    "too_much_nonascii"),
    (bytes_mask,       "too_large_bytes"),
    (lang_small_mask,  "too_few_lang_tokens"),
    (lang_large_mask,  "too_many_lang_tokens"),
    (no_shingles_mask, "no_shingles"),
    (hexnum_mask,      "hexnum_blob"),
    (model_small_mask, "too_few_model_tokens"),
]

# A row is dropped as soon as any rule fires.
drop_mask = pd.Series(False, index=df.index)
for rule_mask, _ in DROP_RULES:
    drop_mask = drop_mask | rule_mask


def first_reason(i):
    """Return the label of the first rule that fires for the row at position ``i``.

    Rules are checked in DROP_RULES order; "ok" is returned when none fires.
    Kept for ad-hoc lookups; the column below is filled without calling it.
    """
    for rule_mask, label in DROP_RULES:
        if rule_mask.iat[i]:
            return label
    return "ok"


# Label all rows at once instead of calling first_reason() per row: start from
# "ok" and apply the rules from lowest to highest priority, so that the
# highest-priority rule that fires is the one left standing.
fail_reasons = np.full(len(df), "ok", dtype=object)
for rule_mask, label in reversed(DROP_RULES):
    fail_reasons[rule_mask.to_numpy(dtype=bool)] = label

df = df.copy()
df["quality_ok"] = ~drop_mask
df["fail_reason"] = fail_reasons.tolist()


In [8]:
# Columns carried into the candidate table, grouped by what they describe.
identity_cols = ["filename", "sha1"]
size_cols = ["bytes", "lines", "avg_line_len", "max_line_len"]
encoding_cols = [
    "non_ascii_ratio", "binary_like",
    "enc_total_matched", "enc_max_run", "enc_fraction",
    "enc_hits_base64", "enc_hits_hexbytes", "enc_hits_unicode",
]
token_cols = ["num_tokens_lang", "k_shingle", "num_shingles", "hexnum_ratio"]
model_token_cols = ["num_tokens_model"]
status_cols = ["quality_ok", "fail_reason"]

dedup_cols = (
    identity_cols
    + size_cols
    + encoding_cols
    + token_cols
    + model_token_cols
    + status_cols
)

# Metrics rows that passed every quality rule.
passed_quality = df["quality_ok"]
candidate_df = df.loc[passed_quality, dedup_cols].reset_index(drop=True)

# Re-read the raw records and keep those whose filename is among the candidates.
similar_process_df = (
    pd.read_json(DATA, lines=True)
    .loc[lambda records: records["filename"].isin(candidate_df["filename"])]
    .reset_index(drop=True)
)

put_back_path = Path("data/dropped/files_to_put_back.csv")
# Disabled: the "put back" logic below is parked inside a bare string literal,
# so none of it executes.
'''
if put_back_path.exists():
    put_back_set = pd.read_csv(put_back_path)
    put_back_filenames = set(put_back_set["filename"].dropna().tolist())
else:
    put_back_filenames = set()

for fname in put_back_filenames:
    if fname in df['filename'].values:
        candidate_df = pd.concat([candidate_df, df[df['filename'] == fname][dedup_cols]], ignore_index=True)
        candidate_df.loc[candidate_df['filename'] == fname, 'quality_ok'] = True
        '''
similar_process_df.head()

Unnamed: 0,filename,filetype,content,variant
0,cryptol/examples/splitAt.cry,cry,"x = [1,2,3,4] : [_][8]\n\ny = (splitAt x) : ([...",without_comments
1,cryptol/examples/AE.cry,cry,module AE where\n\nparameter\n type A : * ...,without_comments
2,cryptol/examples/Cipher.cry,cry,module Cipher where\n\ntype Cipher KeySize Blo...,without_comments
3,cryptol/examples/xor_cipher.cry,cry,encrypt : {a}(fin a) => [8] -> [a][8] -> [a][8...,without_comments
4,cryptol/examples/zero_weird.cry,cry,x : {a}() => a -> [16]\nx v = zero v \n\nprope...,without_comments


In [9]:
# Headline counts for the quality filter, followed by a per-reason breakdown
# of the rows that were dropped.
kept_flags = df["quality_ok"]
rejected_flags = ~kept_flags

print("[summary] total:", len(df))
print("[summary] kept :", int(kept_flags.sum()))
print("[summary] dropped:", int(rejected_flags.sum()))
print("[summary] drop reasons:")
print(df.loc[rejected_flags, "fail_reason"].value_counts())


[summary] total: 15575
[summary] kept : 6627
[summary] dropped: 8948
[summary] drop reasons:
fail_reason
encoded_data            8200
too_few_lang_tokens      370
exact_duplicate          174
too_many_lang_tokens     171
hexnum_blob               27
no_shingles                3
long_lines                 2
too_much_nonascii          1
Name: count, dtype: int64


In [10]:
# Independent copy of every row that failed the quality filter, re-indexed from 0.
failed_quality = ~df["quality_ok"]
dropped = df[failed_quality].copy().reset_index(drop=True)
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15575 entries, 0 to 15574
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   filename           15575 non-null  object 
 1   sha1               15575 non-null  object 
 2   bytes              15575 non-null  int64  
 3   lines              15575 non-null  int64  
 4   avg_line_len       15575 non-null  float64
 5   max_line_len       15575 non-null  int64  
 6   non_ascii_ratio    15575 non-null  float64
 7   binary_like        15575 non-null  bool   
 8   enc_total_matched  15575 non-null  int64  
 9   enc_max_run        15575 non-null  int64  
 10  enc_fraction       15575 non-null  float64
 11  enc_hits_base64    15575 non-null  int64  
 12  enc_hits_hexbytes  15575 non-null  int64  
 13  enc_hits_unicode   15575 non-null  int64  
 14  num_tokens_lang    15575 non-null  int64  
 15  k_shingle          15575 non-null  int64  
 16  num_shingles       155

In [11]:
#review_data_set = df[~df['fail_reason'].isin(['ok', 'exact_duplicate'])].copy().reset_index(drop=True)
#out_path = Path(f"data/dropped/review_files_{VARIATION}_{VERSION}.csv")
#out_path.parent.mkdir(parents=True, exist_ok=True)  # create dirs if missing
#review_data_set.to_csv(out_path, index=False)


In [12]:
from src.preprocessing.similiar_process import run_from_dataframe

# Run the similarity step on similar_process_df (the raw records restricted to
# the quality-filtered candidates), telling it which columns hold the file name
# and the file text. Outputs are written under "minhash_outputs".
# NOTE(review): the filenames in this frame are relative paths
# (e.g. cryptol/examples/splitAt.cry), not absolute ones — confirm that is what
# run_from_dataframe expects.
df_files, df_pairs, similar_files = run_from_dataframe(
    similar_process_df,
    filename_col="filename",
    content_col="content",
    out_dir="minhash_outputs",
)


[info] ==== Starting MinHash/LSH over DataFrame ====
[info] params: K_SHINGLE=5, NUM_PERM=512, LSH_THRESHOLD=0.7
[info] loaded 6627 files from candidate_df
[info] files indexed   : 6627
[diag] total candidate pairs: 7430
[diag] pairs with jaccard >= 0.7: 462
[info] wrote CSV and Parquet to minhash_outputs/

[info] ==== MinHash/LSH run summary ====
[info] files loaded  : 6627
[info] files indexed : 6627
[info] files with 0 shingles (tokens < 5): 0
[info] candidate pairs (from LSH) : 7430
[info] pairs with Jaccard >= 0.60: 534
[info] pairs with Jaccard >= 0.70: 462
[info] pairs with Jaccard >= 0.80: 271
[info] pairs with Jaccard >= 0.85: 192
[info] pairs with Jaccard >= 0.90: 134
[info] avg Jaccard (candidates)  : 0.5121
[info] max Jaccard               : 1.0000
[info] min Jaccard               : 0.3478

[info] top pairs:
                                                                                          a                                                                             

In [13]:
# Preview the per-file table returned by run_from_dataframe.
df_files.head()

Unnamed: 0,filename,num_tokens,num_shingles,num_perm,k_shingle,minhash_hashvalues
0,cryptol-specs/Common/EC/ECInterface.cry,112,101,512,5,"[51678339, 183006, 41776937, 23555261, 5225536..."
1,cryptol-specs/Common/EC/PrimeField/Instantiati...,39,33,512,5,"[226173601, 154546418, 86400395, 31786271, 488..."
2,cryptol-specs/Common/EC/PrimeField/Instantiati...,39,33,512,5,"[66091275, 158043833, 51583753, 31786271, 2174..."
3,cryptol-specs/Common/EC/PrimeField/Tests/P192.cry,265,215,512,5,"[16039649, 5428130, 3072712, 29676009, 681719,..."
4,cryptol-specs/Common/EC/PrimeField/Tests/P224.cry,220,180,512,5,"[61268182, 1429633, 1009008, 29676009, 2043097..."


In [14]:
from src.preprocessing.cluster_process import run_clustering

# Cluster the similarity results using the in-memory df_files / df_pairs frames
# returned by run_from_dataframe. The call returns a keep table, a drop table
# and a cluster table; with save_outputs=True the results are also written
# under out_dir.
df_keep, df_drop, df_clusters = run_clustering(
    df_files=df_files,          # from similiar_process
    df_pairs=df_pairs,          # from similiar_process
    jaccard_keep_threshold=0.70,
    out_dir="minhash_outputs",
    content_lookup=None,        # or {filename: raw_text} if you want text-derived penalties
    save_outputs=True
)

[info] clusters formed   : 6351
[info] kept files        : 6351
[info] dropped files     : 276
[info] wrote keep/drop/cluster CSVs to minhash_outputs/


In [15]:
# Metrics rows for the files that clustering decided to keep.
kept_names = df_keep['filename'].tolist()
dataset = df[df['filename'].isin(kept_names)].copy()

# Record the clustering drops in the metrics table: flip the status flag and
# overwrite the reason for every file listed in df_drop.
clustered_away = df['filename'].isin(df_drop['filename'].tolist())
df.loc[clustered_away, 'quality_ok'] = False
df.loc[clustered_away, 'fail_reason'] = 'similiar_file_exists'

# Every metrics row that is not part of the dataset, whatever excluded it.
dataset_names = dataset['filename'].tolist()
dropped = df[~df['filename'].isin(dataset_names)].copy().reset_index(drop=True)


In [16]:
#dropped.to_csv(f"data/dropped/{VARIATION}_dropped_files_{VERSION}.csv", index=False)

In [21]:
import re

# Restrict the content records to the files that made it into the dataset.
verified_files = set(dataset["filename"].unique())
in_dataset = similar_process_df["filename"].isin(verified_files)
all_files_filtered_df = similar_process_df[in_dataset].reset_index(drop=True)

# Split off test-case files: the filename contains a slash, exactly three
# digits, then "_testCase".
pattern = r"/\d{3}_testCase"
is_test_case = (
    all_files_filtered_df["filename"]
    .astype(str)
    .str.contains(pattern, regex=True, na=False)
)
test_case_df = all_files_filtered_df[is_test_case].copy().reset_index(drop=True)
all_files_filtered_df = all_files_filtered_df[~is_test_case].reset_index(drop=True)

# From what remains, remove anything with a "/tests/" path segment.
in_tests_dir = (
    all_files_filtered_df["filename"]
    .astype(str)
    .str.contains(r"/tests/", regex=True, na=False)
)
all_files_filtered_df = all_files_filtered_df[~in_tests_dir]

# Write both tables as JSONL into the same directory, creating it if missing.
out_path = Path(f"data/clean_datasets/verified_{VARIATION}_{VERSION}.jsonl")
out_path.parent.mkdir(parents=True, exist_ok=True)
all_files_filtered_df.to_json(out_path, lines=True, orient="records")
test_case_df.to_json(
    f"data/clean_datasets/verified_testCases_{VARIATION}_{VERSION}.jsonl",
    lines=True,
    orient="records",
)

In [18]:
# Persist the per-file metrics table (including the quality_ok / fail_reason
# columns) as CSV, without the index column.
df.to_csv(f"data/{VARIATION}_file_metrics_{VERSION}.csv", index=False)