In [None]:
# Explore ReposVul Dataset

In [1]:
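# Installs the third-party packages this notebook imports. Skip this cell if your
# environment already provides them; if anything gets installed, restart the kernel
# before running the cells below.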
%pip install pandas orjson ujson matplotlib tqdm tabulate networkx python-levenshtein


^C
Note: you may need to restart the kernel to use updated packages.


Collecting orjson
  Downloading orjson-3.11.3-cp311-cp311-win_amd64.whl.metadata (43 kB)
Collecting ujson
  Downloading ujson-5.11.0-cp311-cp311-win_amd64.whl.metadata (9.6 kB)
Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting python-levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-levenshtein)
  Downloading levenshtein-0.27.1-cp311-cp311-win_amd64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-levenshtein)
  Downloading rapidfuzz-3.14.1-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading orjson-3.11.3-cp311-cp311-win_amd64.whl (131 kB)
Downloading ujson-5.11.0-cp311-cp311-win_amd64.whl (43 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp311-cp311-win_amd64.whl (100 kB)
Downloading rapidfuzz-3.14.1-cp311-cp311-wi

In [None]:
from pathlib import Path
import json, orjson, ujson, re, difflib, itertools, math
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
from collections import Counter, defaultdict
from tqdm import tqdm

# Directory holding the dataset splits; relative, so it resolves against the
# notebook's current working directory.
DATA_DIR = Path(r"data/dataset/ReposVul_c_cpp")

# Split name -> path of the JSON Lines file for that split.
FILES = {
    "train": DATA_DIR / "train_c_cpp_repository2.jsonl",
    "valid": DATA_DIR / "valid_c_cpp_repository2.jsonl",
    "test" : DATA_DIR / "test_c_cpp_repository2.jsonl",
}

# Fail fast, but name every missing split at once rather than stopping at the
# first one. The arrow is plain ASCII: the previous message carried a
# mis-decoded Unicode arrow that rendered as garbage.
missing = []
for split, p in FILES.items():
    if not p.exists():
        missing.append(f"{split} -> {p}")
assert not missing, "Missing: " + "; ".join(missing)
print("All files exist.")

In [None]:
# Report label -> record key. top_counts() looks each record key up in every
# parsed JSON line and tallies its values under the corresponding label.
FIELD_MAP = dict(
    cwe="cwe",
    severity="severity",
    func="func",
    target="target",
)

def _countable(value):
    """Return ``value`` if it is hashable, else a canonical JSON string for it."""
    try:
        hash(value)
    except TypeError:
        # Lists/dicts decoded from JSON cannot be Counter keys; tally them by
        # their serialized form instead of crashing.
        return json.dumps(value, sort_keys=True)
    return value


def top_counts(filepath, field_map):
    """Print the ten most frequent values of selected fields in a JSON Lines file.

    Every non-blank line of ``filepath`` is parsed as one JSON document. For
    each ``label -> key`` pair in ``field_map`` whose ``key`` is present in the
    parsed object, the value stored under ``key`` is tallied under ``label``.
    After the whole file has been read, the ten most common values per label
    are printed, labels appearing in the order they were first encountered.

    Parameters
    ----------
    filepath : str or path-like
        Path of the JSON Lines file to scan.
    field_map : dict
        Maps the label used in the printed report to the key looked up in each
        record.

    Returns
    -------
    None
        Results are printed only. If the file does not exist, a
        "File not found" message is printed instead.

    Notes
    -----
    Lines that are not valid JSON, or that decode to something other than a
    JSON object, are skipped; their number is reported after the tables.
    Unhashable values (lists, dicts) are counted by their JSON serialization.
    """
    counts = {}
    skipped = 0
    try:
        # Be explicit about UTF-8: open() otherwise uses the platform's default
        # encoding, which is not UTF-8 everywhere.
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                if not isinstance(data, dict):
                    # Scalars/arrays have no fields to look up.
                    skipped += 1
                    continue
                for field_key, field_name in field_map.items():
                    if field_name not in data:
                        continue
                    counter = counts.setdefault(field_key, Counter())
                    counter[_countable(data[field_name])] += 1
    except FileNotFoundError:
        print(f"File not found: {filepath}")
        return

    # Display top counts for each field
    for field_key, counter in counts.items():
        print(f"\nTop {field_key.upper()}:")
        for value, count in counter.most_common(10):
            print(f"  {value}: {count}")

    if skipped:
        print(f"\n({skipped} line(s) skipped: not a valid JSON object)")

# Print the top-value report for every split under its own banner.
# The banner's leading newline is written as an escape sequence: a literal line
# break inside a single-quoted f-string is a syntax error.
for split, fp in FILES.items():
    print(f"\n==== {split.upper()} ====")
    top_counts(fp, FIELD_MAP)
