In [1]:
# ===============================
# Project: Context-Aware Comment Generator for C Code
# Phase 1 + 2: Dataset Preparation + AST Linearization
# Dataset Location: C:\PBL-Compiler\DATASET (FormAI Dataset)
# ===============================

import os
import json
import re
import pycparser
from pycparser import parse_file, c_ast


In [None]:

# ===== CONFIGURATION =====
# Folder that is scanned for *.c samples. The FORMAI_DATASET_DIR environment
# variable, when set, overrides the built-in default so the notebook can run
# on another machine without editing this cell.
DATASET_DIR = os.environ.get(
    "FORMAI_DATASET_DIR",
    'C:/Users/Downloads/FormAI_dataset_C_samples-V1/DATASET',
)
# JSON file the processed records are written to.
OUTPUT_FILE = "structured_dataset_with_ast.json"

# ===== PREPROCESSING SKIP CHECK =====
# The presence of OUTPUT_FILE is treated as "already done": nothing below
# this point in the cell runs in that case.
if os.path.exists(OUTPUT_FILE):
    print(f"✅ File '{OUTPUT_FILE}' already exists. Skipping preprocessing.")
    # Raise SystemExit directly instead of calling exit(): exit() is the
    # interactive-shell convenience object and is not intended for use in
    # programs.
    raise SystemExit

# Get path to pycparser's fake libc includes
FAKE_LIBC_DIR = os.path.join(os.path.dirname(pycparser.__file__), 'utils', 'fake_libc_include')

# Records that made it through every step (one dict per file).
processed_data = []
# (filename, reason) pairs for files that were left out.
skipped_files = []
# Only the first `print_limit` successfully processed files get a debug dump.
print_limit = 5
print_count = 0

# === UTILITY FUNCTIONS ===

def remove_metadata(code):
    """Drop a leading ``//FormAI`` banner line from a source string.

    The input is stripped of surrounding whitespace before the first line is
    inspected. When that first line starts with ``//FormAI`` the remaining
    lines (of the stripped text) are returned joined by newlines; otherwise
    the original, unmodified string is returned.
    """
    stripped_lines = code.strip().splitlines()
    has_banner = bool(stripped_lines) and stripped_lines[0].startswith("//FormAI")
    if not has_banner:
        return code
    return "\n".join(stripped_lines[1:])

# ✅ NEW: Extract real comments and strip them from code
# One left-to-right scan over the source. Matching string and character
# literals in the same pattern as comments means a "//" or "/*" that sits
# inside a literal is consumed as part of the literal and never mistaken for
# a comment (and a quote inside a comment is consumed as part of the comment).
_COMMENT_OR_LITERAL_RE = re.compile(
    r'(?P<comment>//.*|/\*[\s\S]*?\*/)'   # line comment or block comment
    r'|"(?:\\[\s\S]|[^"\\\n])*"'          # string literal, escapes allowed
    r"|'(?:\\[\s\S]|[^'\\\n])*'"          # character literal, escapes allowed
)


def extract_comments_and_strip(code):
    """Collect the comments of a C source string and remove them from it.

    Args:
        code: C source text.

    Returns:
        A tuple ``(all_comments, code_wo_comments)``:

        * ``all_comments`` - list of comment texts, delimiters included, in
          the order they appear in ``code``. Each comment is reported once,
          even if it contains the other kind of comment marker.
        * ``code_wo_comments`` - ``code`` with every comment removed. A line
          comment is replaced by nothing (its newline is kept); a block
          comment is replaced by a single space so the tokens on either side
          of it cannot run together. String and character literals are left
          untouched.
    """
    all_comments = []

    def _strip_comment(match):
        comment = match.group('comment')
        if comment is None:
            # A literal: keep it exactly as written.
            return match.group(0)
        all_comments.append(comment)
        return ' ' if comment.startswith('/*') else ''

    code_wo_comments = _COMMENT_OR_LITERAL_RE.sub(_strip_comment, code)
    return all_comments, code_wo_comments

def clean_code_for_parser(code):
    """Remove preprocessor directives from C source text.

    Every line whose first non-blank character is ``#`` is dropped. When a
    dropped line ends with a backslash, the following line is a continuation
    of the same directive (for example a multi-line ``#define``) and is
    dropped as well, so no fragment of a macro body is left behind as stray
    code.

    Args:
        code: C source text.

    Returns:
        The remaining lines joined with ``"\\n"``.
    """
    kept_lines = []
    in_directive = False
    for line in code.splitlines():
        if in_directive or line.strip().startswith("#"):
            # Trailing whitespace after the backslash is tolerated here.
            in_directive = line.rstrip().endswith("\\")
            continue
        kept_lines.append(line)
    return "\n".join(kept_lines)

def linearize_ast(ast):
    """Flatten an AST into a space-separated string of node class names.

    The tree is walked depth-first, parent before children, with children
    taken in the order ``node.children()`` yields them. Anything that is not
    a ``c_ast.Node`` contributes nothing, so a non-node argument produces an
    empty string.
    """
    def walk(node):
        if not isinstance(node, c_ast.Node):
            return
        yield node.__class__.__name__
        for _child_name, child in node.children():
            yield from walk(child)

    return " ".join(walk(ast))

# === MAIN LOOP ===

print("🔍 Processing files...")

# Scratch file handed to pycparser. It is bound before the loop so the
# cleanup in the `finally` clause is valid even when no file ever reaches the
# parsing step (empty directory, or every file skipped for lacking comments).
tmp_file = 'temp.c'

try:
    # sorted(): os.listdir() gives no ordering guarantee, and the order
    # decides both the dataset order and which files get the debug dump.
    for file in sorted(os.listdir(DATASET_DIR)):
        if not file.endswith(".c"):
            continue

        print(f"Processing {file} ...")
        filepath = os.path.join(DATASET_DIR, file)
        with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
            raw_code = f.read()

        # Step 1: Remove FormAI metadata
        code_clean_meta = remove_metadata(raw_code)

        # Step 2: Extract and strip real comments
        extracted_comments, code_clean = extract_comments_and_strip(code_clean_meta)

        # Skip files with no comments: there would be no target text for them
        if not extracted_comments:
            skipped_files.append((file, "No comments found in file"))
            continue

        # Step 3: Clean preprocessor lines for parsing
        code_for_ast = clean_code_for_parser(code_clean)

        # Step 4: Write cleaned code to temp file for pycparser
        with open(tmp_file, 'w', encoding='utf-8') as tf:
            tf.write(code_for_ast)

        # Step 5: Parse with pycparser; a file that fails to parse is logged
        # and skipped rather than aborting the whole run
        try:
            ast = parse_file(tmp_file, use_cpp=True, cpp_args=[r'-I' + FAKE_LIBC_DIR])
        except Exception as e:
            skipped_files.append((file, f"AST parsing failed: {e}"))
            continue

        # Step 6: Linearize AST
        ast_linear = linearize_ast(ast)

        # Step 7: Combine AST and code for input
        combined_input = code_clean + " <AST> " + ast_linear

        # Step 8: Use real comment(s)
        comment = " ".join(extracted_comments).strip()

        # Optional: Debug info for the first `print_limit` processed files
        if print_count < print_limit:
            print(f"\nFile: {file}")
            try:
                ast.show()
            except Exception:
                # Display is best-effort; Exception (not a bare except) so
                # KeyboardInterrupt / SystemExit still stop the run.
                print("[!] AST tree display error")
            print("Linearized AST:", ast_linear)
            print("Combined Input:", combined_input)
            print("Extracted Comment:", comment)
            print("=" * 50)
            print_count += 1

        # Step 9: Append processed data
        processed_data.append({
            "file": file,
            "code": code_clean.strip(),
            "comment": comment,
            "ast_linear": ast_linear,
            "combined_input": combined_input
        })
finally:
    # Remove temporary file, also when the loop ends with an exception
    if os.path.exists(tmp_file):
        os.remove(tmp_file)

# Save processed dataset
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(processed_data, f, indent=2)

print(f"✅ Saved {len(processed_data)} valid C programs to '{OUTPUT_FILE}'")
print(f"🚫 Skipped {len(skipped_files)} files due to errors.")

if skipped_files:
    # Explicit UTF-8: the reasons embed parser error text, which is not
    # guaranteed to be encodable in the platform's default encoding.
    with open("skipped_files_log.txt", "w", encoding="utf-8") as logf:
        for fname, reason in skipped_files:
            logf.write(f"{fname}: {reason}\n")
    print("📄 Skipped file log saved to 'skipped_files_log.txt'")


🔍 Processing files...
Processing FormAI_1.c ...

File: FormAI_1.c
FileAST: 
  FuncDef: 
    Decl: main, [], [], [], []
      FuncDecl: 
        TypeDecl: main, [], None
          IdentifierType: ['int']
    Compound: 
      Decl: n, [], [], [], []
        TypeDecl: n, [], None
          IdentifierType: ['int']
        Constant: int, 7
      Decl: arr, [], [], [], []
        ArrayDecl: []
          ArrayDecl: []
            TypeDecl: arr, [], None
              IdentifierType: ['int']
            ID: n
          ID: n
      For: 
        DeclList: 
          Decl: i, [], [], [], []
            TypeDecl: i, [], None
              IdentifierType: ['int']
            Constant: int, 0
        BinaryOp: <
          ID: i
          ID: n
        UnaryOp: p++
          ID: i
        Compound: 
          For: 
            DeclList: 
              Decl: j, [], [], [], []
                TypeDecl: j, [], None
                  IdentifierType: ['int']
                Constant: int, 0
            B