# **Subgraph extraction**

In [15]:
import json
import os
import glob
import logging
import pydot # Import pydot
import re # Import regex for label parsing
# Make sure pydot.Error is accessible if needed for exception handling
from pydot import Error as PydotError

# Configure logging
# NOTE: logging.basicConfig only configures the root logger when it has no
# handlers yet, so repeated calls from later cells do not change the format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define paths
base_cpg_path = "data_java/cpg-output"        # one sub-folder per sample, each holding a .dot export
json_path = "data_java/center_nodes_result_specific.json"  # folder name -> center node IDs
output_base_path = "data_java/subgraph_contexts"           # destination of the *_context.txt files

# --- New: Define allowed neighbor labels ---
# Only neighbors whose label-derived type is in this set are kept in a subgraph.
allowed_neighbor_labels = {
    'arrayInitializer', 'CatchClause', 'stonesoup_array', 'assignment',
    'fieldAccess', 'addition', 'CONTROL_STRUCTURE', 'FIELD_IDENTIFIER',
    'cast', 'IDENTIFIER', 'indexAccess', 'logicalAnd', 'CALL',
    'logicalNot', 'alloc'
}
# Regex to extract the first word from the label attribute, assuming it's the type.
# It captures the leading run of letters, '_', '<' and '>' after an optional opening quote.
label_type_pattern = re.compile(r'^"?([a-zA-Z_<>]+)')

# Load the center nodes data from JSON
try:
    # Explicit UTF-8 so the result does not depend on the platform's locale
    # encoding (the other file reads/writes in this notebook also use UTF-8).
    with open(json_path, 'r', encoding='utf-8') as f:
        center_nodes_data = json.load(f)
except FileNotFoundError:
    logging.error(f"Error: JSON file not found at {json_path}")
    raise # Stop execution if JSON is missing
except json.JSONDecodeError:
    logging.error(f"Error: Could not decode JSON from {json_path}")
    raise # Stop execution if JSON is invalid
else:
    logging.info(f"Successfully loaded center nodes data from {json_path}")

# Create the output directory if it doesn't exist
os.makedirs(output_base_path, exist_ok=True)
logging.info(f"Ensured output directory exists: {output_base_path}")

# Helper: derive a node's type name from the start of its 'label' attribute
def get_node_type_from_attributes(attrs):
    """Return the type token found at the start of ``attrs['label']``.

    Args:
        attrs: Mapping of node attribute names to values (only ``'label'``
            is consulted).

    Returns:
        The first group captured by ``label_type_pattern`` when the label is
        present, non-empty and matches; otherwise ``None``.
    """
    raw_label = attrs.get('label')
    if not raw_label:
        return None
    found = label_type_pattern.match(raw_label)
    return found.group(1) if found else None

# Process each entry in the JSON data: for every folder, parse its .dot export
# and write one "<folder>_context.txt" holding a 1-hop subgraph per center node.
for folder_name, center_node_ids_str in center_nodes_data.items():
    logging.info(f"Processing folder: {folder_name}")
    # Node names read back from pydot are strings, so compare like with like:
    # stringify each ID in case the JSON stores them as numbers.
    center_node_ids = {str(node_id) for node_id in center_node_ids_str}
    folder_path = os.path.join(base_cpg_path, folder_name)

    # Find the .dot file (sorted so "the first one" is the same on every run)
    dot_files = sorted(glob.glob(os.path.join(folder_path, '*.dot')))

    if not dot_files:
        logging.warning(f"  No .dot file found in {folder_path}. Skipping.")
        continue
    if len(dot_files) > 1:
        logging.warning(f"  Multiple .dot files found in {folder_path}. Using the first one: {dot_files[0]}.")

    dot_file_path = dot_files[0]
    logging.info(f"  Using .dot file: {dot_file_path}")

    # This will store all subgraphs with their center node information
    all_subgraphs = {}

    try:
        # Parse the dot file using pydot
        logging.info(f"  Parsing {dot_file_path} with pydot...")
        graphs = pydot.graph_from_dot_file(dot_file_path)

        if not graphs:
            logging.warning(f"  pydot could not parse any graph from {dot_file_path}. Skipping.")
            continue

        if isinstance(graphs, list) and isinstance(graphs[0], (pydot.Graph, pydot.Dot)):
            graph = graphs[0] # Assign the first graph object
            logging.info(f"  Successfully parsed graph.")
        else:
            logging.error(f"  pydot.graph_from_dot_file did not return a valid graph object for {dot_file_path}. Skipping.")
            continue

        # --- Build a map for quick node lookup by unquoted ID ---
        logging.info("  Building node map...")
        node_map = {}
        for node in graph.get_nodes():
            unquoted_id = node.get_name().strip('"')
            node_map[unquoted_id] = node
        logging.info(f"  Built map with {len(node_map)} nodes.")

        # Index every edge under both of its endpoints. Compared with a dict
        # keyed by (source, dest) this keeps parallel edges between the same
        # pair of nodes, and lets each center node visit only its own edges
        # instead of rescanning the whole edge list.
        incident_edges = {}
        for edge in graph.get_edges():
            source_id = edge.get_source().strip('"')
            dest_id = edge.get_destination().strip('"')
            record = (source_id, dest_id, edge)
            incident_edges.setdefault(source_id, []).append(record)
            if dest_id != source_id:  # a self-loop is recorded only once
                incident_edges.setdefault(dest_id, []).append(record)

        # Process each center node to build individual subgraphs
        # (sorted so the output file has a reproducible order)
        for center_node_id in sorted(center_node_ids):
            logging.info(f"  Building subgraph for center node: {center_node_id}")
            if center_node_id not in node_map:
                logging.warning(f"  Center node {center_node_id} has no node definition in {dot_file_path}.")

            subgraph_lines = set()
            subgraph_nodes = {center_node_id}
            neighbors_added = set()

            # Walk the edges touching this center node, in either direction
            for source_id, dest_id, edge in incident_edges.get(center_node_id, ()):
                # The neighbor is whichever endpoint is not the center
                # (for a self-loop this is the center itself).
                neighbor_id = dest_id if source_id == center_node_id else source_id
                neighbor_node = node_map.get(neighbor_id)
                if neighbor_node is None:
                    # Endpoint without a node definition: its type is unknown
                    continue

                node_type = get_node_type_from_attributes(neighbor_node.get_attributes())
                if node_type not in allowed_neighbor_labels:
                    continue

                subgraph_nodes.add(neighbor_id)
                neighbors_added.add(neighbor_id)
                subgraph_lines.add(edge.to_string().strip())

            # Add node definitions
            for node_id in subgraph_nodes:
                node_obj = node_map.get(node_id)
                if node_obj:
                    subgraph_lines.add(node_obj.to_string().strip())

            # Store this subgraph
            all_subgraphs[center_node_id] = {
                "nodes": list(subgraph_nodes),
                "neighbors": list(neighbors_added),
                "lines": list(subgraph_lines)
            }
            logging.info(f"  Added {len(neighbors_added)} neighbors for center node {center_node_id}.")

        # Write all subgraphs to a single file with clear separation
        output_file_path = os.path.join(output_base_path, f"{folder_name}_context.txt")
        with open(output_file_path, 'w', encoding='utf-8') as f_out:
            # Write a header for the file
            f_out.write(f"# Subgraphs for folder: {folder_name}\n")
            f_out.write(f"# Total center nodes: {len(all_subgraphs)}\n\n")

            # Write each subgraph with a clear separator and metadata
            for center_id, subgraph_data in all_subgraphs.items():
                f_out.write(f"START_SUBGRAPH center_node={center_id}\n")
                f_out.write(f"# Node count: {len(subgraph_data['nodes'])}\n")
                f_out.write(f"# Neighbor count: {len(subgraph_data['neighbors'])}\n")

                # Write the sorted lines for this subgraph
                for line in sorted(subgraph_data['lines']):
                    f_out.write(f"  {line}\n")

                f_out.write(f"END_SUBGRAPH center_node={center_id}\n\n")

        logging.info(f"  Successfully wrote {len(all_subgraphs)} subgraphs to {output_file_path}")

    except FileNotFoundError:
        logging.error(f"  Error: .dot file not found at {dot_file_path}")
    except PydotError as e:
        logging.error(f"  A pydot library error occurred processing {dot_file_path}: {e}")
    except Exception as e:
        # Last-resort boundary: log with traceback and move on to the next folder
        logging.error(f"  An unexpected {type(e).__name__} occurred while processing {dot_file_path}: {e}", exc_info=True)

logging.info("Subgraph extraction process finished.")

2025-04-28 17:46:28,578 - INFO - Successfully loaded center nodes data from data_java/center_nodes_result_specific.json
2025-04-28 17:46:28,580 - INFO - Ensured output directory exists: data_java/subgraph_contexts
2025-04-28 17:46:28,582 - INFO - Processing folder: 250327-v1.0.0-mixed
2025-04-28 17:46:28,585 - INFO -   Using .dot file: data_java/cpg-output/250327-v1.0.0-mixed/export.dot
2025-04-28 17:46:28,588 - INFO -   Parsing data_java/cpg-output/250327-v1.0.0-mixed/export.dot with pydot...
Exception ignored in: <function tqdm.__del__ at 0x7fb479138540>
Traceback (most recent call last):
  File "/home/keanlt/ML-Project/.venv/lib/python3.12/site-packages/tqdm/std.py", line 1148, in __del__
    self.close()
  File "/home/keanlt/ML-Project/.venv/lib/python3.12/site-packages/tqdm/notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm_notebook' object has no attribute 'disp'
2025-04-28 17:46:32,752 - INFO -   Successfully

# **Generalize code**

In [16]:
import os
import glob
import re
import logging
import json
from tqdm import tqdm

# Import the token extraction function
from extractToken import extract_and_replace_tokens

# Where the extracted subgraph contexts live, and where tokenized output goes
context_base_path = "data_java/subgraph_contexts"
tokenized_output_path = "data_java/tokenized_contexts"

# Timestamped INFO-level logging on the root logger
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Make sure the output directory is there before anything is written to it
os.makedirs(tokenized_output_path, exist_ok=True)
logging.info(f"Ensured output directory exists: {tokenized_output_path}")

# Function to extract and process Java code fragments from context files
def process_context_file(context_file_path):
    """Tokenize the CODE fragments of every subgraph in one context file.

    The file is scanned between ``START_SUBGRAPH center_node=<id>`` and
    ``END_SUBGRAPH`` markers. Each line inside a subgraph that carries both a
    ``label="..."`` and a ``CODE="..."`` attribute contributes one fragment,
    which is passed through ``extract_and_replace_tokens``. The results are
    written to ``<tokenized_output_path>/<base>_tokenized.txt`` and returned.

    Args:
        context_file_path: Path to a ``*_context.txt`` file.

    Returns:
        dict mapping center node ID to a list of dicts with keys
        ``'original'``, ``'tokenized'``, ``'type'`` and, when tokenization
        raised, ``'error'`` (in which case ``'tokenized'`` is the original).

    Raises:
        OSError: If the context file cannot be read or the output file cannot
            be written.
    """
    # Get the base name for output files
    base_name = os.path.basename(context_file_path).replace('_context.txt', '')
    output_file_path = os.path.join(tokenized_output_path, f"{base_name}_tokenized.txt")

    # Compiled once per call instead of once per line
    start_marker_re = re.compile(r'START_SUBGRAPH\s+center_node=(\w+)')
    label_attr_re = re.compile(r'label="([^"]+)"')
    # DOT escapes a double quote inside a quoted string as \" ; consume
    # backslash pairs so a fragment such as  println(\"hi\")  is captured in
    # full instead of being cut at the first escaped quote.
    code_attr_re = re.compile(r'CODE="((?:\\.|[^"\\])+)"')
    # Previous, escape-unaware pattern: fallback for values the pattern above
    # cannot match (e.g. a lone backslash right before the closing quote).
    legacy_code_attr_re = re.compile(r'CODE="([^"]+)"')

    # Raw fragments per center node, in file order
    code_fragments = {}

    # Process the file
    with open(context_file_path, 'r', encoding='utf-8') as f_in:
        current_center_node = None
        in_subgraph = False

        for line in f_in:
            line = line.strip()

            # Check for subgraph markers
            center_match = start_marker_re.match(line)
            if center_match:
                current_center_node = center_match.group(1).strip('"')
                in_subgraph = True
                code_fragments[current_center_node] = []
                continue

            if line.startswith('END_SUBGRAPH'):
                in_subgraph = False
                continue

            # Skip empty lines and comments
            if not line or line.startswith('#'):
                continue

            # Only lines inside a subgraph can contribute fragments
            if not (in_subgraph and current_center_node):
                continue

            # Node type: the label text up to the first comma
            # (split returns the whole label when there is no comma)
            type_match = label_attr_re.search(line)
            if not type_match:
                continue
            node_type = type_match.group(1).split(',')[0]

            # Extract CODE attribute from node labels
            code_match = code_attr_re.search(line) or legacy_code_attr_re.search(line)
            if not code_match:
                continue

            # Undo the DOT quote escaping so the tokenizer sees plain source text
            code_fragment = code_match.group(1).replace('\\"', '"')
            code_fragments[current_center_node].append({
                'code': code_fragment,
                'type': node_type
            })

    # Now tokenize each code fragment using the extract_and_replace_tokens function
    tokenized_subgraphs = {}
    for center_node, fragments in code_fragments.items():
        tokenized_fragments = []
        for fragment_data in fragments:
            try:
                tokenized = extract_and_replace_tokens(fragment_data['code'])
                tokenized_fragments.append({
                    'original': fragment_data['code'],
                    'tokenized': tokenized,
                    'type': fragment_data['type']
                })
            except Exception as e:
                # Best effort: one bad fragment must not abort the whole file
                logging.warning(f"Error tokenizing fragment '{fragment_data['code']}': {e}")
                tokenized_fragments.append({
                    'original': fragment_data['code'],
                    'tokenized': fragment_data['code'],  # Keep original if tokenization fails
                    'type': fragment_data['type'],
                    'error': str(e)
                })

        tokenized_subgraphs[center_node] = tokenized_fragments

    # Write to output file
    with open(output_file_path, 'w', encoding='utf-8') as f_out:
        f_out.write(f"# Tokenized code fragments for {base_name}\n")
        f_out.write(f"# Total center nodes: {len(tokenized_subgraphs)}\n\n")

        for center_node, fragments in tokenized_subgraphs.items():
            f_out.write(f"CENTER_NODE: {center_node}\n")
            f_out.write(f"FRAGMENT_COUNT: {len(fragments)}\n")

            for i, fragment in enumerate(fragments):
                f_out.write(f"  ORIGINAL[{i}]: {fragment['original']}\n")
                f_out.write(f"  TYPE[{i}]: {fragment['type']}\n")
                f_out.write(f"  TOKENIZED[{i}]: {fragment['tokenized']}\n")
                if 'error' in fragment:
                    f_out.write(f"  ERROR[{i}]: {fragment['error']}\n")

            f_out.write("\n")

    logging.info(f"Processed {len(tokenized_subgraphs)} center nodes with code fragments from {context_file_path}")
    return tokenized_subgraphs

# Collect every context file produced by the extraction step
context_files = glob.glob(os.path.join(context_base_path, '*_context.txt'))

if context_files:
    logging.info(f"Found {len(context_files)} context files to process.")

    # Tokenize file by file; a failure in one file is logged and skipped
    all_tokenized_data = {}
    for context_file_path in tqdm(context_files, desc="Processing context files"):
        base_name = os.path.basename(context_file_path).replace('_context.txt', '')
        logging.info(f"Processing file: {base_name}")

        try:
            tokenized_subgraphs = process_context_file(context_file_path)
            all_tokenized_data[base_name] = tokenized_subgraphs

            # Fragment total across all center nodes of this file
            total_fragments = sum(map(len, tokenized_subgraphs.values()))
            logging.info(f"Tokenized {total_fragments} code fragments from {len(tokenized_subgraphs)} center nodes in {base_name}")

        except Exception as e:
            logging.error(f"Error processing {context_file_path}: {e}", exc_info=True)

    # One consolidated JSON document covering every processed file
    output_json_path = os.path.join(tokenized_output_path, "all_tokenized_fragments.json")
    with open(output_json_path, 'w', encoding='utf-8') as f_json:
        json.dump(all_tokenized_data, f_json, indent=2)

    logging.info(f"Saved consolidated tokenized data to {output_json_path}")
    logging.info(f"Token extraction process complete. Results saved to {tokenized_output_path}")
else:
    logging.warning(f"No context files found in {context_base_path}. Nothing to process.")

2025-04-28 17:47:07,639 - INFO - Ensured output directory exists: data_java/tokenized_contexts
2025-04-28 17:47:07,641 - INFO - Found 3 context files to process.
Processing context files:   0%|          | 0/3 [00:00<?, ?it/s]2025-04-28 17:47:07,644 - INFO - Processing file: 250327-v1.0.0-mixed
2025-04-28 17:47:07,648 - INFO - Processed 35 center nodes with code fragments from data_java/subgraph_contexts/250327-v1.0.0-mixed_context.txt
2025-04-28 17:47:07,649 - INFO - Tokenized 124 code fragments from 35 center nodes in 250327-v1.0.0-mixed
2025-04-28 17:47:07,649 - INFO - Processing file: 157018-v1.0.0-bad
2025-04-28 17:47:07,662 - INFO - Processed 177 center nodes with code fragments from data_java/subgraph_contexts/157018-v1.0.0-bad_context.txt
2025-04-28 17:47:07,663 - INFO - Tokenized 739 code fragments from 177 center nodes in 157018-v1.0.0-bad
2025-04-28 17:47:07,663 - INFO - Processing file: 1553-v1.0.0-good
2025-04-28 17:47:07,667 - INFO - Processed 19 center nodes with code fra

# **Visualization with node ID**

In [None]:
import os
import glob
import logging
import pydot
import re
from IPython.display import Image, display
from pydot import Error as PydotError
import colorsys

# Source of the subgraph context files and destination of the rendered images
context_base_path = "data_java/subgraph_contexts"
visualization_output_path = "data_java/subgraph_visualizations"

# Timestamped INFO-level logging on the root logger
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Make sure the image directory is there before rendering starts
os.makedirs(visualization_output_path, exist_ok=True)
logging.info(f"Ensured output directory exists: {visualization_output_path}")

# Helper: build a palette of hex colours with well-separated hues
def generate_color_palette(n):
    """Return ``n`` colours as ``#rrggbb`` strings.

    Hues advance by the golden-ratio conjugate (mod 1) from one colour to the
    next. Saturation alternates between 0.7 and 0.7 + 0.3, and value cycles
    through 0.7, 0.8 and 0.9, so neighbouring entries also differ in tone.

    Args:
        n: Number of colours to produce; ``range(n)`` drives the loop, so a
            value of 0 or less yields an empty list.

    Returns:
        list of lowercase hex colour strings, one per index.
    """
    def _color_at(index):
        hue = index * 0.618033988749895 % 1.0
        saturation = 0.7 + 0.3 * (index % 2)
        brightness = 0.7 + 0.1 * (index % 3)
        channels = colorsys.hsv_to_rgb(hue, saturation, brightness)
        # Scale each 0..1 channel to 0..255 (truncating) and render as two hex digits
        return "#" + "".join(f"{int(channel * 255):02x}" for channel in channels)

    return [_color_at(index) for index in range(n)]

# Find all context files
context_files = glob.glob(os.path.join(context_base_path, '*_context.txt'))

if not context_files:
    logging.warning(f"No context files found in {context_base_path}. Nothing to visualize.")
else:
    logging.info(f"Found {len(context_files)} context files to visualize.")

# IPython's Image class cannot embed SVG data; SVG is the display class for it.
from IPython.display import SVG

# Line patterns for the context files. An ID is either a quoted string or a
# bare token; handling both keeps unquoted edge lines ("1 -> 2 [..]") from
# being mistaken for node definitions or yielding IDs with trailing junk.
_start_marker_re = re.compile(r'START_SUBGRAPH\s+center_node=(\w+)')
_node_def_re = re.compile(r'(?:"([^"]+)"|([^"\s\[]+))\s*\[')
_edge_def_re = re.compile(r'(?:"([^"]+)"|([^"\s]+))\s*->\s*(?:"([^"]+)"|([^"\s\[;]+))')

# Process and visualize each context file: all subgraphs of a file are merged
# into one DOT graph, each center node gets its own color and the remaining
# nodes get a lighter shade of the first center whose subgraph lists them.
for context_file_path in context_files:
    base_name = os.path.basename(context_file_path).replace('_context.txt', '')
    logging.info(f"Processing visualization for: {base_name}")
    output_png_path = os.path.join(visualization_output_path, f"{base_name}_subgraph.png")

    try:
        # Read the subgraph content and extract center nodes
        logging.info(f"  Reading context file: {context_file_path}")
        center_nodes = set()
        subgraph_content_lines = []
        center_node_to_subgraph_map = {}  # Map to track which neighbors belong to which center node

        with open(context_file_path, 'r', encoding='utf-8') as f_in:
            current_center_node = None
            current_subgraph_nodes = set()
            in_subgraph = False

            for line in f_in:
                line = line.strip()
                # Skip empty lines and comments
                if not line or line.startswith('#'):
                    continue

                # Check for subgraph markers
                center_match = _start_marker_re.match(line)
                if center_match:
                    current_center_node = center_match.group(1).strip('"')
                    center_nodes.add(current_center_node)
                    current_subgraph_nodes = {current_center_node}
                    in_subgraph = True
                    continue

                if line.startswith('END_SUBGRAPH'):
                    if current_center_node:
                        center_node_to_subgraph_map[current_center_node] = current_subgraph_nodes
                    current_center_node = None
                    in_subgraph = False
                    continue

                # Add the actual graph content
                subgraph_content_lines.append(line)

                # Extract node IDs for the current subgraph
                if in_subgraph:
                    # Extract node definition (e.g., "123" [label="..."])
                    node_def_match = _node_def_re.match(line)
                    if node_def_match:
                        current_subgraph_nodes.add(node_def_match.group(1) or node_def_match.group(2))

                    # Extract edge definition (e.g., "123" -> "456")
                    edge_def_match = _edge_def_re.match(line)
                    if edge_def_match:
                        current_subgraph_nodes.add(edge_def_match.group(1) or edge_def_match.group(2))
                        current_subgraph_nodes.add(edge_def_match.group(3) or edge_def_match.group(4))

        # Generate colors for center nodes (sorted so a given file always
        # gets the same color assignment, independent of set ordering)
        color_palette = generate_color_palette(len(center_nodes))
        center_node_colors = dict(zip(sorted(center_nodes), color_palette))

        # Each node belongs to the first subgraph (in file order) that lists it
        node_to_center = {}
        for center, subgraph_nodes in center_node_to_subgraph_map.items():
            for member_id in subgraph_nodes:
                node_to_center.setdefault(member_id, center)

        logging.info(f"  Found {len(center_nodes)} center nodes with unique colors")
        subgraph_content = '\n'.join(subgraph_content_lines)
        logging.info(f"  Successfully read context file with {len(subgraph_content_lines)} content lines.")

        # Wrap the content in a valid DOT structure
        dot_string = f"""digraph "{base_name}_subgraph" {{
  graph [rankdir=LR];
  node [shape=box, fontname="Courier New"];
  edge [arrowsize=0.5, fontsize=8];
  
{subgraph_content}
}}"""

        # Parse the DOT string using pydot
        logging.info("  Parsing DOT data...")
        graphs = pydot.graph_from_dot_data(dot_string)

        if not graphs:
            logging.warning(f"  Could not parse DOT data from generated string for {base_name}. Skipping visualization.")
            continue

        graph = graphs[0] # Assume the first graph is the one we want
        logging.info("  Successfully parsed DOT data.")

        # Set colors for nodes based on whether they are center nodes or their neighbors
        logging.info("  Applying color styling to nodes...")
        nodes_styled = 0
        center_nodes_found = 0
        neighbor_nodes_found = 0

        node_color_assignments = {}  # To help with logging

        for node in graph.get_nodes():
            try:
                node_id = node.get_name().strip('"')  # Remove quotes for comparison

                if node_id in center_nodes:
                    # Center nodes get their unique color from the palette
                    node_color = center_node_colors[node_id]
                    node.set('style', 'filled')
                    node.set('fillcolor', node_color)
                    node.set('color', '#000000')  # Black border
                    node.set('penwidth', '2.0')  # Thicker border
                    node.set('fontcolor', 'white')  # White text for contrast
                    center_nodes_found += 1
                    node_color_assignments[node_id] = f"center: {node_color}"
                else:
                    # Find which center node's subgraph this neighbor belongs to
                    assigned_center = node_to_center.get(node_id)

                    if assigned_center:
                        # Derive a lighter version of the center node's color for its neighbors
                        center_color = center_node_colors[assigned_center]
                        # Lighten by adding 90 to each RGB channel, capped at 255
                        r, g, b = (
                            min(255, int(center_color[pos:pos + 2], 16) + 90)
                            for pos in (1, 3, 5)
                        )
                        light_color = f"#{r:02x}{g:02x}{b:02x}"

                        node.set('style', 'filled')
                        node.set('fillcolor', light_color)
                        node.set('color', center_color)  # Border uses center node color
                        node_color_assignments[node_id] = f"neighbor of {assigned_center}: {light_color}"
                    else:
                        # Default light gray for unassigned nodes
                        node.set('style', 'filled')
                        node.set('fillcolor', '#EEEEEE')
                        node.set('color', '#999999')
                        node_color_assignments[node_id] = "unassigned: #EEEEEE"

                    neighbor_nodes_found += 1

                # Set a simplified label with the ID
                node.set('label', f"Node {node_id}")
                nodes_styled += 1

            except Exception as style_err:
                logging.warning(f"    Could not style node {node.get_name()}: {style_err}")

        logging.info(f"  Styled {nodes_styled} nodes: {center_nodes_found} center nodes and {neighbor_nodes_found} neighbor nodes.")
        logging.debug(f"  Color assignments: {node_color_assignments}")

        # Generate the PNG image
        try:
            logging.info(f"  Attempting to render PNG to {output_png_path}...")
            # Test with plain format first
            # NOTE(review): in pydot only format="raw" dumps the DOT source
            # as-is; "plain" appears to be rendered through Graphviz, so this
            # step already exercises the Graphviz install - confirm intent.
            test_txt_path = f"{output_png_path}.txt"
            graph.write(test_txt_path, format="plain")
            logging.info(f"  Successfully wrote plain DOT text to {test_txt_path}.")

            # Now try PNG
            png_created = graph.write_png(output_png_path)
            if png_created is False:
                logging.error(f"  graph.write_png returned False for {output_png_path}.")
                continue

            logging.info(f"  Successfully rendered PNG.")

            # Display the image in the notebook
            display(Image(filename=output_png_path))
            print("-" * 40)
            logging.info(f"  Successfully displayed image.")

        except FileNotFoundError:
            logging.error(f"  Error: Failed to find or create file. Check permissions.")
        except PydotError as pe:
            logging.error(f"  Pydot error during file operations: {pe}")
        except AssertionError as ae:
            logging.error(f"  AssertionError during rendering: {ae}")
            # The assertion message is the only place Graphviz's output is
            # available here, so print it verbatim.
            print(str(ae))
            if "returned code: 1" in str(ae):
                # Try to work around by generating SVG instead of PNG
                try:
                    logging.info(f"  Trying alternative SVG format...")
                    svg_path = f"{output_png_path}.svg"
                    graph.write_svg(svg_path)
                    logging.info(f"  Successfully wrote SVG to {svg_path}")
                    display(SVG(filename=svg_path))
                except Exception as svg_err:
                    logging.error(f"  SVG fallback also failed: {svg_err}")
        except Exception as e:
            logging.error(f"  Unexpected error: {e}", exc_info=True)

    except FileNotFoundError:
        logging.error(f"  Error: Context file not found at {context_file_path}")
    except PydotError as parse_err:
        logging.error(f"  Pydot error parsing DOT data: {parse_err}")
    except Exception as e:
        logging.error(f"  Unexpected error processing {base_name}: {e}", exc_info=True)

logging.info("Subgraph visualization process finished.")

# **Visualization without node ID**

In [None]:
import os
import glob
import logging
import pydot
from IPython.display import Image, display
from pydot import Error as PydotError # Import the correct error

# Configure logging (optional, but helpful)
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Define paths
context_base_path = "data_java/subgraph_contexts"
visualization_output_path = "data_java/subgraph_visualizations_test"

# Create the visualization output directory if it doesn't exist
os.makedirs(visualization_output_path, exist_ok=True)
logging.info(f"Ensured output directory exists: {visualization_output_path}")

# Find all context files
context_files = glob.glob(os.path.join(context_base_path, '*_context.txt'))

if not context_files:
    logging.warning(f"No context files found in {context_base_path}. Nothing to visualize.")
else:
    logging.info(f"Found {len(context_files)} context files to visualize.")

# Process and visualize each context file: the node/edge lines of all its
# subgraphs are wrapped in a single digraph and rendered with original labels.
for context_file_path in context_files:
    base_name = os.path.basename(context_file_path).replace('_context.txt', '')
    logging.info(f"Processing visualization for: {base_name}")
    output_png_path = os.path.join(visualization_output_path, f"{base_name}_subgraph.png")
    dot_string = "" # Initialize dot_string

    try:
        # Read the subgraph content
        logging.info(f"  Reading context file: {context_file_path}")
        with open(context_file_path, 'r', encoding='utf-8') as f_in:
            # Keep only real DOT statements. The START_SUBGRAPH / END_SUBGRAPH
            # marker lines and the '#' metadata lines are bookkeeping of the
            # context-file format, not graph content; passed through to DOT
            # the markers would be read as statements of their own.
            content_lines = [
                stripped
                for stripped in (raw_line.strip() for raw_line in f_in)
                if stripped
                and not stripped.startswith(('#', 'START_SUBGRAPH', 'END_SUBGRAPH'))
            ]
        subgraph_content = '\n'.join(content_lines)
        logging.info(f"  Successfully read context file.")

        # Wrap the content in a valid DOT structure
        dot_string = f"digraph \"{base_name}_subgraph\" {{\n graph [rankdir=LR];\n node [shape=box];\n edge [arrowsize=0.5];\n{subgraph_content}\n}}"

        # Parse the DOT string using pydot
        logging.info("  Parsing DOT data...")
        graphs = pydot.graph_from_dot_data(dot_string)

        if not graphs:
            logging.warning(f"  Could not parse DOT data from generated string for {base_name}. Skipping visualization.")
            continue

        graph = graphs[0] # Assume the first graph is the one we want
        logging.info("  Successfully parsed DOT data.")

        # Generate the PNG image - Separate Try/Except for rendering/display
        try:
            logging.info(f"  Attempting to render PNG to {output_png_path}...")
            png_created = graph.write_png(output_png_path)
            # write_png might return True/False or raise an error depending on version/scenario
            if png_created is False: # Explicitly check if write_png indicated failure
                 logging.error(f"  graph.write_png returned False for {output_png_path}. Check Graphviz installation and DOT data validity.")
                 continue # Skip display if rendering failed

            logging.info(f"  Successfully rendered PNG (or write_png did not raise error).")

            # Display the image in the notebook
            logging.info(f"  Attempting to display image: {output_png_path}")
            display(Image(filename=output_png_path))
            print("-" * 40) # Separator
            logging.info(f"  Successfully displayed image.")

        except FileNotFoundError:
            logging.error(f"  Error: Failed to display image. PNG file not found at {output_png_path}. Check if graph.write_png() succeeded.")
        except PydotError as pe:
             logging.error(f"  A pydot error occurred during PNG rendering/writing for {base_name}: {pe}. Check Graphviz installation and PATH.")
        except Exception as render_err:
             logging.error(f"  An unexpected {type(render_err).__name__} occurred during PNG rendering/display for {base_name}: {render_err}", exc_info=True)


    except FileNotFoundError:
        # This now correctly catches only if the *context file* is missing
        logging.error(f"  Error: Context file not found at {context_file_path}")
    except PydotError as parse_err:
         # This catches errors during the initial DOT string parsing
         logging.error(f"  A pydot error occurred parsing DOT string for {base_name}: {parse_err}")
         logging.debug(f"  Problematic DOT string:\n{dot_string}") # Log the string that failed
    except Exception as e:
        # General catch for other unexpected errors during file read/parse
        logging.error(f"  An unexpected {type(e).__name__} occurred while processing {base_name}: {e}", exc_info=True)

logging.info("Subgraph visualization process finished.")