In [1]:
# list all output variables
import re
import os

def _strip_fortran_comment(line):
    """
    Return ``line`` without its trailing ``!`` comment.

    A ``!`` that appears inside a single- or double-quoted string is kept,
    because there it is part of the string literal rather than a comment.
    """

    open_quote = None
    for pos, char in enumerate(line):
        if open_quote is not None:
            if char == open_quote:
                open_quote = None
        elif char in ('"', "'"):
            open_quote = char
        elif char == '!':
            return line[:pos]
    return line


def extract_all_hist_calls(filename):
    """
    First step: Extract all instances of "call hist_addfld1d" or "call hist_addfld2d" calls.

    The file is scanned line by line until the end or until a line containing
    the marker ``![terminate]`` is reached. ``!`` comments are removed before
    matching, so commented-out calls are ignored and a trailing comment after
    a continuation ``&`` neither hides the continuation nor ends up in the
    extracted text.

    Args:
        filename: Path of the Fortran source file to scan.

    Returns:
        A list of dicts, one per call, with the keys:
        ``full_call`` (the statement joined onto one line, ``&`` removed and
        whitespace collapsed), ``line_start`` / ``line_end`` (1-based line
        numbers of the first and last line of the statement) and
        ``call_type`` (``"hist_addfld1d"`` or ``"hist_addfld2d"``).
    """

    with open(filename, 'r') as file:
        lines = file.readlines()

    call_pattern = re.compile(r'call\s+hist_addfld[12]d\s*\(', re.IGNORECASE)

    hist_calls = []
    i = 0

    while i < len(lines):
        # The termination marker is itself a comment, so test the raw line.
        if '![terminate]' in lines[i]:
            print(f"Found termination marker at line {i+1}. Stopping extraction.")
            break

        # Match on code only, so commented-out calls are not picked up.
        if not call_pattern.search(_strip_fortran_comment(lines[i])):
            i += 1
            continue

        # Collect the complete (possibly multi-line) statement.
        statement_lines = []
        j = i
        while j < len(lines):
            current_line = _strip_fortran_comment(lines[j]).rstrip()
            statement_lines.append(current_line)

            # An explicit continuation marker always pulls in the next line.
            if current_line.endswith('&'):
                j += 1
                continue

            # No continuation marker: the statement is complete once the
            # parentheses balance; otherwise keep reading lines.
            clean_statement = ' '.join(statement_lines).replace('&', '')
            if clean_statement.count('(') - clean_statement.count(')') <= 0:
                break
            j += 1

        # If the file ended in the middle of a statement, j == len(lines);
        # clamp so that line_end never points past the last line.
        last_index = min(j, len(lines) - 1)

        # Join all lines, drop & characters and normalize whitespace.
        full_call = ' '.join(statement_lines)
        full_call = re.sub(r'\s*&\s*', ' ', full_call)
        full_call = re.sub(r'\s+', ' ', full_call.strip())

        hist_calls.append({
            'full_call': full_call,
            'line_start': i + 1,
            'line_end': last_index + 1,
            'call_type': "hist_addfld2d" if "hist_addfld2d" in full_call.lower() else "hist_addfld1d"
        })

        # Resume scanning on the line after this call.
        i = j + 1

    return hist_calls

def extract_parameters_from_call(call_text):
    """
    Second step: pull the keyword arguments of interest out of one
    hist_addfld call.

    Looks up ``fname``, ``units``, ``long_name`` and ``default`` via
    ``extract_parameter_value``. When the lookup for ``default`` comes back
    as ``'NOT_FOUND'`` the value ``'active'`` is reported instead.

    Returns a dict with the keys ``fname``, ``units``, ``long_name`` and
    ``default``.
    """

    extracted = {
        key: extract_parameter_value(call_text, key)
        for key in ('fname', 'units', 'long_name', 'default')
    }

    # No usable default= argument in the call: report the field as active.
    if extracted['default'] == 'NOT_FOUND':
        extracted['default'] = 'active'

    return extracted

def extract_parameter_value(text, parameter_name):
    """
    Extract the value of a specific parameter from the function call text.
    Handles string concatenation with //.

    The keyword is matched case-insensitively as a whole name: it must not
    be preceded by another word character, so asking for ``name`` does not
    match the tail of ``long_name=``. The name is escaped before being put
    into the pattern, so regex metacharacters in it are taken literally.

    Args:
        text: The hist_addfld call as a single string.
        parameter_name: Keyword whose value should be returned.

    Returns:
        The processed string value, or ``"NOT_FOUND"`` when the keyword is
        absent or its value cannot be interpreted.
    """

    # keyword, optional spaces, '=', optional spaces; (?<!\w) rejects matches
    # that start in the middle of a longer identifier.
    pattern = rf"(?<!\w){re.escape(parameter_name)}\s*=\s*"
    match = re.search(pattern, text, re.IGNORECASE)

    if not match:
        return "NOT_FOUND"

    # The value starts right after the '=' (and any following spaces).
    start_pos = match.end()

    # Find the end of the value (next comma or closing parenthesis at the same nesting level)
    value_end = find_parameter_value_end(text, start_pos)

    # Defensive guard kept from the original code for a -1 "no end" result.
    if value_end == -1:
        return "NOT_FOUND"

    value_part = text[start_pos:value_end].strip()

    # Process the value (handle string concatenation if present)
    return process_string_value(value_part)

def find_parameter_value_end(text, start_pos):
    """
    Locate where the parameter value beginning at ``start_pos`` stops.

    Scans forward and returns the index of the first comma or closing
    parenthesis that is outside any quoted string and outside any
    parentheses opened since ``start_pos``. Returns ``len(text)`` when no
    such terminator exists.
    """

    depth = 0          # parentheses opened inside the value so far
    open_quote = None  # the quote character we are currently inside, if any

    for index in range(start_pos, len(text)):
        symbol = text[index]

        # Inside a string only the matching quote character matters.
        if open_quote is not None:
            if symbol == open_quote:
                open_quote = None
            continue

        if symbol == '"' or symbol == "'":
            open_quote = symbol
        elif symbol == '(':
            depth += 1
        elif symbol == ')':
            # A ')' with nothing open closes the enclosing call: value ends.
            if depth == 0:
                return index
            depth -= 1
        elif symbol == ',' and depth == 0:
            return index

    return len(text)

def process_string_value(value_part):
    """
    Process a string value, handling Fortran string concatenation with //.

    Without a ``//`` in the value, the content of the first quoted string is
    returned. The closing quote must be the same character as the opening
    one, so the other quote character may appear inside the string
    (e.g. ``"plant's C"``). Values containing ``//`` are handed to
    ``handle_string_concatenation``.

    Returns ``"NOT_FOUND"`` when the value holds no quoted string.
    """

    # Remove leading/trailing whitespace
    value_part = value_part.strip()

    # Concatenated expression: delegate.
    if '//' in value_part:
        return handle_string_concatenation(value_part)

    # Simple case: \1 forces the closing quote to match the opening quote.
    string_match = re.search(r"""(['"])(.*?)\1""", value_part, re.DOTALL)
    return string_match.group(2) if string_match else "NOT_FOUND"

def handle_string_concatenation(value_part):
    """
    Evaluate a Fortran ``//`` concatenation expression as far as possible.

    The expression is split at every ``//`` that lies outside a quoted
    string. Each non-blank operand is passed to
    ``process_single_string_part``; operands for which that returns
    ``"NOT_FOUND"`` are skipped. The remaining pieces are joined in order.

    Returns ``"NOT_FOUND"`` when no operand yields a piece.
    """

    # Pass 1: cut the expression into operands at top-level '//' operators.
    segments = []
    buffer = []
    open_quote = None
    pos = 0
    length = len(value_part)

    while pos < length:
        ch = value_part[pos]

        if open_quote is not None:
            # Inside a string: copy verbatim, watch for the closing quote.
            buffer.append(ch)
            if ch == open_quote:
                open_quote = None
            pos += 1
        elif ch in ('"', "'"):
            open_quote = ch
            buffer.append(ch)
            pos += 1
        elif value_part.startswith('//', pos):
            # Operator found: close the current operand, skip both slashes.
            segments.append(''.join(buffer))
            buffer = []
            pos += 2
        else:
            buffer.append(ch)
            pos += 1

    segments.append(''.join(buffer))

    # Pass 2: translate each operand and keep the ones that are recognised.
    pieces = []
    for segment in segments:
        trimmed = segment.strip()
        if not trimmed:
            continue
        piece = process_single_string_part(trimmed)
        if piece != "NOT_FOUND":
            pieces.append(piece)

    return ''.join(pieces) if pieces else "NOT_FOUND"

def process_single_string_part(part):
    """
    Process a single part of a string expression.

    Returns:
        * the content of the first quoted string in ``part``; the closing
          quote must be the same character as the opening one, so the other
          quote character may appear inside the string;
        * ``"[name]"`` when ``part`` is a bare identifier (taken to be a
          variable whose value is not known here);
        * ``"NOT_FOUND"`` for anything else.
    """

    part = part.strip()

    # Quoted string: \1 forces the closing quote to match the opening quote.
    string_match = re.search(r"""(['"])(.*?)\1""", part, re.DOTALL)
    if string_match:
        return string_match.group(2)

    # If not a quoted string, it might be a variable - mark it
    if re.match(r'^[a-zA-Z_]\w*$', part):
        return f"[{part}]"

    return "NOT_FOUND"

def process_hist_calls(hist_calls):
    """
    Turn the extracted call records into result rows.

    For every record, the parameters are read from its ``full_call`` text
    with ``extract_parameters_from_call`` and combined with the record's
    ``call_type`` and line numbers.

    Returns a list of dicts with the keys ``call_type``, ``fname``,
    ``units``, ``long_name``, ``default`` and ``line_range``
    (formatted as ``"<line_start>-<line_end>"``).
    """

    def _to_row(call_info):
        # Parse the call text first, then merge with the call's metadata.
        parameters = extract_parameters_from_call(call_info['full_call'])
        return {
            'call_type': call_info['call_type'],
            'fname': parameters['fname'],
            'units': parameters['units'],
            'long_name': parameters['long_name'],
            'default': parameters['default'],
            'line_range': f"{call_info['line_start']}-{call_info['line_end']}"
        }

    return [_to_row(call_info) for call_info in hist_calls]

def write_simple_results(results, output_filename):
    """
    Write the summary table to ``output_filename``.

    The file is overwritten. It gets one header line followed by one line
    per entry of ``results``, numbered from 1, with left-aligned
    fixed-width columns for the running number, call type, field name,
    units and default state, and the long name last without padding.
    """

    # One layout shared by the header and every data row.
    row_format = "{:<4} {:<15} {:<30} {:<25} {:<15} {}\n"

    with open(output_filename, 'w') as out:
        out.write(row_format.format('#', 'Type', 'Field Name', 'Units', 'Default', 'Long Name'))

        for number, entry in enumerate(results, 1):
            out.write(row_format.format(
                number,
                entry['call_type'],
                entry['fname'],
                entry['units'],
                entry['default'],
                entry['long_name'],
            ))

# Main execution: extract the hist_addfld calls from the Fortran source,
# write the summary table to a text file and echo it to the console.
input_filename = "../../f90src/IOutils/HistDataType.F90"
output_filename = "hist_addfld_extraction_results.txt"

if not os.path.exists(input_filename):
    # Nothing to do without the input file.
    print(f"File not found: {input_filename}")
else:
    print(f"Processing file: {input_filename}")

    try:
        # Step 1: collect every hist_addfld call as a single-line statement.
        print("Step 1: Extracting all hist_addfld calls...")
        hist_calls = extract_all_hist_calls(input_filename)
        print(f"Found {len(hist_calls)} hist_addfld calls")

        # Step 2: pull the keyword arguments out of each call.
        print("Step 2: Extracting parameters from each call...")
        results = process_hist_calls(hist_calls)

        # Save the table to disk.
        write_simple_results(results, output_filename)
        print(f"Results written to: {output_filename}")

        # Show the same table on the console.
        print(f"\n{'#':<4} {'Type':<15} {'Field Name':<30} {'Units':<25} {'Default':<15} {'Long Name'}")
        for i, result in enumerate(results, 1):
            print(f"{i:<4} {result['call_type']:<15} {result['fname']:<30} {result['units']:<25} {result['default']:<15} {result['long_name']} ")

    except Exception as exc:
        # Top-level boundary of the script: report the failure with its
        # traceback instead of letting the error propagate.
        print(f"Error processing file: {exc}")
        import traceback
        traceback.print_exc()


    


Processing file: ../../f90src/IOutils/HistDataType.F90
Step 1: Extracting all hist_addfld calls...
Found termination marker at line 3447. Stopping extraction.
Found 525 hist_addfld calls
Step 2: Extracting parameters from each call...
Results written to: hist_addfld_extraction_results.txt

#    Type            Field Name                     Units                     Default         Long Name
1    hist_addfld1d   cumFIRE_CO2_col                gC m-2                    inactive        cumulative CO2 flux from fire (<0 into atmosphere) 
2    hist_addfld1d   cumFIRE_CH4_col                gC d-2                    inactive        cumulative CH4 flux from fire (<0 into atmosphere) 
3    hist_addfld1d   cNH4_LITR_col                  gN NH4/g litter           inactive        NH4 concentration in litter 
4    hist_addfld1d   cNO3_LITR_col                  gN NO3/g litter           inactive        NO3 concentration in litter 
5    hist_addfld1d   ECO_HVST_C_col                 gC/m2          