# Two-Pass CSV Reading Test
Testing an approach to handle multi-line CSV fields while preserving header/footer handling

In [17]:
import pandas as pd

In [2]:
def count_csv_commas(line):
    """Return the number of field-separating commas in a CSV record.

    The record is parsed by pandas, so commas inside double-quoted fields
    are not counted as separators.

    Args:
        line: Text of one CSV record

    Returns:
        int: Number of parsed columns minus one; 0 when the text holds no
        data or when parsing fails (the failure is printed)
    """
    from io import StringIO

    try:
        # header=None makes pandas treat the record as data, not as column names
        parsed = pd.read_csv(StringIO(line), quotechar='"', header=None)
    except pd.errors.EmptyDataError:
        # Nothing to parse: blank input has no separators
        return 0
    except Exception as e:
        # Any other failure is reported and counted as "no separators"
        print(f"Error parsing line: {e}")
        return 0
    else:
        column_total = parsed.shape[1]
        return column_total - 1

In [3]:
def get_complete_record(f):
    """Pull lines from ``f`` until the double quotes seen so far are balanced.

    Reading stops at the first line after which every opened quote has been
    closed, so a quoted field that spans several lines comes back as a
    single record.

    Args:
        f: Iterable of text lines (e.g. an open file) to read from

    Returns:
        tuple: (complete_record, start_line, last_line, is_empty)
        - complete_record: The joined lines with surrounding whitespace stripped
        - start_line: Index, relative to this call, of the first line read
          (-1 if nothing could be read)
        - last_line: Index, relative to this call, of the last line read
          (-1 if nothing could be read)
        - is_empty: True if the stripped record is blank, and also True
          whenever the input ran out before the quotes were balanced
    """
    pieces = []
    quote_open = False
    first_idx = None
    idx = -1  # stays -1 when the input yields no lines at all

    for idx, text in enumerate(f):
        if first_idx is None:
            first_idx = idx
        pieces.append(text)

        # An odd number of quote characters on this line flips the
        # inside/outside state; an even number leaves it as it was
        if text.count('"') % 2 == 1:
            quote_open = not quote_open

        if not quote_open:
            # Quotes are balanced, so the record is complete
            record = ''.join(pieces).strip()
            return record, first_idx, idx, not record

    # Input exhausted, either immediately or in the middle of a quoted field.
    # Hand back whatever was gathered, flagged as empty so callers can treat
    # it as the end of the CSV data.
    leftover = ''.join(pieces).strip()
    begin = idx if first_idx is None else first_idx
    return leftover, begin, idx, True

In [4]:
def find_csv_boundaries(file_path):
    """Find the start and end line numbers of the CSV section.

    The file is split into records with get_complete_record(), so a quoted
    field spanning several lines counts as one record. The first record
    containing at least one separating comma is taken as the header. The
    section ends at the first later record that is empty or whose comma
    count differs from the header's.

    Args:
        file_path: Path of the file to scan

    Returns:
        tuple: (start_line, end_line)
        - start_line: 0-based line number of the header record
          (0 if no header was found)
        - end_line: 0-based line number where the record that ended the
          section starts (i.e. the first line past the section), or None
          when the section runs to the end of the file
    """
    start_line = 0
    end_line = None  # None means "read to end"
    header_commas = None

    with open(file_path, 'r') as f:
        lines = f.readlines()

    # A single shared iterator replaces re-joining the remaining lines for
    # every record: get_complete_record() stops reading right after the
    # record it returns, so the next call resumes at the following line.
    remaining = iter(lines)
    offset = 0  # absolute line number of the next unread line
    while offset < len(lines):
        # Get next complete record (handles multi-line fields)
        record, start_num, end_num, is_empty = get_complete_record(remaining)
        print(f"RECORD: {record}")

        # The helper numbers lines from 0 on every call; shift to file positions
        start_num += offset
        end_num += offset
        offset = end_num + 1  # Move past this record

        print(f"Record from line {start_num} to {end_num} (empty: {is_empty})")

        if is_empty:
            if header_commas is not None:
                # An empty record after the header closes the CSV section
                end_line = start_num
                break
            # Still in the preamble before the header - skip it
            continue

        num_commas = count_csv_commas(record)
        print(f"Line {start_num}: {num_commas} commas found")  # Debug print

        if header_commas is None:
            if num_commas > 0:
                # First record with commas: remember its comma count as the
                # pattern every data record has to match
                header_commas = num_commas
                start_line = start_num
                print(f"Found header at line {start_num}")  # Debug print
        elif num_commas != header_commas:
            # Footer or other non-CSV content. Report where that record
            # starts, the same convention as the empty-record case above,
            # so the footer itself is never part of the section.
            end_line = start_num
            break

    return start_line, end_line

In [15]:
def find_csv_boundaries(file_path):
    """Find the start and end row numbers of the CSV section.

    The first non-empty row is taken as the header. The section ends at the
    first later row that is empty or has a different number of fields.

    Row numbers are 0-based positions in the sequence produced by
    ``csv.reader``, so a quoted field that spans several physical lines
    counts as a single row.

    Args:
        file_path: Path of the file to scan

    Returns:
        tuple: (start_line, end_line)
        - start_line: Row number of the header (0 if the file has no
          non-empty row)
        - end_line: Row number of the first row past the section, or None
          when the section runs to the end of the file
    """
    # Imported locally so the function works even if the cell that imports
    # csv at module level has not been run yet
    import csv

    start_line = 0
    end_line = None
    header_field_count = None

    with open(file_path, 'r', newline='') as f:  # newline='' is important for CSV
        for i, row in enumerate(csv.reader(f)):
            # Skip empty rows before header
            if header_field_count is None and not row:
                continue

            field_count = len(row)
            print(f"Line {i}: {field_count} fields found")  # Debug

            if header_field_count is None:
                # First non-empty row becomes our header
                header_field_count = field_count
                start_line = i
                print(f"Found header at line {i}")  # Debug
            elif field_count != header_field_count:
                # An empty row has 0 fields and the header has at least 1,
                # so this one comparison covers both "blank row" and
                # "different field count"
                end_line = i
                break

    return start_line, end_line

In [None]:
# Test with Fidelity IRA file
file_path = "../data/accounts/fidelity-ira-jmm.csv"
start_line, end_line = find_csv_boundaries(file_path)
print(f"\nCSV section found from line {start_line} to {'end' if end_line is None else end_line}")

# Read the CSV section with pandas for type conversion.
# start_line is the header row and end_line is the first row past the
# section, so the data spans one row fewer than their difference
# (nrows counts data rows only, not the header).
data = pd.read_csv(file_path,
                  skiprows=start_line,
                  nrows=None if end_line is None else end_line - start_line - 1)
data

In [None]:
# Test with Cigna HSA file
file_path = "../data/accounts/cigna-hsa-jmm.csv"
start_line, end_line = find_csv_boundaries(file_path)
print(f"\nCSV section found from line {start_line} to {'end' if end_line is None else end_line}")

# Read the CSV section with pandas
if end_line is None:
    # No end marker found: read to end of file
    data_row_count = None
else:
    # start_line is the header row and end_line is the first row past the
    # section; nrows counts data rows only, so the header is subtracted
    data_row_count = end_line - start_line - 1
    print(f"Reading {data_row_count} rows")  # Debug print

data = pd.read_csv(file_path,
                  skiprows=start_line,
                  nrows=data_row_count,
                  quotechar='"')
data

In [31]:
import csv

# Pick the account export to load (exactly one path is active)
# file_path = "../data/accounts/wealthfront-ira-jsm.csv"
# file_path = "../data/accounts/vanguard-ira-jmm.csv"
file_path = "../data/accounts/troweprice-401k-jmm.csv"
# file_path = "../data/accounts/mndcp-457b-jsm.csv"
# file_path = "../data/accounts/fidelity-ira-jmm.csv"
# file_path = "../data/accounts/cigna-hsa-jmm.csv"

# newline='' leaves line-ending handling to the csv module, so newlines
# embedded in quoted fields are read back unchanged
with open(file_path, 'r', newline='') as file:
    rows = list(csv.reader(file))

# The first row supplies the column names; every following row is data
df = pd.DataFrame(rows[1:], columns=rows[0])

In [33]:
def is_header_row(row):
    """Check if row contains required column headers.

    A header row needs at least one symbol-type column and at least one
    quantity-type column. Both checks ignore letter case.

    Args:
        row: List of column names to check

    Returns:
        bool: True if row contains required columns
    """
    symbol_names = {'SYMBOL', 'INVESTMENT'}
    quantity_names = {'QUANTITY', 'SHARES', 'UNIT/SHARE OWNED'}

    # Upper-case every cell once so both lookups are case-insensitive
    cols_upper = {col.upper() for col in row}

    has_symbol = not cols_upper.isdisjoint(symbol_names)
    has_quantity = not cols_upper.isdisjoint(quantity_names)

    return has_symbol and has_quantity


In [34]:

def read_csv_rows(file_path):
    """Extract the header row and the data rows beneath it from a CSV file.

    Blank rows are ignored throughout. Rows are skipped until one passes
    is_header_row(); after that, rows are collected for as long as they have
    the same number of fields as the header.

    Args:
        file_path: Path to CSV file

    Returns:
        tuple: (header_row, data_rows)
        - header_row: List of column names
        - data_rows: List of lists containing data values

    Raises:
        ValueError: If no row in the file passes is_header_row()
    """
    header = None
    data_rows = []

    with open(file_path, 'r', newline='') as f:
        # Lazily drop blank rows; both passes below share this one stream
        non_blank = (fields for fields in csv.reader(f) if fields)

        # Consume rows up to and including the header (or the whole file)
        header = next((fields for fields in non_blank if is_header_row(fields)), None)

        if header is not None:
            width = len(header)
            for fields in non_blank:
                if len(fields) != width:
                    # A row of a different width marks the footer
                    break
                data_rows.append(fields)

    if header is None:
        raise ValueError("Could not find header row with required columns")

    return header, data_rows

In [None]:

# Load one account export through read_csv_rows and show it as a DataFrame
# (exactly one path is active)
# file_path = "../data/accounts/wealthfront-ira-jsm.csv"
# file_path = "../data/accounts/vanguard-ira-jmm.csv"
# file_path = "../data/accounts/troweprice-401k-jmm.csv"
# file_path = "../data/accounts/mndcp-457b-jsm.csv"
# file_path = "../data/accounts/fidelity-ira-jmm.csv"
file_path = "../data/accounts/cigna-hsa-jmm.csv"
header, data_rows = read_csv_rows(file_path)

# Column names come from the detected header, values from the rows under it
df = pd.DataFrame(data=data_rows, columns=header)
df