In [None]:
# Install the packages this notebook relies on: xlrd and openpyxl are the
# engines selected in xls_to_df for .xls and .xlsx files; xlwt and pandas are
# imported in the next cell.
%pip install xlrd
%pip install xlwt
%pip install pandas
%pip install openpyxl

In [None]:
import numpy as np
import pandas as pd
import os
import xlrd
import xlwt
import re

In [None]:
# Keep only regular files: os.listdir also returns sub-directories, which
# cannot be opened as workbooks. This matches the listing used by the batch
# run further down.
excel_files = [f for f in os.listdir("data/") if os.path.isfile(os.path.join("data/", f))]

In [None]:
# Display the directory listing in the order os.listdir returned it.
excel_files

In [None]:
def extract_rdo_number(filename):
    """Return the numeric RDO code found in a workbook filename, for use as a sort key.

    The name is expected to contain "RDO No. <digits>[letter] - <label>"
    (case-insensitive). Any trailing letter after the digits is ignored, so
    "113A" yields 113.

    Args:
        filename: File name to inspect.

    Returns:
        The RDO number as an int, or float('inf') when the name does not match
        (so unmatched names sort last) or the number cannot be converted.
    """
    rdo_pattern = r'RDO No\. (\d+)\w? - (.+)\.?(?:xls|xlsx)?'
    try:
        found = re.search(rdo_pattern, filename, re.IGNORECASE)
        if not found:
            return float('inf')
        # Only the digit group matters for ordering.
        return int(found.group(1))
    except (ValueError, IndexError) as e:
        print(f"Error processing filename: {filename} - {e}")
        return float('inf')

# Order the workbooks numerically by RDO number; names that do not match the
# RDO pattern get float('inf') as their key and therefore sort last.
sorted_files = sorted(excel_files, key=extract_rdo_number)

In [None]:
# Display the RDO-ordered file list.
sorted_files

In [None]:
# Plain lexicographic sort for comparison with the RDO-number ordering above.
# Note: list.sort() mutates excel_files in place.
excel_files.sort()
excel_files

In [None]:
def xls_to_df(filename, base_dir="data/"):
    """Load the highest-numbered "Sheet..." worksheet of a workbook as a raw DataFrame.

    Only worksheets whose stripped, lower-cased name starts with "sheet" are
    considered. They are ordered by the first integer in their name and the
    last one is read without a header row.

    Args:
        filename: Workbook file name, relative to ``base_dir``.
        base_dir: Directory that contains the workbook.

    Returns:
        A DataFrame with ``header=None`` (integer column labels), or ``None``
        when the workbook has no worksheet whose name starts with "sheet".
    """
    filepath = os.path.join(base_dir, filename)

    # xlrd handles legacy .xls workbooks; every other file goes to openpyxl.
    engine = 'xlrd' if filename.lower().endswith('.xls') else 'openpyxl'

    def sheet_order(name):
        # A sheet called just "Sheet" has no digits; rank it first instead of
        # crashing on re.search(...) returning None.
        digits = re.search(r'\d+', name)
        return int(digits.group()) if digits else -1

    # The context manager closes the workbook handle even if parsing fails.
    with pd.ExcelFile(filepath, engine=engine) as workbook:
        candidates = sorted(
            (name for name in workbook.sheet_names if name.strip().lower().startswith('sheet')),
            key=sheet_order,
        )
        if not candidates:
            print(f"No matching sheets found in {filename}")
            return None
        # Parse from the already-open workbook rather than re-opening the file.
        return workbook.parse(sheet_name=candidates[-1], header=None)

In [None]:
def clean_value(value, feature=False):
    """Normalise one spreadsheet cell to a rounded number or a tidied string.

    Finite numeric input (numbers or numeric strings) is returned as a float
    rounded to 3 decimals. Missing input (``None`` or NaN) becomes ``''``.
    Anything else is treated as text and stripped of:

    * a leading ":" separator,
    * a trailing "D.O. No ..." / "Effectivity Date ..." remark (skipped when
      ``feature`` is true),
    * a leading "No. <n> - " prefix,
    * "(cont.)", "continued", "continuation" markers (including the
      "continaution" misspelling),
    * trailing whitespace and underscores.

    Args:
        value: Raw cell value of any type.
        feature: When true, keep "D.O. No" / "Effectivity Date" text intact.

    Returns:
        A float for finite numeric input, otherwise a cleaned string.
    """
    if value is None:
        return ''

    try:
        float_value = float(value)
    except (ValueError, TypeError):
        float_value = None

    if float_value is not None:
        if np.isfinite(float_value):
            return round(float_value, 3)
        # float() happily parses NaN, so missing cells must be caught here;
        # otherwise they would be returned as nan instead of ''.
        if np.isnan(float_value):
            return ''
        # +/-inf (e.g. the text "Infinity") is not a real number in this data;
        # fall through and keep it as text.

    text = str(value)
    if text == 'nan':
        return ''

    text = re.sub(r"^\s*:\s*", "", text.strip())
    if not feature:
        text = re.sub(r"(D\.?\s*O\s*\.?\s*No|Effec(?:t)?ivity Date)\s*.*", "", text, flags=re.IGNORECASE).strip()
    text = re.sub(r'^no\.\s*\d+\s*-\s*', '', text, flags=re.IGNORECASE).strip()
    text = re.sub(r"\s*-*\s*(\s*\(cont\s*\.\)|(?:\()?\s*continued\s*(?:\)?)|(?:\()?\s*continuation\s*(?:\))?|(?:\()?\s*continaution\s*(?:\))?)", "", text, flags=re.IGNORECASE).strip()
    text = re.sub(r'[\s_]+$', '', text)
    return text

In [None]:
def extract_value(pattern, text):
    """Return the stripped first capture group of a case-insensitive search.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: String to search.

    Returns:
        Group 1 of the first match with surrounding whitespace removed, or
        ``None`` when the pattern does not match.
    """
    hit = re.search(pattern, text, re.IGNORECASE)
    return hit.group(1).strip() if hit else None

In [None]:
def find_column_headers(df, index, proximity_window=6, debug=False):
    """Locate the table-header columns in the rows starting at ``index``.

    Rows are scanned one at a time. The text of each column is accumulated
    across the scanned rows (joined with spaces), so a header split over
    several stacked cells can still match. Four headers are looked for:
    street name / subdivision / condominium, vicinity, classification, and the
    zonal-value ("ZV ... SQ ... M", "<n>th Revision", "FINAL") column.

    Args:
        df: Raw sheet as a DataFrame; rows are accessed positionally.
        index: Row position at which the scan starts.
        proximity_window: Upper bound for the ``offset`` loop counter. The
            counter is pushed back when the search is extended, so more rows
            than this may be read.
        debug: Print tracing information when true.

    Returns:
        ``(True, headers, row)`` when all four headers were found, where
        ``headers`` maps each ``*_index`` key to a column position and ``row``
        is ``index`` (as adjusted during the scan) plus the largest recorded
        offset. Otherwise ``(False, None, index)`` with the adjusted ``index``.
    """
    import re
    
    # Column position of each header, filled in as patterns match.
    headers = {
        'street_name_index': None,
        'vicinity_index': None,
        'classification_index': None,
        'zv_sq_m_index': None
    }

    # Loop offset at which each header was recorded (-1 = not found yet).
    headers_max_offset = {
        'street_name_index': -1,
        'vicinity_index': -1,
        'classification_index': -1,
        'zv_sq_m_index': -1
    }

    # col_index -> text of that column concatenated over all rows read so far.
    column_texts = {}
    extend_search = False
    offset = 0

    # First zonal-value match and the offset at which it was seen.
    zv_pattern_holder = None
    zv_offset_holder = None
    
    # NOTE(review): assigned but never read in this function.
    classification_pattern_holder = None
        
    while offset < proximity_window:
        current_index = index + offset
        if current_index >= len(df):
            break
        
        row = df.iloc[current_index]

        # Append this row's cells to the running per-column text.
        for col_index, cell in enumerate(row):
            cell_value = str(cell)
            if col_index not in column_texts:
                column_texts[col_index] = cell_value
            else:
                column_texts[col_index] += ' ' + cell_value

        if debug:
            print(f"Row {current_index}: {column_texts}")

        # Check each column's combined text for header patterns
        for col_index, combined_text in column_texts.items():
            # Street / subdivision / condominium: first matching column wins.
            # The \s* between letters tolerates letter-spaced headers.
            if headers['street_name_index'] is None:
                if re.search(
                    r"(S\s*T\s*R\s*E\s*E\s*T\s*N\s*A\s*M\s*E|"
                    r"S\s*U\s*B\s*D\s*I\s*V\s*I\s*S\s*I\s*O\s*N|"
                    r"C\s*O\s*N\s*D\s*O\s*M\s*I\s*N\s*I\s*U\s*M)",
                    combined_text, re.IGNORECASE):
                    headers['street_name_index'] = col_index
                    headers_max_offset['street_name_index'] = offset
                    if debug:
                        print(f"max offset updated: {current_index}")

            if headers['vicinity_index'] is None:
                if re.search(r"V\s*I\s*C\s*I\s*N\s*I\s*T\s*Y", combined_text, re.IGNORECASE):
                    headers['vicinity_index'] = col_index
                    headers_max_offset['vicinity_index'] = offset
                    if debug:
                        print(f"max offset updated: {current_index}")

            if headers['classification_index'] is None:
                if re.search(
                    r"CLASS(?:IFICATION)?|"
                    r"C\s*L\s*A\s*S\s*S\s*I\s*F\s*I\s*C\s*A\s*T\s*I\s*O\s*N",
                    combined_text, re.IGNORECASE | re.DOTALL):
                    headers['classification_index'] = col_index
                    headers_max_offset['classification_index'] = offset
                    if debug:
                        print(f"max offset updated: {current_index}")
                    extend_search = True  # Flag to extend the search

            # Zonal value: unlike the other headers, a match in a column to the
            # right of the current one replaces it, so the right-most matching
            # column is kept.
            if headers['zv_sq_m_index'] is None or headers['zv_sq_m_index'] < col_index:
                zv_pattern = (
                    r"\d+(?:ST|ND|RD|TH)\s+(?:REVISION|Rev)(?:.*Z\.?V\.?.*SQ.*M\.?)?|"
                    r"(?:\d+(?:ST|ND|RD|TH)\s+REVISION|Rev\s+ZV\s+/?.*SQ\.?\s*M\.?)|"
                    r"(?:Z|2)\.?V\.?.*SQ.*M\.?|FINAL"
                )
                match = re.search(zv_pattern, combined_text, re.IGNORECASE) 
                if match:
                    headers['zv_sq_m_index'] = col_index
                    headers_max_offset['zv_sq_m_index'] = offset
                    if debug:
                        print(f"max offset updated: {current_index}")
                    
                    if not zv_pattern_holder: # if this is the first one
                        # The very first match is only remembered: the index is
                        # cleared again and the scan is extended so that a later
                        # pass has to confirm the column.
                        zv_pattern_holder = match
                        zv_offset_holder = offset
                        headers['zv_sq_m_index'] = None
                        extend_search = True  # extend the search
                    # NOTE(review): each re.search call returns a new Match
                    # object, so this equality looks like it can never be true
                    # and the branch appears dead - confirm the intent (perhaps
                    # comparing match.group() was meant).
                    elif zv_pattern_holder == match: # if new pattern is the same, get previous values
                        headers_max_offset['zv_sq_m_index'] = zv_offset_holder
                        
                    # if the new match is different
                    

        if extend_search:
            if debug:
                print("Extending search")
            # Together with the `offset += 1` below, the next row read is still
            # current_index + 1, but offset ends one lower than before, which
            # grants the loop one additional row.
            offset -= 2
            index += 2
            extend_search = False

        offset += 1

    # If all headers were found, determine the maximum offset used
    if all(value is not None for value in headers.values()):
        max_offset_used = max(headers_max_offset.values())
        if debug:
            print(f"Headers found within proximity window up to row {index + max_offset_used}")
            print(f"Header indices: {headers}")
        return True, headers, index + max_offset_used
    else:
        if debug:
            print(f"Headers not found within proximity window starting at index {index}")
        return False, None, index

In [None]:
def is_header_row(row):
    """Tell whether a single row contains all four table-header labels.

    Every cell is converted with ``str`` and tested case-insensitively. The
    row qualifies only if each of the four patterns (street / subdivision /
    condominium, vicinity, classification, zonal value) matches at least one
    cell; one cell may satisfy several patterns.

    Args:
        row: Iterable of cell values.

    Returns:
        True when all four header patterns are present, otherwise False.
    """
    required_patterns = (
        r"(S\s*T\s*R\s*E\s*E\s*T\s*N\s*A\s*M\s*E|S\s*U\s*B\s*D\s*I\s*V\s*I\s*S\s*I\s*O\s*N|C\s*O\s*N\s*D\s*O\s*M\s*I\s*N\s*I\s*U\s*M)",
        r"V.*I.*C.*I.*N.*I.*T.*Y",
        r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N",
        r"ZV.*SQ.*M|3rd\s*Rev|FINAL",
    )

    # Materialise once so one-shot iterables can be tested against each pattern.
    cell_texts = [str(cell) for cell in row]

    for pattern in required_patterns:
        if not any(re.search(pattern, text, re.IGNORECASE) for text in cell_texts):
            return False
    return True

In [None]:
# Scratch experiment mirroring the "extend search" index arithmetic used by the
# header and location scanners. `do` is never set to True here, so the
# adjustment branch does not run and the loop prints ca = 7..11 for off = 0..4.
do = False
off = 0
a = 7
while off < 5:
    ca = a + off
    print(f"Ca, i: {ca}, {off}")
    if do:
        off -= 2
        ca += 2
        do = False
    off += 1

In [None]:
def find_location_components(df, index, proximity_window=3, current_province=None, current_city=None, current_barangay=None, debug=False):
    """Scan rows from ``index`` for province, city/municipality and barangay labels.

    Two layouts are handled:

    * Combined label: a cell matching "PROVINCE / CITY / MUNICIPALITY /
      BARANGAYS". Cells beginning with ":" on that row and the following rows
      are then assigned, in order, to whichever of province, city and barangay
      is still empty.
    * Separate labels: each row's non-null cells are concatenated and searched
      for "Province ...", "City|Municipality ..." and "Barangay(s)|Zone ..."
      patterns. Rows whose text starts with "district" are skipped.

    Args:
        df: Raw sheet as a DataFrame; rows are accessed positionally.
        index: Row position at which scanning starts.
        proximity_window: Upper bound for the ``offset`` loop counter; the
            counter is pushed back after a match, so more rows may be read.
        current_province: Starting province value (overwritten on a match).
        current_city: Starting city/municipality value.
        current_barangay: Starting barangay value.
        debug: Print tracing information when true.

    Returns:
        ``(province, city, barangay, row_index)``. ``row_index`` is the last
        row on which something matched, or the starting row in the fall-back
        returns that use ``initial_index``.
    """
    if debug:
        print(f"\nRunning find_location_components")
    last_matched_index = index
    initial_index = index
    expecting_values = False  # Flag to indicate we are expecting values in subsequent rows after combined labels
    found_any = False  # Flag to check if any location component is found
    
    extend_search = False
    offset = 0
    
    
    # Row positions at which each component was matched (separate-label mode).
    province_index = None
    city_index = None
    barangay_index = None
    # Values set aside when a barangay/city row appears before the province row.
    barangay_holder = None
    city_holder = None
    
    while offset < proximity_window:
        
        current_index = index + offset
        if current_index >= len(df):
            break
        current_row = df.iloc[current_index]
        # Non-null cells joined without a separator; used by the label regexes.
        combined_current_row = ''.join(map(str, current_row.dropna())).strip()
        non_null_cells = current_row.dropna().astype(str).tolist()
        
        if debug:
            print("\n")
            # print(f"Row {current_index}: {non_null_cells}")
            # print(f"Expeting values: {expecting_values}")
            print(f"Searching row: {offset+1}/{proximity_window}")

        # Check if this row contains the combined labels
        if not expecting_values and any(re.search(r"PROVINCE\s*/\s*CITY\s*/\s*MUNICIPALITY\s*/\s*BARANGAYS", cell, re.IGNORECASE) for cell in non_null_cells):
            expecting_values = True
            if debug:
                print(f"Combined labels found at row {current_index}")
            
            # offset is deliberately not advanced here, so the same row is
            # re-read on the next pass, this time by the expecting_values branch.
            # offset += 1
            continue  # Move to the next row to read values

        # If we're expecting values after combined labels
        if expecting_values:
            # Iterate over cells to find values starting with ":"
            for cell in non_null_cells:
                cell = cell.strip()
                if debug:
                    print(f"Cell: {non_null_cells}")
                    
                if cell.startswith(":"):
                    value = cell.lstrip(":").strip()
                    # Fill the first still-empty slot: province, then city,
                    # then barangay.
                    if not current_province:
                        current_province = clean_value(value)
                        found_any = True
                        if debug:
                            print(f"Province found: {current_province}")
                    elif not current_city:
                        current_city = clean_value(value)
                        found_any = True
                        if debug:
                            print(f"City/Municipality found: {current_city}")
                    elif not current_barangay:
                        current_barangay = clean_value(value)
                        found_any = True
                        if debug:
                            print(f"Barangay found: {current_barangay}")
            last_matched_index = current_index
            # If all components have values (either found now or already had values), we can return
            # if (current_province and current_city and current_barangay) or offset == proximity_window - 1:
            # The list holds a single element (the whole `and` chain), so this
            # is just a truthiness test of that conjunction.
            if all([current_province and current_city and current_barangay]):
                return current_province, current_city, current_barangay, last_matched_index
            if offset == proximity_window - 1:
                return current_province, current_city, current_barangay, initial_index
            
            offset += 1
            continue  # Continue to next row to find remaining components
        
        
        # Original logic for separate labels
        else:
            if combined_current_row.lower().startswith("district"):
                if debug:
                    print(f"Skipping row {current_index} as it starts with 'district'") 
                offset += 1
                continue
            # Check for Province
            province = extract_value(r"Province\s*(?::|\s|of)?\s*(.*)", combined_current_row)
            if province:
                current_province = clean_value(province)
                found_any = True
                extend_search = True
                last_matched_index = initial_index = province_index = current_index
                if debug:
                    print(f"Province match found in row {current_index}: {current_province}")

            # Check for City/Municipality
            # The (?!City,) lookahead rejects a "City" that is immediately
            # followed by a comma.
            city = extract_value(r"(?:(?!City,)(?:City|Municipality))(?:\s*\/\s*(?:City|Municipality))?\s*[:\s]?\s*(.+)", combined_current_row)
            if city:
                current_city = clean_value(city)
                found_any = True
                extend_search = True
                last_matched_index = initial_index = city_index = current_index
                if debug:
                    print(f"City/Municipality match found in row {current_index}: {current_city}")

            # Check for Barangay/Zone
            barangay = extract_value(r"(?:Barangays|Zone|Barangay)(?:\s*\/\s*(?:Barangays|Zone|Barangay))?\s*[:\s]?\s*(.+)", combined_current_row)
            # Check if the extracted barangay value contains a phrase like "along barangay road"
            if barangay and re.search(r".*\s*(?:along\s*)?barangay.*road.*", combined_current_row, re.IGNORECASE):
                # print(f"Discarding match due to 'along barangay road' pattern: {barangay}")
                barangay = None
            if barangay:
                current_barangay = clean_value(barangay)
                found_any = True
                extend_search = True
                last_matched_index = initial_index = barangay_index = current_index
                if debug:
                    print(f"Barangay/Zone match found in row {current_index}: {current_barangay}")
           
            if extend_search:
                # print("Extending search")
                # Combined with the `offset += 1` at the bottom of the loop, the
                # next row read is current_index + 1 while offset ends one
                # lower, which grants the loop one additional row.
                offset -= 2
                index += 2
                extend_search = False     

            # If we've found any component, we can check if we've reached the proximity window or if all components are found
            if found_any and all([current_province and current_city and current_barangay]):
                # if barangay index is before province index, look for a province pa, and if we find, overwrite
                if barangay_index is not None and province_index is not None and barangay_index < province_index and not barangay_holder:
                    if debug:
                        print("Extending search for new baranagay")
                    # Set the earlier barangay aside and keep scanning for one
                    # that follows the province row.
                    barangay_holder = current_barangay
                    current_barangay = None
                    offset -= 1
                    index += 2
                    continue
                # Similarly, if city index is before province index, look for a province and overwrite
                if city_index is not None and province_index is not None and city_index < province_index and not city_holder:
                    if debug:
                        print("Extending search for new city")
                    city_holder = current_city
                    current_city = None
                    offset -= 1
                    index += 2
                    continue
                if debug:
                    print(f"Found all location components! Last matched index: {last_matched_index}")
                return current_province, current_city, current_barangay, last_matched_index            
            
            if offset == proximity_window - 1:
                if found_any:
                   return current_province, current_city, current_barangay, last_matched_index   
                # NOTE(review): the holders are only filled after found_any has
                # become True and found_any is never reset, so these two
                # fall-backs look unreachable - confirm whether the held values
                # were meant to be restored before the found_any return.
                if barangay_holder:
                    current_barangay = barangay_holder
                    return current_province, current_city, current_barangay, last_matched_index
                if city_holder:
                    current_city = city_holder
                    return current_province, current_city, current_barangay, last_matched_index
                return current_province, current_city, current_barangay, initial_index
            
            # if extend_search:
            #     # print("Extending search")
            #     offset -= 2
            #     index += 2
            #     extend_search = False
        
        offset += 1
        # Stop early when nothing has been recognised so far.
        if not expecting_values and not found_any:
            break

    return current_province, current_city, current_barangay, last_matched_index


# Code for Standard Format

The Standard Format has the following properties:

- Each table has a complete Province, City/Municipality, and Barangay, given in consecutive rows.
                
- Table headers can be matched with these regex patterns:

          r"(STREET NAME|SUBDIVISION|CONDOMINIUM)",
          r"V.*I.*C.*I.*N.*I.*T.*Y",
          r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N",
          r"ZV.*SQ.*M|3rd\s*Rev"

In [None]:
def main(df, debug=False, start=0, end=-1, debug_location=False, debug_header=False):
    """Extract every zonal-value table found in a raw sheet into one tidy DataFrame.

    The sheet is walked row by row. At each position the function looks for
    location labels (``find_location_components``) and then for a header block
    (``find_column_headers``). When both are found, the rows that follow are
    read as table data until either ``MAX_AGE`` consecutive rows are rejected
    as empty / dash-only, or a new location + header block is detected (which
    starts the next table in place).

    Blank street and vicinity cells are filled from the most recent non-blank
    value of the current table, or from the previous appended row when the
    table is flagged as a continuation (same province as the preceding table).

    Args:
        df: Raw sheet as a DataFrame; rows and cells are accessed positionally.
        debug: Print row-level tracing.
        start: First row position to examine.
        end: Row position to stop before; -1 means ``len(df)``.
        debug_location: Forwarded as ``debug`` to ``find_location_components``.
        debug_header: Forwarded as ``debug`` to ``find_column_headers``.

    Returns:
        DataFrame with columns Province, City/Municipality, Barangay,
        Street/Subdivision, Vicinity, Classification and ZV/SQM, one row per
        accepted data row.
    """
    if end == -1:
        final_index = len(df)
    else:
        final_index = end
    index = start
    count = 0
    new_df = pd.DataFrame(columns=['Province', 'City/Municipality', 'Barangay', 
                                   'Street/Subdivision', 'Vicinity', 'Classification', 'ZV/SQM'])
    
    PROXIMITY_WINDOW = 2  # Increased to accommodate different formats
    
    # Location of the table currently being read.
    current_province = None
    current_city = None
    current_barangay = None
    header_indices = None
    
    continuation = False 
    # prev is previous table, holder is local table
    prev_col1 = None
    prev_vicinity = None
    # NOTE(review): prev_classification and prev_zvsqm are written below but
    # never read in this function.
    prev_classification = None
    prev_zvsqm = None

    # while index < len(df):
    while index < final_index:
        current_province_new, current_city_new, current_barangay_new, index  = find_location_components(
            df, index, proximity_window=PROXIMITY_WINDOW, debug=debug_location)
        # Update current location components with any new values
        
        found_components = any([current_province_new, current_city_new, current_barangay_new])
        if found_components and debug:
            print(f"Location components found: {current_province_new}, {current_city_new}, {current_barangay_new}")

        # Attempt to find headers starting from the last matched index
        found_headers, header_indices_new, new_index = find_column_headers(df, index, debug=debug_header)
        if debug:
            print(f"Column headers found: {header_indices_new}")
        
        # if we (kinda) confident we have a table
        if found_headers and found_components:
            # Same province as the previous table => treat as a continuation.
            if current_province_new == current_province:
                continuation = True
            else:
                continuation = False
            # Components that were not found keep their previous value.
            current_province = current_province_new if current_province_new else current_province
            current_city = current_city_new if current_city_new else current_city
            current_barangay = current_barangay_new if current_barangay_new else current_barangay
            
            # Update header indices
            header_indices = header_indices_new
            index = new_index  # Move index to after headers

            # Start processing data rows
            count += 1
            if debug:
                print(f'Processing table {count}\n')

            # age counts consecutive rejected rows; the table ends at MAX_AGE.
            age = 0
            MAX_AGE = 4
            # Last non-blank street / vicinity seen in this table.
            col1_holder = None
            vicinity_holder = None

            # Vicinity remembered across consecutive "ALL OTHER ..." rows.
            all_other_vicinity = None

            while index < final_index and age < MAX_AGE:
                # TODO: Check the types of all variables because some NaN stuff and floats and inconsistent and yeah
                row = df.iloc[index]
                
                # Sentinel: only survives if vicinity_index is neither int nor list.
                vicinity = 'Test u should not see this pop up pls'
                # Extract data using the header indices
                col1 = row.iloc[header_indices['street_name_index']]
                classification = row.iloc[header_indices['classification_index']]
                zv = row.iloc[header_indices['zv_sq_m_index']]
                
                # Check for double column
                # NOTE(review): find_column_headers, as defined in this
                # notebook, only ever stores a single int here; the list branch
                # looks unused - confirm.
                if isinstance(header_indices['vicinity_index'], int):
                    vicinity = row.iloc[header_indices['vicinity_index']]
                elif isinstance(header_indices['vicinity_index'], list):
                    vicinity1 = str(row.iloc[header_indices['vicinity_index'][0]])
                    vicinity2 = str(row.iloc[header_indices['vicinity_index'][1]])
                    if vicinity1 == 'nan':
                        vicinity = vicinity2
                    elif vicinity2 == 'nan':
                        vicinity = vicinity1
                    else:
                        vicinity = f"{vicinity1}, {vicinity2}"
                
                if debug:
                    print(f"Data row at index {index}: {[col1, vicinity, classification, zv]}")
                    # print(f'vicinity header index: {header_indices["vicinity_index"]}')
                    
                

                # Check for new location components in the current row
                current_province_new_in_row, current_city_new_in_row, current_barangay_new_in_row, new_index_2 = find_location_components(
                    df, index, proximity_window=PROXIMITY_WINDOW, debug=debug_location)
                # found_headers_in_row, header_indices_in_row, new_index_in_row = find_column_headers(df, index, debug=debug)
                
                # if col1 index is not zone/barangay pattern
                # if barangay index is before province index, look for a province pa, and if we find, overwrite
                found_headers_in_row, header_indices_in_row, new_index_in_row = find_column_headers(df, new_index_2, debug=debug_header)
                
                # Classification + ZV text of this row; empty/zero means the row
                # carries no table data and may therefore be a new table header.
                combined_row = ''.join(map(str, row[[header_indices['classification_index'], header_indices['zv_sq_m_index']]].dropna())).strip()
                valid_data_row = clean_value(combined_row)
                if debug and any([current_province_new_in_row, current_city_new_in_row, current_barangay_new_in_row]):
                    if current_province_new_in_row:
                        print(f"Province found: {current_province_new_in_row}")
                    if current_city_new_in_row:
                        print(f"City/Municipality found: {current_city_new_in_row}")
                    if current_barangay_new_in_row:
                        print(f"Barangay found: {current_barangay_new_in_row}")
                    if found_headers_in_row:
                        print("Column headers found")
                    print(f"Valid data row: {valid_data_row}")
                        
                
                # TODO: revisit this condition for new table
                # print(f"Validity: {valid_data_row}")
                if not valid_data_row and (any([current_province_new_in_row, current_city_new_in_row, current_barangay_new_in_row]) and found_headers_in_row):
                    # End current table processing
                    if debug:
                        print(f"New location and headers found at index {index}. Ending current table and starting new table.")
                        print(f"current_province: {current_province_new_in_row}, current_city: {current_city_new_in_row}, current_barangay: {current_barangay_new_in_row}")
                    # Update location components
                    if current_province_new_in_row == current_province:
                        continuation = True
                    else:
                        continuation = False
                    current_province = current_province_new_in_row if current_province_new_in_row else current_province
                    current_city = current_city_new_in_row if current_city_new_in_row else current_city
                    current_barangay = current_barangay_new_in_row if current_barangay_new_in_row else current_barangay
                    
                    # Update headers
                    header_indices = header_indices_in_row
                    index = new_index_in_row  # Move index to after headers
                    
                    # Reset variables
                    age = 0
                    col1_holder = None
                    vicinity_holder = None
                    count += 1  # Increment table count
                    if debug:
                        print('\n' + '#' * 60)
                        print('#' + ' ' * 58 + '#')
                        print('#{:^58}#'.format(f'>>> PROCESSING TABLE {count} <<<'))
                        print('#' + ' ' * 58 + '#')
                        print('#' * 60 + '\n')

                    continue  # Start processing new table from updated index
                
                # Reject rows where both classification and ZV are blank, or
                # where the whole row cleans down to nothing.
                cleaned_row = clean_value(''.join(map(str, row.dropna())).strip())
                row_is_valid = (not ((pd.isnull(classification) or str(classification).strip() == '') and (pd.isnull(zv) or str(zv).strip() == ''))) and str(cleaned_row).strip() 
                if not row_is_valid:
                    index += 1
                    age += 1
                    continue 
                    
                # Check if both classification and ZV/SQM are empty
                # NOTE(review): this repeats the first half of row_is_valid, so
                # it appears it can no longer trigger once that check passed.
                if (pd.isnull(classification) or str(classification).strip() == '') and (pd.isnull(zv) or str(zv).strip() == ''):
                    index += 1
                    age += 1
                    continue  
                
                # NOTE(review): the second operand tests the literal string
                # "ZV / SQ. M", which is never all digits, so this reduces to
                # "classification is NaN". str(zv) was presumably intended -
                # confirm. Unlike the other skips, this one leaves `age` alone.
                if str(classification).strip().lower() == 'nan' and not str("ZV / SQ. M").replace('.', '', 1).isdigit():
                    index += 1
                    continue
                
                
                # Checking for empty col1
                null_col1 = pd.isna(col1) or not str(col1).strip()
                if null_col1:  
                    if continuation:
                        # Prefer this table's last street; otherwise fall back
                        # to the last appended row's street.
                        col1 = col1_holder if not (pd.isna(col1_holder) or not str(col1_holder).strip()) else prev_col1 
                    elif not (pd.isna(col1_holder) or not str(col1_holder).strip()):
                        col1 = col1_holder
                else:
                    col1_holder = col1
                
                if isinstance(col1, str):
                    col1_stripped_upper = col1.strip().upper()
                    is_all_other = col1_stripped_upper.startswith("ALL OTHER")
                else:
                    col1_stripped_upper = ''
                    is_all_other = False
                               
                # Check if 'vicinity' is null or empty
                null_vicinity = pd.isna(vicinity) or not str(vicinity).strip()
                if null_vicinity: # if vicinity is null
                    if continuation: # if the table is a continuation
                        if not (pd.isna(prev_col1) and pd.isna(col1)) and prev_col1 != col1: # if new col1
                            # A new street with a blank vicinity clears the
                            # carried-over vicinity.
                            vicinity_holder = vicinity #update the holder
                        else:
                            vicinity = vicinity_holder if not (pd.isna(vicinity_holder) or not str(vicinity_holder).strip()) else prev_vicinity
                    elif not (pd.isna(vicinity_holder) or not str(vicinity_holder).strip()):
                        if not (pd.isna(prev_col1) and pd.isna(col1)) and prev_col1 != col1: # if new col1
                            vicinity_holder = vicinity
                        else:
                            vicinity = vicinity_holder 
                else:
                    vicinity_holder = vicinity  
                    
                # 'ALL OTHER' logic
                # "ALL OTHER ..." rows use only a vicinity given on such a row
                # (remembered across consecutive ones); otherwise it is blanked.
                if is_all_other:
                    if not null_vicinity:
                        all_other_vicinity = vicinity
                    if all_other_vicinity:
                        vicinity = all_other_vicinity
                    else:
                        vicinity = ''
                        if debug:
                            print(f"'col1' starts with 'ALL OTHER'. Setting 'vicinity' to blank.")
                else:
                    all_other_vicinity = None
                
                def is_dash_string(var):
                    # True for strings made up solely of "-" characters.
                    return isinstance(var, str) and re.fullmatch(r"\-+", var) is not None
                
                # Rows where at least three of the four fields are dash
                # placeholders are skipped.
                matches = sum(is_dash_string(var) for var in [col1, vicinity, classification, zv])
                if matches >= 3:
                    index += 1
                    age += 1
                    continue

                # Append to new DataFrame
                # TODO: check if cleaning features is necessary
                new_df.loc[len(new_df)] = [
                    current_province, 
                    current_city, 
                    current_barangay, 
                    clean_value(col1, feature=True), 
                    clean_value(vicinity, feature=True), 
                    clean_value(classification, feature=True), 
                    clean_value(zv, feature=True)
                ]
                
                # Remember the raw (uncleaned) values of the row just appended.
                prev_col1 = col1
                prev_vicinity = vicinity
                prev_classification = classification
                prev_zvsqm = zv
                
                if debug:
                    print(new_df.loc[len(new_df)-1])
                    print("\n-------\n")
                
                index += 1
                age = 0
            continue  # Proceed to next iteration of the main loop
        else:
            index += 1  # No headers found, move to the next row
    if debug:
        print(f"Total tables processed: {count}")
    return new_df


In [None]:
# Scratch check of the skip condition used inside main(): the right-hand
# operand tests a literal string that is never all digits, so with a 'nan'
# classification the whole expression evaluates to True.
str('nan').strip().lower() == 'nan' and not str("ZV / SQ. M").replace('.', '', 1).isdigit()

In [None]:
# Re-list data/ keeping regular files only, then show the entries from
# position 110 onward.
excel_files = [f for f in os.listdir("data/") if os.path.isfile(os.path.join("data/", f))]
excel_files[110:]

In [None]:
# Running for non-QC
# Convert every workbook in data/ and write the extracted table to
# Output/Updated_<name>.xlsx.
os.makedirs("Output", exist_ok=True)
excel_files = [f for f in os.listdir("data/") if os.path.isfile(os.path.join("data/", f))]

for excel in excel_files:
    # Work out the output name first, so unsupported files are skipped before
    # any attempt is made to open them as workbooks.
    filename, extension = os.path.splitext(excel)

    if extension.lower() == '.xls':
        normalized_filename = f"{filename}.xlsx"
    elif extension.lower() == '.xlsx':
        normalized_filename = excel
    else:
        print(f"Unsupported file format for {excel}. Skipping...")
        continue

    print(f'Processing {excel}')
    df = xls_to_df(excel)
    if df is None:
        # xls_to_df already reported that no "Sheet..." worksheet exists;
        # passing None on to main() would raise.
        continue
    processed = main(df)

    output_path = os.path.join("Output", f"Updated_{normalized_filename}")
    processed.to_excel(output_path, index=False)

    print(f'Processed file saved as: {output_path}')

print("done w all nice")

In [None]:
# List the outputs that contain fewer than two extracted rows, i.e. the
# workbooks the parser essentially failed on.
output_files = [f for f in os.listdir("Output")]
for output_file in output_files:
    file = pd.read_excel(os.path.join("Output", output_file))
    if len(file) < 2:
        # splitext drops the real extension, whatever its length, instead of
        # assuming a five-character ".xlsx" suffix.
        print(os.path.splitext(output_file)[0])

Note: the files listed above store their vicinity values in an irregular, unmerged cell layout.

In [None]:
# Load one workbook for interactive debugging of the parser. Reusing
# xls_to_df keeps the sheet-selection rule in one place; it returns None when
# no "Sheet..." worksheet exists, whereas passing sheet_name=None to
# pd.read_excel would return a dict of all sheets.
file = 'RDO No. 43 - Pasig City.xls'
df = xls_to_df(file)

In [None]:
# Debug run over row positions 1411-1554 (end is exclusive) with header tracing.
main(df, debug=True, start=1411, end=1555, debug_location=False, debug_header=True)

In [None]:
# Debug run over row positions 0-554 (end is exclusive) with header tracing.
main(df, debug=True, start=0, end=555, debug_location=False, debug_header=True)

In [None]:
# Debug run over row positions 0-1554 (end is exclusive) with header tracing.
main(df, debug=True, start=0, end=1555, debug_location=False, debug_header=True)

In [None]:
# Inspect the raw cells at row position 927 of the loaded sheet.
df.iloc[927]

In [None]:
# Trace the location scanner starting at row position 927.
find_location_components(df, 927, debug=True, proximity_window=2)

In [None]:
# Trace the header scanner starting at row position 1414.
find_column_headers(df, 1414, debug=True)

In [None]:
# Spot-check of extract_rdo_number as a sort key on a sample of output names
# (no file extension, "Updated_" prefix, one upper-case "NO." with a letter
# suffix).
rdo_list = [
    "Updated_RDO NO. 113A - West Davao City",
    "Updated_RDO No. 24 - Valenzuela City",
    "Updated_RDO No. 26 - Malabon-Navotas",
    "Updated_RDO No. 29 - Tondo-San Nicolas",
    "Updated_RDO No. 30 - Binondo",
    "Updated_RDO No. 48 - West Makati",
    "Updated_RDO No. 49 - North Makati City",
    "Updated_RDO No. 74 - Iloilo City, Iloilo",
    "Updated_RDO No. 80 - Mandaue City, Cebu",
    "Updated_RDO No. 87 - Catbalogan City",
    "Updated_RDO No. 98 - Cagayan de Oro City, Misamis Oriental"
]

sorted(rdo_list, key=extract_rdo_number)

In [None]:
# Inspect the raw cells at row position 1416 of the loaded sheet.
df.iloc[1416]

In [None]:
def main_recursive(df, debug=False, start=0, end=-1, debug_location=False, debug_header=False):
    """
    Extract every table found in a raw sheet into one structured DataFrame.

    Builds an empty result frame and hands the work to process_tables_recursive,
    starting with no known location, no previous values and a table count of 0.

    Args:
        df (pandas.DataFrame): Raw sheet to process
        debug (bool): Print progress and the total number of tables processed
        start (int): Row position to start scanning from
        end (int): Row position to stop before; -1 means the end of the DataFrame.
            Values larger than len(df) are capped at len(df).
        debug_location (bool): Enable debug output for location components
        debug_header (bool): Enable debug output for header finding

    Returns:
        pandas.DataFrame: Rows with the columns Province, City/Municipality,
        Barangay, Street/Subdivision, Vicinity, Classification and ZV/SQM
    """
    row_count = len(df)
    # Cap an oversized `end`: downstream code reads rows with df.iloc[index] for
    # every index below final_index, which raises IndexError past the last row.
    final_index = row_count if end == -1 else min(end, row_count)

    # Empty frame that the processing functions append extracted rows to
    new_df = pd.DataFrame(columns=['Province', 'City/Municipality', 'Barangay', 
                                   'Street/Subdivision', 'Vicinity', 'Classification', 'ZV/SQM'])

    result_df, count = process_tables_recursive(
        df, 
        new_df, 
        start, 
        final_index, 
        None, None, None,  # Current province, city, barangay
        None, None, None, None,  # Previous values
        0,  # Table count
        False,  # Continuation flag
        debug, debug_location, debug_header
    )

    if debug:
        print(f"Total tables processed: {count}")

    return result_df

def process_tables_recursive(df, result_df, index, final_index, 
                           current_province, current_city, current_barangay,
                           prev_col1, prev_vicinity, prev_classification, prev_zvsqm,
                           count, continuation, debug, debug_location, debug_header):
    """
    Scan rows [index, final_index) for tables and extract each one into result_df.

    A table starts where find_location_components reports at least one location
    component and find_column_headers reports headers. Its rows are handed to
    process_data_rows_recursive. Rows that do not start a table are skipped one
    at a time.

    The function keeps its historical name for existing callers but runs as a
    loop, so it adds no stack frame per row or per table and long sheets cannot
    hit Python's recursion limit here.

    Args:
        df (pandas.DataFrame): Input DataFrame
        result_df (pandas.DataFrame): Output DataFrame being built
        index (int): Current position in the DataFrame
        final_index (int): End position to process (exclusive)
        current_province, current_city, current_barangay: Current location components
        prev_col1, prev_vicinity, prev_classification, prev_zvsqm: Previous values
        count (int): Count of tables processed so far
        continuation (bool): Whether this table is a continuation of the previous
        debug, debug_location, debug_header (bool): Debug flags

    Returns:
        tuple: (result_df, count) - The updated DataFrame and table count
    """
    PROXIMITY_WINDOW = 2  # Increased to accommodate different formats

    while index < final_index:
        # The helper returns the location it found plus the row to resume from.
        province_new, city_new, barangay_new, index = find_location_components(
            df, index, proximity_window=PROXIMITY_WINDOW, debug=debug_location)

        found_components = any([province_new, city_new, barangay_new])
        if found_components and debug:
            print(f"Location components found: {province_new}, {city_new}, {barangay_new}")

        # Attempt to find headers starting from the last matched index
        found_headers, header_indices, new_index = find_column_headers(df, index, debug=debug_header)
        if debug:
            print(f"Column headers found: {header_indices}")

        if not (found_headers and found_components):
            # Not the start of a table: try again from the next row.
            index += 1
            continue

        # Same province as before means this table continues the previous one.
        continuation = province_new == current_province

        # Components that were not restated keep their previous value.
        current_province = province_new if province_new else current_province
        current_city = city_new if city_new else current_city
        current_barangay = barangay_new if barangay_new else current_barangay

        # Data rows begin right after the header rows.
        index = new_index

        count += 1
        if debug:
            print(f'Processing table {count}\n')

        # One pass per table. The data-row pass can stop early because it ran
        # into the location + headers of the next table; that table is then
        # processed right here with the headers it reported.
        while True:
            result = process_data_rows_recursive(
                df, result_df, index, final_index, 
                current_province, current_city, current_barangay, 
                header_indices, 
                prev_col1, prev_vicinity, prev_classification, prev_zvsqm, 
                continuation, 
                debug, debug_location, debug_header
            )
            (result_df, next_index,
             prev_col1, prev_vicinity, prev_classification, prev_zvsqm,
             found_new_table,
             next_province, next_city, next_barangay, next_headers) = result

            if not found_new_table:
                index = next_index
                break

            count += 1
            if debug:
                print('\n' + '#' * 60)
                print('#' + ' ' * 58 + '#')
                print('#{:^58}#'.format(f'>>> PROCESSING TABLE {count} <<<'))
                print('#' + ' ' * 58 + '#')
                print('#' * 60 + '\n')

            continuation = next_province == current_province
            current_province = next_province
            current_city = next_city
            current_barangay = next_barangay
            # Use the new table's own column layout for its rows.
            header_indices = next_headers
            # Always move forward so a header hit that does not advance the
            # index cannot loop forever.
            index = next_index if next_index > index else index + 1

    return result_df, count

def process_data_rows_recursive(df, result_df, index, final_index, 
                              current_province, current_city, current_barangay, 
                              header_indices, 
                              prev_col1, prev_vicinity, prev_classification, prev_zvsqm, 
                              continuation, 
                              debug, debug_location, debug_header,
                              age=0, col1_holder=None, vicinity_holder=None, all_other_vicinity=None):
    """
    Process the data rows of one table, appending each usable row to result_df.

    Rows are read one by one from `index`. The pass ends when `final_index` is
    reached, when MAX_AGE unusable rows have been counted since the last appended
    row, or when a row turns out to be the location + header block of a new table.

    The function keeps its historical name for existing callers but runs as a
    loop, so the number of rows in a table is not limited by Python's recursion
    limit.

    Args:
        df (pandas.DataFrame): Input DataFrame
        result_df (pandas.DataFrame): Output DataFrame, appended to in place
        index (int): Position of the first data row
        final_index (int): End position to process (exclusive)
        current_province, current_city, current_barangay: Location written to every row
        header_indices (dict): Column positions under the keys 'street_name_index',
            'vicinity_index' (an int or a list of two ints), 'classification_index'
            and 'zv_sq_m_index'
        prev_col1, prev_vicinity, prev_classification, prev_zvsqm: Values of the
            last appended row
        continuation (bool): Whether this table continues the previous one; if so,
            blank street/vicinity cells may fall back to the previous values
        debug, debug_location, debug_header (bool): Debug flags
        age (int): Unusable rows already counted since the last appended row
        col1_holder, vicinity_holder: Last non-blank street / vicinity seen
        all_other_vicinity: Vicinity remembered for consecutive "ALL OTHER" rows

    Returns:
        tuple: (result_df, next_index, prev_col1, prev_vicinity, prev_classification, prev_zvsqm, 
                found_new_table, new_province, new_city, new_barangay, new_headers)
    """
    MAX_AGE = 4

    def is_blank(value):
        # NaN/None, or nothing but whitespace once converted to text
        return pd.isna(value) or not str(value).strip()

    def is_dash_string(var):
        # Filler cells made only of dashes, e.g. "-----"
        return isinstance(var, str) and re.fullmatch(r"\-+", var) is not None

    while index < final_index and age < MAX_AGE:
        row = df.iloc[index]

        # Extract data using the header indices
        col1 = row.iloc[header_indices['street_name_index']]
        classification = row.iloc[header_indices['classification_index']]
        zv = row.iloc[header_indices['zv_sq_m_index']]

        # Vicinity lives in one column or is split across two. The placeholder
        # text survives only if 'vicinity_index' is neither an int nor a list.
        vicinity = 'Test u should not see this pop up pls'
        vicinity_index = header_indices['vicinity_index']
        if isinstance(vicinity_index, int):
            vicinity = row.iloc[vicinity_index]
        elif isinstance(vicinity_index, list):
            vicinity1 = str(row.iloc[vicinity_index[0]])
            vicinity2 = str(row.iloc[vicinity_index[1]])
            if vicinity1 == 'nan':
                vicinity = vicinity2
            elif vicinity2 == 'nan':
                vicinity = vicinity1
            else:
                vicinity = f"{vicinity1}, {vicinity2}"

        if debug:
            print(f"Data row at index {index}: {[col1, vicinity, classification, zv]}")

        # Does a new location block, followed by headers, start at this row?
        province_in_row, city_in_row, barangay_in_row, index_after_location = find_location_components(
            df, index, debug=debug_location)
        found_headers_in_row, header_indices_in_row, index_after_headers = find_column_headers(
            df, index_after_location, debug=debug_header)
        location_in_row = any([province_in_row, city_in_row, barangay_in_row])

        # A row whose classification/ZV cells clean to something truthy is data,
        # never the start of a new table.
        combined_row = ''.join(map(str, row[[header_indices['classification_index'], header_indices['zv_sq_m_index']]].dropna())).strip()
        valid_data_row = clean_value(combined_row)

        if debug and location_in_row:
            if province_in_row:
                print(f"Province found: {province_in_row}")
            if city_in_row:
                print(f"City/Municipality found: {city_in_row}")
            if barangay_in_row:
                print(f"Barangay found: {barangay_in_row}")
            if found_headers_in_row:
                print("Column headers found")
            print(f"Valid data row: {valid_data_row}")

        if not valid_data_row and location_in_row and found_headers_in_row:
            if debug:
                print(f"New location and headers found at index {index}. Ending current table and starting new table.")
                print(f"current_province: {province_in_row}, current_city: {city_in_row}, current_barangay: {barangay_in_row}")

            # Components the new block does not restate carry over.
            next_province = province_in_row if province_in_row else current_province
            next_city = city_in_row if city_in_row else current_city
            next_barangay = barangay_in_row if barangay_in_row else current_barangay

            # Signal the caller to start a new table right after its headers.
            return (result_df, index_after_headers, prev_col1, prev_vicinity, prev_classification, prev_zvsqm, 
                    True, next_province, next_city, next_barangay, header_indices_in_row)

        # Unusable row (classification and ZV both blank, or nothing left after
        # cleaning the whole row): skip it and count it towards MAX_AGE.
        cleaned_row = clean_value(''.join(map(str, row.dropna())).strip())
        both_blank = is_blank(classification) and is_blank(zv)
        if both_blank or not str(cleaned_row).strip():
            index += 1
            age += 1
            continue

        # No classification: keep the row only when the ZV cell is numeric.
        # (This must test the cell value; comparing against the literal header
        # text "ZV / SQ. M" is always non-numeric and drops every such row.)
        # Such rows do not count towards MAX_AGE.
        zv_text = str(zv).strip()
        if str(classification).strip().lower() == 'nan' and not zv_text.replace('.', '', 1).isdigit():
            index += 1
            continue

        # Blank street cell: reuse the last street seen in this table, or the
        # previous appended row's street when continuing a table.
        if is_blank(col1):
            if continuation:
                col1 = col1_holder if not is_blank(col1_holder) else prev_col1
            elif not is_blank(col1_holder):
                col1 = col1_holder
        else:
            col1_holder = col1

        # Check for "ALL OTHER" in col1
        is_all_other = isinstance(col1, str) and col1.strip().upper().startswith("ALL OTHER")

        # Blank vicinity cell: inherit the held vicinity only while the street
        # is unchanged; a different street resets the held value.
        null_vicinity = is_blank(vicinity)
        if null_vicinity:
            if continuation:
                if not (pd.isna(prev_col1) and pd.isna(col1)) and prev_col1 != col1:
                    vicinity_holder = vicinity
                else:
                    vicinity = vicinity_holder if not is_blank(vicinity_holder) else prev_vicinity
            elif not is_blank(vicinity_holder):
                if not (pd.isna(prev_col1) and pd.isna(col1)) and prev_col1 != col1:
                    vicinity_holder = vicinity
                else:
                    vicinity = vicinity_holder
        else:
            vicinity_holder = vicinity

        # "ALL OTHER" rows use only a vicinity stated on an "ALL OTHER" row of
        # the same run; otherwise their vicinity is blanked.
        if is_all_other:
            if not null_vicinity:
                all_other_vicinity = vicinity
            if all_other_vicinity:
                vicinity = all_other_vicinity
            else:
                vicinity = ''
                if debug:
                    print(f"'col1' starts with 'ALL OTHER'. Setting 'vicinity' to blank.")
        else:
            all_other_vicinity = None

        # Separator rows (three or more all-dash cells) are skipped and aged.
        matches = sum(is_dash_string(var) for var in [col1, vicinity, classification, zv])
        if matches >= 3:
            index += 1
            age += 1
            continue

        # Append to result DataFrame
        result_df.loc[len(result_df)] = [
            current_province,
            current_city,
            current_barangay,
            clean_value(col1, feature=True),
            clean_value(vicinity, feature=True),
            clean_value(classification, feature=True),
            clean_value(zv, feature=True)
        ]

        # Remember this row's values for the fallbacks above.
        prev_col1 = col1
        prev_vicinity = vicinity
        prev_classification = classification
        prev_zvsqm = zv

        if debug:
            print(result_df.loc[len(result_df)-1])
            print("\n-------\n")

        # A row was appended, so the unusable-row counter starts over.
        index += 1
        age = 0

    # Reached final_index or aged out without meeting a new table
    return (result_df, index, prev_col1, prev_vicinity, prev_classification, prev_zvsqm, 
            False, None, None, None, None)

In [None]:
# Run main() (defined outside this excerpt) for start=0, end=555 with all debug output off.
main(df, debug=False, start=0, end=555, debug_location=False, debug_header=False)

In [None]:
# Run main_recursive() with the same arguments as the main() call in the
# previous cell -- presumably to compare the two outputs side by side.
main_recursive(df, debug=False, start=0, end=555, debug_location=False, debug_header=False)