In [1]:
import numpy as np
import pandas as pd
import os
import xlrd
import re

In [2]:
excel_files = [f for f in os.listdir("data")]

In [3]:
file = 'RDO No. 1 - Laoag City, Ilocos Norte.xls'
test = pd.ExcelFile(f'data/{file}')

In [4]:
sheet_names = sorted([name for name in test.sheet_names if name.strip().lower().startswith('sheet')], key=lambda name: int(re.search(r'\d+', name).group()))
last_sheet_name = sheet_names[-1] if sheet_names else None

In [5]:
last_sheet_name

'Sheet 9 (DO 047-2023)'

In [6]:
df = pd.read_excel("data/"+file, sheet_name=last_sheet_name, header=None)

In [7]:
df

Unnamed: 0,0,1,2,3
0,Republic of the Philippines,,,
1,DEPARTMENT OF FINANCE,,,
2,MANILA,,,
3,,,,
4,DEPARTMENT ORDER NO. 047 - 2023,,,
...,...,...,...,...
25341,TAXES SHALL BE (1) THE FAIR MARKET VALUE ...,,,
25342,(i.e. ZONAL VALUES) OR (2.) THE FAIR MAR...,,,
25343,"PROVINCIAL/CITY/MUNICIPAL ASSESSOR, WHICH...",,,
25344,,,,


In [8]:
def test_function():
    print("All conditions met! Function called.\n")

def clean_value(value):
    value = re.sub(r"^\s*:\s*", "", value.strip())
    value = re.sub(r"(DO No|Effectivity Date)\s*.*", "", value, flags=re.IGNORECASE).strip() # do i add --continued here?
    return value

In [30]:
def main(df):
    #TODO: Ask if remove * in classifcaiton and -- contineu in street or smth
    index = 0
    count = 0
    new_df = pd.DataFrame(columns=['Province', 'City/Municipality', 'Barangay', 'Street/Subdivision', 'Vicinity', 'Classification', 'ZV/SQM'])
    while index < len(df):
        row = df.iloc[index]
        combined_row = ''.join(map(str, row.dropna())).strip()
        
        # Initialize flags
        province_match_flag = False
        city_match_flag = False
        barangay_match_flag = False
        additional_info_flag = False
        
        # Search for "Province :" pattern
        province_pattern = re.search(r"province\s*(.*)", combined_row, re.IGNORECASE)
        if province_pattern:
            province_match_flag = True
            province_value = province_pattern.group(1)  # Capture the text after "Province :"
            province_value = clean_value(province_value)
            
            print(f"Province match found in row {index}: {province_value}")
        
        city_value = None    
        # Check if the next row has "City" or "Municipality"
        if index + 1 < len(df):
            next_row = ''.join(map(str, df.iloc[index + 1].dropna())).strip()
            cleaned_row = clean_value(next_row)
            city_pattern = re.search(r"(?:City|Municipality)(?:\s*/\s*(?:City|Municipality))?\s*[:\s]?\s*(.+)", cleaned_row, re.IGNORECASE)
            # city_pattern = re.search(r"(?:City|Municipality|Municipality\s*/\s*City|City\s*/\s*Municipality)\s*[:\s]?\s*(\w+)", next_row, re.IGNORECASE)
            if city_pattern:
                city_match_flag = True
                city_value = city_pattern.group(1) # Capture the text after "City :" or "Municipality :"
                # city_value = clean_value(city_value)
                
                print(f"City / Municipality match found in row {index + 1}: {city_value}")
                
        barangay_value = None
        # Check if the row after that has "Barangay" or "Zone"
        if index + 2 < len(df):
            following_row = ''.join(map(str, df.iloc[index + 2].dropna())).strip()
            cleaned_row = clean_value(following_row)
            barangay_pattern = re.search(r"(?:Barangay|Zone)(?:\s*/\s*(?:Barangay|Zone))?\s*[:\s]?\s*(.+)", cleaned_row, re.IGNORECASE)
            if barangay_pattern:
                barangay_match_flag = True
                barangay_value = barangay_pattern.group(1).strip()  # Capture the text after "Barangay :" or "Zone :"
                # barangay_value = clean_value(barangay_value)
                
                print(f"Barangay / Zone match found in row {index + 2}: {barangay_value}")
        
        # Only check for additional information if the primary geographic data is found
        if province_match_flag and city_match_flag and barangay_match_flag:
            # Check specific columns for additional information in the next 3 rows
            for offset in range(3, 6):
                if index + offset < len(df):
                    # check for table header
                    subsequent_row = df.iloc[index + offset]
                    
                    # Initialize variables to store the column indices of matches
                    street_name_index = None
                    vicinity_index = None
                    classification_index = None
                    zv_sq_m_index = None
        
                    # Iterate over each column in the row
                    for col_index in range(len(subsequent_row)):
                        cell_value = str(subsequent_row.iloc[col_index])
                        
                        # Check each column independently
                        if re.search(r"(STREET NAME|SUBDIVISION|CONDOMINIUM)", cell_value, re.IGNORECASE):
                            street_name_index = col_index  # Store the index of the match for street name/subdivision/condominium
                        
                        if re.search(r"V.*I.*C.*I.*N.*I.*T.*Y", cell_value, re.IGNORECASE):
                            vicinity_index = col_index  # Store the index of the match for vicinity
                        
                        if re.search(r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N", cell_value, re.IGNORECASE | re.DOTALL):
                            """Made changes so that it reads CLASS on classification column on last column - Raevy"""
                            classification_index = col_index  # Store the index of the match for classification
                        
                        if re.search(r"ZV.*SQ.*M|3rd\s*Rev", cell_value, re.IGNORECASE): 
                            """Made changes so that it reads 3rd Rev on last column - Raevy"""
                            zv_sq_m_index = col_index  # Store the index of the match for ZV SQ M
        
                    # If all matches are found, set the additional_info_flag to True
                    if street_name_index is not None and vicinity_index is not None and classification_index is not None and zv_sq_m_index is not None:
                        additional_info_flag = True
                        # Print the matched indices for debugging purposes
                        print(f"Additional information match found in row {index + offset}")
                        print(f"STREET NAME found in column {street_name_index}")
                        print(f"VICINITY found in column {vicinity_index}")
                        print(f"CLASSIFICATION found in column {classification_index}")
                        print(f"ZV SQ M found in column {zv_sq_m_index}")
                        
                        # Skip to the row after the current match
                        index += offset + 1
                        
                        break  # No need to check further if a match is found
    
        # Call the function only if all conditions are true
        if province_match_flag and city_match_flag and barangay_match_flag and additional_info_flag:
            test_function()
            count += 1
            print(f'{count}\n\n')
            
            # start reading here: age_counter
            age = 0
            col1_holder = None
            vicinity_holder = None 
            while age < 2:
                # print(f"index: {index} and {len(df)}")
                row = df.iloc[index]
                
                col1 = row.iloc[street_name_index]
                vicinity = row.iloc[vicinity_index]
                classification = str(row.iloc[classification_index])
                zv = row.iloc[zv_sq_m_index]
                test = [col1, vicinity, classification, zv]
                print(f"base row: {test}")
                
                if (pd.isnull(classification) or classification.strip() == '') and pd.isnull(zv):
                    age += 1
                    index += 1
                    # print("aging")
                    continue
                
                age = 0
                
                if pd.isnull(col1) or col1.strip() == '':
                    col1 = col1_holder
                if pd.isnull(vicinity) or vicinity.strip() == '':
                    vicinity = vicinity_holder
                
                col1_holder = col1
                vicinity_holder = vicinity
                
                new_df.loc[len(new_df)] = [province_value, city_value, barangay_value, col1, vicinity, classification, zv]
                index += 1
            
            index += 1 
            continue
            
        index += 1
    print(count)
    return new_df

In [31]:
def xls_to_df(filename):
    test = pd.ExcelFile("data/" + filename)
    sheet_names = test.sheet_names
    last_sheet_name = None
    
    # Extract sheet names that start with 'sheet'
    sheet_names = sorted([name for name in sheet_names if name.strip().lower().startswith('sheet')],
                         key=lambda name: int(re.search(r'\d+', name).group()))
    
    if sheet_names:
        last_sheet_name = sheet_names[-1]
    
    if last_sheet_name:
        df = pd.read_excel("data/" + filename, sheet_name=last_sheet_name, header=None)
        return df
    else:
        print(f"No matching sheets found in {filename}")


In [11]:
test = main(df)

City / Municipality match found in row 15: ,  ILOCOS NORTE,  REVENUE  REGION  NO.  1 - CALASIAO,
Barangay / Zone match found in row 22: s  or  areas  and   shall
Barangay / Zone match found in row 23: or   areas,    subject  to   automatic
City / Municipality match found in row 31: , Ilocos Norte, Revenue Region No. 1 - Calasiao, Pangasinan.
City / Municipality match found in row 36: assessor and (2) the gross selling
Barangay / Zone match found in row 108: sA45  Kangkong
City / Municipality match found in row 118: , ILOCOS NORTE
Province match found in row 120: ILOCOS NORTE
City / Municipality match found in row 121: LAOAG CITY
Barangay / Zone match found in row 122: NO. 1 - SAN LORENZO (POBLACION)
Additional information match found in row 124
STREET NAME found in column 0
VICINITY found in column 1
CLASSIFICATION found in column 2
ZV SQ M found in column 3
All conditions met! Function called.

1


City / Municipality match found in row 160: , ILOCOS NORTE
Province match found in row 

# Running for all

In [32]:
for excel in excel_files[22:23]:
    df = xls_to_df(excel)
    processed = main(df)
    processed.to_excel(f"Output/{excel}.xlsx")

Barangay / Zone match found in row 17: s or areas and shall determine the fair market value of real properties located in each zone or
City / Municipality match found in row 28: assessor and (2) the gross selling price/consideration as shown in the
Barangay / Zone match found in row 91: sA45  Kangkong
Province match found in row 102: QUEZON
City / Municipality match found in row 103: ATIMONAN
Barangay / Zone match found in row 104: POBLACION (ZONE 1)D.O. No.037-2022
Additional information match found in row 106
STREET NAME found in column 0
VICINITY found in column 1
CLASSIFICATION found in column 2
ZV SQ M found in column 3
All conditions met! Function called.

1


index: 107 and 7268
base row: ['MAHARLIKA HIGHWAY', nan, 'CR', 7800]
index: 108 and 7268
base row: [nan, nan, 'RR', 6750]
index: 109 and 7268
base row: [nan, nan, 'GP', 4000]
index: 110 and 7268
base row: [nan, nan, 'CL', 3000]
index: 111 and 7268
base row: [nan, 'MENDOZA COMPOUND', 'RR', 6000]
index: 112 and 7268
base row:

IndexError: single positional indexer is out-of-bounds

In [None]:
output_files = [f for f in os.listdir("Output")]
for output_file in output_files:
    file = pd.read_excel(f'Output/{output_file}')
    if len(file) < 2:
        print(output_file[:-5])

In [18]:
excel_files[22:23]

['RDO No. 61 - Gumaca, South Quezon.xls']

In [168]:
import pandas as pd
import re

def clean_value(value):
    if value is not None:
        value = str(value)
        value = re.sub(r"^\s*:\s*", "", value.strip())
        # value = re.sub(r"(D.*O.* No|Effectivity Date|--continued)\s*.*", "", value, flags=re.IGNORECASE).strip()
        value = re.sub(r"(D(?:\.O|O)(?:\.) No|Effectivity Date)\s*.*", "", value, flags=re.IGNORECASE).strip()
        return value
    return value

def test_function():
    """Placeholder function for debugging or logging purposes."""
    pass

def extract_value(pattern, text):
    """
    Searches for the given pattern in the text and returns the matched group.
    """
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        return None

def find_column_headers(df, index):
    """
    Searches for the column headers in the DataFrame starting from the given index.
    Returns a tuple (found, header_indices, new_index) where:
    - found: True if headers are found
    - header_indices: dictionary mapping header names to column indices
    - new_index: the index after the headers
    """
    headers = {
        'street_name_index': None,
        'vicinity_index': None,
        'classification_index': None,
        'zv_sq_m_index': None
    }
    
    # Search within the next 3 rows for headers
    for offset in range(3):
        current_index = index + offset
        if current_index >= len(df):
            break
        row = df.iloc[current_index]
        
        for col_index, cell in enumerate(row):
            cell_value = str(cell)
            
            if headers['street_name_index'] is None and re.search(r"(STREET NAME|SUBDIVISION|CONDOMINIUM)", cell_value, re.IGNORECASE):
                headers['street_name_index'] = col_index
            
            if headers['vicinity_index'] is None and re.search(r"V.*I.*C.*I.*N.*I.*T.*Y", cell_value, re.IGNORECASE):
                headers['vicinity_index'] = col_index
            
            if headers['classification_index'] is None and re.search(r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N", cell_value, re.IGNORECASE | re.DOTALL):
                headers['classification_index'] = col_index
            
            if headers['zv_sq_m_index'] is None and re.search(r"ZV.*SQ.*M|3rd\s*Rev", cell_value, re.IGNORECASE): 
                headers['zv_sq_m_index'] = col_index
        
        if all(value is not None for value in headers.values()):
            print(f"Headers found at index {current_index}")
            print(f"Header indices: {headers}")
            return True, headers, current_index + 1
    
    return False, None, index

def is_header_row(row):
    """
    Checks if the given row contains the headers.
    Returns True if it does, False otherwise.
    """
    header_patterns = [
        r"(STREET NAME|SUBDIVISION|CONDOMINIUM)",
        r"V.*I.*C.*I.*N.*I.*T.*Y",
        r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N",
        r"ZV.*SQ.*M|3rd\s*Rev"
    ]
    
    found = {pattern: False for pattern in header_patterns}
    
    for cell in row:
        cell_value = str(cell)
        for pattern in header_patterns:
            if re.search(pattern, cell_value, re.IGNORECASE):
                found[pattern] = True
    
    return all(found.values())

def main(df):
    """
    Processes the DataFrame to extract structured data based on Province, City/Municipality,
    Barangay/Zone, and additional information like Street/Subdivision, Vicinity, Classification,
    and ZV/SQM.
    
    This version performs separate checks for each geographic component without nested conditionals,
    allowing them to appear in any order within a specified proximity.
    """
    index = 0
    count = 0
    new_df = pd.DataFrame(columns=['Province', 'City/Municipality', 'Barangay', 
                                   'Street/Subdivision', 'Vicinity', 'Classification', 'ZV/SQM'])
    
    # Define the proximity window (number of rows within which to search for related fields)
    PROXIMITY_WINDOW = 5
    
    while index < len(df):
        row = df.iloc[index]
        combined_row = ''.join(map(str, row.dropna())).strip()
        
        # Initialize variables to store current location information
        current_province = None
        current_city = None
        current_barangay = None
        
        # Flags to check if the location components have been found within the proximity window
        found_province = False
        found_city = False
        found_barangay = False
        
        print(f"=================\nROW INDEX: {index}\n=================")
        # Search within the proximity window for Province, City/Municipality, and Barangay/Zone
        for offset in range(PROXIMITY_WINDOW):
            current_index = index + offset
            if current_index >= len(df):
                break
            current_row = df.iloc[current_index]
            combined_current_row = ''.join(map(str, current_row.dropna())).strip()
            
            # Check for Province
            if not found_province:
                province = extract_value(r"province\s*[:\s]?\s*(.*)", combined_current_row)
                if province:
                    current_province = clean_value(province)
                    found_province = True
                    print(f"Province match found in row {current_index}: {current_province}")
                    
            # Check for City/Municipality
            if not found_city:
                city = extract_value(r"(?:City|Municipality)(?:\s*/\s*(?:City|Municipality))?\s*[:\s]?\s*(.+)", combined_current_row)
                if city:
                    current_city = clean_value(city)
                    found_city = True
                    print(f"City/Municipality match found in row {current_index}: {current_city}")
            
            # Check for Barangay/Zone
            if not found_barangay:
                barangay = extract_value(r"(?:Barangay|Zone)(?:\s*/\s*(?:Barangay|Zone))?\s*[:\s]?\s*(.+)", combined_current_row)
                if barangay:
                    current_barangay = clean_value(barangay)
                    found_barangay = True
                    print(f"Barangay/Zone match found in row {current_index}: {current_barangay}")

            # print(f"\nOffset: {offset}")
            # print(f"province: {current_province}, city: {current_city}, barangay: {current_barangay}\n")
            
            # If all three components are found, break out of the proximity search
            if found_province and found_city and found_barangay:
                break
        
        # Check if all location components were found
        if found_province and found_city and found_barangay:
            # print("HELLLLO")

            # Update the main index to the last matched row in the proximity window
            last_matched_index = index + offset
            print(df.iloc[last_matched_index])
            index = last_matched_index + 1  # Move past the last matched row
            print(df.iloc[index])
            
            # Now, find the column headers starting from the current index
            found_headers, header_indices, new_index = find_column_headers(df, index)
            # print(f'found headers: {found_headers}\nheaders index: {header_indices}\nnew_index: {new_index}')
            
            if found_headers:
                index = new_index  # Move index to after headers
                
                # Start reading data rows
                count += 1
                print(f'Processing table {count}\n')
                
                age = 0
                MAX_AGE = 3
                
                col1_holder = None
                vicinity_holder = None
                while index < len(df) and age < MAX_AGE:
                    row = df.iloc[index]
                    
                    # Check if the current row is a header row, indicating a new table
                    if is_header_row(row):
                        print(f"Header row detected at index {index}. Ending current table.")
                        break  # Exit the data row processing loop to handle the new table
                    
                    # Extract data using the header indices
                    col1 = row.iloc[header_indices['street_name_index']]
                    vicinity = row.iloc[header_indices['vicinity_index']]
                    classification = row.iloc[header_indices['classification_index']]
                    zv = row.iloc[header_indices['zv_sq_m_index']]
                    print(f"Data row at index {index}: {[col1, vicinity, classification, zv]}")
                    
                    # Check if both classification and ZV/SQM are empty
                    if (pd.isnull(classification) or str(classification).strip() == '') and \
                       (pd.isnull(zv) or str(zv).strip() == ''):
                        index += 1
                        continue  # Skip this row as it doesn't contain valid data
                    
                    # Handle missing values by carrying forward the last known values
                    if pd.isnull(col1) or str(col1).strip() == '':
                        col1 = col1_holder
                    else:
                        col1_holder = col1
                        
                    if pd.isnull(vicinity) or str(vicinity).strip() == '':
                        vicinity = vicinity_holder
                    else:
                        vicinity_holder = vicinity
                    
                    # Append the extracted data to new_df
                    new_df.loc[len(new_df)] = [
                        current_province, 
                        current_city, 
                        current_barangay, 
                        clean_value(col1), 
                        clean_value(vicinity), 
                        clean_value(classification), 
                        clean_value(zv)
                    ]
                    
                    index += 1  # Move to the next row
                
                # After processing data rows, continue to the next iteration to find new tables
                continue
        
        # If not all location components were found, move to the next row
        index += 1
    
    print(f"Total tables processed: {count}")
    return new_df


In [169]:
import pandas as pd
import re

def clean_value(value):
    if value is not None:
        value = str(value)
        value = re.sub(r"^\s*:\s*", "", value.strip())
        # value = re.sub(r"(D.*O.* No|Effectivity Date|--continued)\s*.*", "", value, flags=re.IGNORECASE).strip()
        value = re.sub(r"(D(?:\.O|O)(?:\.) No|Effectivity Date)\s*.*", "", value, flags=re.IGNORECASE).strip()
        return value
    return value

def test_function():
    pass

def extract_value(pattern, text):
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        return None

def find_column_headers(df, index, proximity_window=6):
    headers = {
        'street_name_index': None,
        'vicinity_index': None,
        'classification_index': None,
        'zv_sq_m_index': None
    }
    
    for offset in range(proximity_window-3, proximity_window):
        current_index = index + offset
        if current_index >= len(df):
            break
        row = df.iloc[current_index]
        
        for col_index, cell in enumerate(row):
            cell_value = str(cell)
            
            if headers['street_name_index'] is None and re.search(r"(STREET NAME|SUBDIVISION|CONDOMINIUM)", cell_value, re.IGNORECASE):
                headers['street_name_index'] = col_index
            
            if headers['vicinity_index'] is None and re.search(r"V.*I.*C.*I.*N.*I.*T.*Y", cell_value, re.IGNORECASE):
                headers['vicinity_index'] = col_index
            
            if headers['classification_index'] is None and re.search(r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N", cell_value, re.IGNORECASE | re.DOTALL):
                headers['classification_index'] = col_index
            
            if headers['zv_sq_m_index'] is None and re.search(r"ZV.*SQ.*M|3rd\s*Rev", cell_value, re.IGNORECASE): 
                headers['zv_sq_m_index'] = col_index
        
        if all(value is not None for value in headers.values()):
            print(f"Headers found at index {current_index}")
            print(f"Header indices: {headers}")
            return True, headers, current_index + 1
    
    return False, None, index

def is_header_row(row):
    header_patterns = [
        r"(STREET NAME|SUBDIVISION|CONDOMINIUM)",
        r"V.*I.*C.*I.*N.*I.*T.*Y",
        r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N",
        r"ZV.*SQ.*M|3rd\s*Rev"
    ]
    
    found = {pattern: False for pattern in header_patterns}
    
    for cell in row:
        cell_value = str(cell)
        for pattern in header_patterns:
            if re.search(pattern, cell_value, re.IGNORECASE):
                found[pattern] = True
    
    return all(found.values())

def find_location_components(df, index, proximity_window=3):
    """
    Searches within the proximity window for Province, City/Municipality, and Barangay/Zone.
    
    Returns:
        A tuple containing:
        - current_province (str or None)
        - current_city (str or None)
        - current_barangay (str or None)
        - last_matched_index (int): The index of the last matched component
    """
    current_province = None
    current_city = None
    current_barangay = None
    
    found_province = False
    found_city = False
    found_barangay = False
    
    last_matched_index = index
    
    for offset in range(proximity_window):
        current_index = index + offset
        if current_index >= len(df):
            break
        current_row = df.iloc[current_index]
        combined_current_row = ''.join(map(str, current_row.dropna())).strip()
        
        # Check for Province
        if not found_province:
            province = extract_value(r"province\s*[:\s]?\s*(.*)", combined_current_row)
            if province:
                current_province = clean_value(province)
                found_province = True
                last_matched_index = current_index
                print(f"Province match found in row {current_index}: {current_province}")
                
        # Check for City/Municipality
        if not found_city:
            city = extract_value(r"(?:City|Municipality)(?:\s*/\s*(?:City|Municipality))?\s*[:\s]?\s*(.+)", combined_current_row)
            if city:
                current_city = clean_value(city)
                found_city = True
                last_matched_index = current_index
                print(f"City/Municipality match found in row {current_index}: {current_city}")
        
        # Check for Barangay/Zone
        if not found_barangay:
            barangay = extract_value(r"(?:Barangay|Zone)(?:\s*/\s*(?:Barangay|Zone))?\s*[:\s]?\s*(.+)", combined_current_row)
            if barangay:
                current_barangay = clean_value(barangay)
                found_barangay = True
                last_matched_index = current_index
                print(f"Barangay/Zone match found in row {current_index}: {current_barangay}")

        # If all three components are found, no need to continue searching
        if found_province and found_city and found_barangay:
            break
    
    return current_province, current_city, current_barangay, last_matched_index

def main(df):
    """
    Processes the DataFrame to extract structured data based on Province, City/Municipality,
    Barangay/Zone, and additional information like Street/Subdivision, Vicinity, Classification,
    and ZV/SQM.
    
    This version handles the case where location components and headers may appear while reading a table,
    indicating the start of a new table. If both are found, it ends the current table, updates the location
    components, and starts reading the new table.
    """
    index = 0
    count = 0
    new_df = pd.DataFrame(columns=['Province', 'City/Municipality', 'Barangay', 
                                   'Street/Subdivision', 'Vicinity', 'Classification', 'ZV/SQM'])
    
    PROXIMITY_WINDOW = 3
    
    current_province = None
    current_city = None
    current_barangay = None
    header_indices = None
    
    while index < len(df):
        # Check if location components and headers can be found starting from current index
        current_province_new, current_city_new, current_barangay_new, last_matched_index = find_location_components(df, index)
        found_headers, header_indices_new, new_index = find_column_headers(df, index)
        
        if (current_province_new and current_city_new and current_barangay_new) and found_headers:
            # Update location components
            current_province = current_province_new if current_province_new else current_province
            current_city = current_city_new if current_city_new else current_city
            current_barangay = current_barangay_new if current_barangay_new else current_barangay
            
            # Update headers
            header_indices = header_indices_new
            index = new_index  # Move index to after headers
            
            # Start processing data rows
            count += 1
            print(f'Processing table {count}\n')
            
            age = 0
            MAX_AGE = 3
            col1_holder = None
            vicinity_holder = None
            
            while index < len(df) and age < MAX_AGE:
                row = df.iloc[index]
                
                # Check if new location components and headers are found in the current row
                current_province_new_in_row, current_city_new_in_row, current_barangay_new_in_row, _ = find_location_components(df, index)
                found_headers_in_row, header_indices_in_row, new_index_in_row = find_column_headers(df, index)
                
                print(f"Row: {row}\n")
                # if (current_province_new_in_row or current_city_new_in_row or current_barangay_new_in_row) and found_headers_in_row:
                if (current_province_new_in_row and current_city_new_in_row and current_barangay_new_in_row) and found_headers_in_row:
                    # End current table processing
                    print(f"New location and headers found at index {index}. Ending current table and starting new table.")
                    print(f"current_province: {current_province_new_in_row}, current_city: {current_city_new_in_row}, current_barangay: {current_barangay_new_in_row}")
                    # Update location components
                    current_province = current_province_new_in_row if current_province_new_in_row else current_province
                    current_city = current_city_new_in_row if current_city_new_in_row else current_city
                    current_barangay = current_barangay_new_in_row if current_barangay_new_in_row else current_barangay
                    
                    # Update headers
                    header_indices = header_indices_in_row
                    index = new_index_in_row  # Move index to after headers
                    
                    # Reset variables
                    age = 0
                    col1_holder = None
                    vicinity_holder = None
                    count += 1  # Increment table count
                    print(f'Processing table {count}\n')
                    continue  # Start processing new table from updated index
                
                # Continue processing current table
                
                # Check if the current row is a header row, indicating a new table
                if is_header_row(row):
                    print(f"Header row detected at index {index}. Ending current table.")
                    index += 1
                    break  # Exit the data row processing loop to handle the new table
                
                # Extract data using the header indices
                col1 = row.iloc[header_indices['street_name_index']]
                vicinity = row.iloc[header_indices['vicinity_index']]
                classification = row.iloc[header_indices['classification_index']]
                zv = row.iloc[header_indices['zv_sq_m_index']]
                print(f"Data row at index {index}: {[col1, vicinity, classification, zv]}")
                
                # Check if both classification and ZV/SQM are empty
                if (pd.isnull(classification) or str(classification).strip() == '') and \
                   (pd.isnull(zv) or str(zv).strip() == ''):
                    index += 1
                    age += 1
                    continue  # Skip this row as it doesn't contain valid data
                
                # Handle missing values by carrying forward the last known values
                if pd.isnull(col1) or str(col1).strip() == '':
                    col1 = col1_holder
                else:
                    col1_holder = col1
                    
                if pd.isnull(vicinity) or str(vicinity).strip() == '':
                    vicinity = vicinity_holder
                else:
                    vicinity_holder = vicinity
                
                # Append the extracted data to new_df
                new_df.loc[len(new_df)] = [
                    current_province, 
                    current_city, 
                    current_barangay, 
                    clean_value(col1), 
                    clean_value(vicinity), 
                    clean_value(classification), 
                    clean_value(zv)
                ]
                
                index += 1  # Move to the next row
                age = 0  # Reset age since valid data was found
            continue  # After processing data rows, continue to the next iteration to find new tables
        
        else:
            index += 1  # Move to next row if no new location components and headers are found
    print(f"Total tables processed: {count}")
    return new_df


In [170]:
for excel in excel_files[22:23]:
    df = xls_to_df(excel)
    processed = main(df)
    processed.to_excel(f"Output/{excel}.xlsx")

Barangay/Zone match found in row 17: s or areas and shall determine the fair market value of real properties located in each zone or
Barangay/Zone match found in row 17: s or areas and shall determine the fair market value of real properties located in each zone or
Barangay/Zone match found in row 17: s or areas and shall determine the fair market value of real properties located in each zone or
City/Municipality match found in row 28: assessor and (2) the gross selling price/consideration as shown in the
City/Municipality match found in row 28: assessor and (2) the gross selling price/consideration as shown in the
City/Municipality match found in row 28: assessor and (2) the gross selling price/consideration as shown in the
Barangay/Zone match found in row 91: sA45  Kangkong
Barangay/Zone match found in row 91: sA45  Kangkong
Barangay/Zone match found in row 91: sA45  Kangkong
Province match found in row 102: QUEZON
Province match found in row 102: QUEZON
City/Municipality match found

In [171]:
processed

Unnamed: 0,Province,City/Municipality,Barangay,Street/Subdivision,Vicinity,Classification,ZV/SQM
0,QUEZON,ATIMONAN,POBLACION (ZONE 1),MAHARLIKA HIGHWAY,,CR,7800
1,QUEZON,ATIMONAN,POBLACION (ZONE 1),MAHARLIKA HIGHWAY,,RR,6750
2,QUEZON,ATIMONAN,POBLACION (ZONE 1),MAHARLIKA HIGHWAY,,GP,4000
3,QUEZON,ATIMONAN,POBLACION (ZONE 1),MAHARLIKA HIGHWAY,,CL,3000
4,QUEZON,ATIMONAN,POBLACION (ZONE 1),MAHARLIKA HIGHWAY,MENDOZA COMPOUND,RR,6000
...,...,...,...,...,...,...,...
4050,QUEZON,TAGKAWAYAN,VICTORIA,ALL LOTS,,A4,117.5
4051,QUEZON,TAGKAWAYAN,VICTORIA,ALL LOTS,,A5,70
4052,QUEZON,TAGKAWAYAN,VICTORIA,ALL LOTS,,A6,225
4053,QUEZON,TAGKAWAYAN,VICTORIA,ALL LOTS,,A10,35


In [75]:
row = df.iloc[106]
row

0    STREET NAME / SUBDIVISION/ CONDOMINIUM
1                                  VICINITY
2                                     CLASS
3                                   3rd Rev
Name: 106, dtype: object

In [82]:
for col_index, cell in enumerate(row):
    print(f"{col_index}: {cell}")

0: STREET NAME / SUBDIVISION/ CONDOMINIUM
1: VICINITY
2: CLASS
3: 3rd Rev


In [86]:
x = re.search(r"ZV.*SQ.*M|3rd\s*Rev", cell, re.IGNORECASE)
print(x)

<re.Match object; span=(0, 7), match='3rd Rev'>


In [97]:
def find_column_headers(df, index):
    headers = {
        'street_name_index': None,
        'vicinity_index': None,
        'classification_index': None,
        'zv_sq_m_index': None
    }
    
    # Search within the next 3 rows for headers
    for offset in range(3):
        current_index = index + offset
        if current_index >= len(df):
            break
        row = df.iloc[current_index]
        
        for col_index, cell in enumerate(row):
            cell_value = str(cell)
            
            if headers['street_name_index'] is None and re.search(r"(STREET NAME|SUBDIVISION|CONDOMINIUM)", cell_value, re.IGNORECASE):
                headers['street_name_index'] = col_index
            
            if headers['vicinity_index'] is None and re.search(r"V.*I.*C.*I.*N.*I.*T.*Y", cell_value, re.IGNORECASE):
                headers['vicinity_index'] = col_index
            
            if headers['classification_index'] is None and re.search(r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N", cell_value, re.IGNORECASE | re.DOTALL):
                headers['classification_index'] = col_index
            
            if headers['zv_sq_m_index'] is None and re.search(r"ZV.*SQ.*M|3rd\s*Rev", cell_value, re.IGNORECASE): 
                headers['zv_sq_m_index'] = col_index
        
        if all(value is not None for value in headers.values()):
            print(f"Headers found at index {current_index}")
            print(f"Header indices: {headers}")
            return True, headers, current_index + 1
    
    return False, None, index

In [98]:
find_column_headers(df, 106)

Headers found at index 106
Header indices: {'street_name_index': 0, 'vicinity_index': 1, 'classification_index': 2, 'zv_sq_m_index': 3}


(True,
 {'street_name_index': 0,
  'vicinity_index': 1,
  'classification_index': 2,
  'zv_sq_m_index': 3},
 107)

In [96]:
bool(0)

False

In [105]:
import pandas as pd
import re

def clean_value(value):
    """Cleans the extracted text by removing unwanted characters."""
    return value.strip() if isinstance(value, str) else value

def test_function():
    """Placeholder function for debugging or logging purposes."""
    pass

def extract_value(pattern, text):
    """
    Searches for the given pattern in the text and returns the matched group.
    """
    match = re.search(pattern, text, re.IGNORECASE)
    if match:
        return match.group(1).strip()
    else:
        return None

def find_column_headers(df, index):
    """
    Searches for the column headers in the DataFrame starting from the given index.
    Returns a tuple (found, header_indices, new_index) where:
    - found: True if headers are found
    - header_indices: dictionary mapping header names to column indices
    - new_index: the index after the headers
    """
    headers = {
        'street_name_index': None,
        'vicinity_index': None,
        'classification_index': None,
        'zv_sq_m_index': None
    }
    
    # Search within the next 3 rows for headers
    for offset in range(3):
        current_index = index + offset
        if current_index >= len(df):
            break
        row = df.iloc[current_index]
        
        for col_index, cell in enumerate(row):
            cell_value = str(cell)
            
            if headers['street_name_index'] is None and re.search(r"(STREET NAME|SUBDIVISION|CONDOMINIUM)", cell_value, re.IGNORECASE):
                headers['street_name_index'] = col_index
            
            if headers['vicinity_index'] is None and re.search(r"V.*I.*C.*I.*N.*I.*T.*Y", cell_value, re.IGNORECASE):
                headers['vicinity_index'] = col_index
            
            if headers['classification_index'] is None and re.search(r"CLASS(?:IFICATION)?|C.*L.*A.*S.*S.*I.*F.*I.*C.*A.*T.*I.*O.*N", cell_value, re.IGNORECASE | re.DOTALL):
                headers['classification_index'] = col_index
            
            if headers['zv_sq_m_index'] is None and re.search(r"ZV.*SQ.*M|3rd\s*Rev", cell_value, re.IGNORECASE): 
                headers['zv_sq_m_index'] = col_index
        
        if all(value is not None for value in headers.values()):
            print(f"Headers found at index {current_index}")
            print(f"Header indices: {headers}")
            return True, headers, current_index + 1
    
    return False, None, index

def check_for_location_components(df, index, proximity_window=5):
    """
    Checks within a proximity window starting from the given index for location components.
    Returns a tuple (found_any, found_all, location_data, new_index)
    - found_any: True if any location component is found
    - found_all: True if all location components are found
    - location_data: Dictionary with location components
    - new_index: The index to continue from
    """
    found_province = False
    found_city = False
    found_barangay = False
    current_province = None
    current_city = None
    current_barangay = None
    
    for offset in range(proximity_window):
        current_index = index + offset
        if current_index >= len(df):
            break
        current_row = df.iloc[current_index]
        combined_current_row = ''.join(map(str, current_row.dropna())).strip()
        
        # Check for Province
        if not found_province:
            province = extract_value(r"province\s*[:\s]?\s*(.*)", combined_current_row)
            if province:
                current_province = clean_value(province)
                found_province = True
                print(f"Province match found in row {current_index}: {current_province}")
        
        # Check for City/Municipality
        if not found_city:
            city = extract_value(r"(?:City|Municipality)(?:\s*/\s*(?:City|Municipality))?\s*[:\s]?\s*(.+)", combined_current_row)
            if city:
                current_city = clean_value(city)
                found_city = True
                print(f"City/Municipality match found in row {current_index}: {current_city}")
        
        # Check for Barangay/Zone
        if not found_barangay:
            barangay = extract_value(r"(?:Barangay|Zone)(?:\s*/\s*(?:Barangay|Zone))?\s*[:\s]?\s*(.+)", combined_current_row)
            if barangay:
                current_barangay = clean_value(barangay)
                found_barangay = True
                print(f"Barangay/Zone match found in row {current_index}: {current_barangay}")
        
        if found_province or found_city or found_barangay:
            # Return the index where location component is found
            location_data = {
                'province': current_province,
                'city': current_city,
                'barangay': current_barangay
            }
            return True, found_province and found_city and found_barangay, location_data, current_index
    # If no location components found
    return False, False, None, index

def main(df):
    """
    Processes the DataFrame to extract structured data based on Province, City/Municipality,
    Barangay/Zone, and additional information like Street/Subdivision, Vicinity, Classification,
    and ZV/SQM.
    
    This version stops the current table processing when it encounters new location components,
    updates the variables accordingly, and begins a new table.
    """
    index = 0
    count = 0
    new_df = pd.DataFrame(columns=['Province', 'City/Municipality', 'Barangay', 
                                   'Street/Subdivision', 'Vicinity', 'Classification', 'ZV/SQM'])
    
    # Initialize variables to store current location information
    current_province = None
    current_city = None
    current_barangay = None
    
    while index < len(df):
        print(f"=================\nROW INDEX: {index}\n=================")
        # Before processing data, check if new location components are found
        found_any, found_all, location_data, new_index = check_for_location_components(df, index)
        if found_any:
            # If new location components are found, update variables
            if location_data['province']:
                current_province = location_data['province']
            if location_data['city']:
                current_city = location_data['city']
            if location_data['barangay']:
                current_barangay = location_data['barangay']
            index = new_index + 1  # Move index past the location component row(s)
            print(f"Updated location info at index {index}: Province={current_province}, City={current_city}, Barangay={current_barangay}")
            
            # Now, find the column headers starting from the current index
            found_headers, header_indices, new_index = find_column_headers(df, index)
            if found_headers:
                index = new_index  # Move index to after headers
                
                # Start reading data rows
                count += 1
                print(f'Processing table {count}\n')
                
                col1_holder = None
                vicinity_holder = None
                while index < len(df):
                    row = df.iloc[index]
                    
                    # Before processing the row, check if new location components are found
                    found_any, _, location_data, loc_index = check_for_location_components(df, index)
                    if found_any:
                        # New location components found, end current table processing
                        print(f"New location components found at index {index}. Ending current table.")
                        break  # Exit the data row processing loop to handle the new table
                    
                    # Extract data using the header indices
                    col1 = row.iloc[header_indices['street_name_index']]
                    vicinity = row.iloc[header_indices['vicinity_index']]
                    classification = row.iloc[header_indices['classification_index']]
                    zv = row.iloc[header_indices['zv_sq_m_index']]
                    print(f"Data row at index {index}: {[col1, vicinity, classification, zv]}")
                    
                    # Check if both classification and ZV/SQM are empty
                    if (pd.isnull(classification) or str(classification).strip() == '') and \
                       (pd.isnull(zv) or str(zv).strip() == ''):
                        index += 1
                        continue  # Skip this row as it doesn't contain valid data
                    
                    # Handle missing values by carrying forward the last known values
                    if pd.isnull(col1) or str(col1).strip() == '':
                        col1 = col1_holder
                    else:
                        col1_holder = col1
                        
                    if pd.isnull(vicinity) or str(vicinity).strip() == '':
                        vicinity = vicinity_holder
                    else:
                        vicinity_holder = vicinity
                    
                    # Append the extracted data to new_df
                    new_df.loc[len(new_df)] = [
                        current_province, 
                        current_city, 
                        current_barangay, 
                        clean_value(col1), 
                        clean_value(vicinity), 
                        clean_value(classification), 
                        clean_value(zv)
                    ]
                    
                    index += 1  # Move to the next row
                
                # After processing data rows, continue to the next iteration to find new tables
                continue
            else:
                # Headers not found, cannot process table, move to next index
                index += 1
                continue
        else:
            # No new location components found, move to next index
            index += 1
    
    print(f"Total tables processed: {count}")
    return new_df


In [106]:
for excel in excel_files[22:23]:
    df = xls_to_df(excel)
    processed = main(df)
    processed.to_excel(f"Output/{excel}.xlsx")

ROW INDEX: 0
ROW INDEX: 1
ROW INDEX: 2
ROW INDEX: 3
ROW INDEX: 4
ROW INDEX: 5
ROW INDEX: 6
ROW INDEX: 7
ROW INDEX: 8
ROW INDEX: 9
ROW INDEX: 10
ROW INDEX: 11
ROW INDEX: 12
ROW INDEX: 13
Barangay/Zone match found in row 17: s or areas and shall determine the fair market value of real properties located in each zone or
Updated location info at index 18: Province=None, City=None, Barangay=s or areas and shall determine the fair market value of real properties located in each zone or
ROW INDEX: 19
ROW INDEX: 20
ROW INDEX: 21
ROW INDEX: 22
ROW INDEX: 23
ROW INDEX: 24
City/Municipality match found in row 28: assessor and (2) the gross selling price/consideration as shown in the
Updated location info at index 29: Province=None, City=assessor and (2) the gross selling price/consideration as shown in the, Barangay=s or areas and shall determine the fair market value of real properties located in each zone or
ROW INDEX: 30
ROW INDEX: 31
ROW INDEX: 32
ROW INDEX: 33
ROW INDEX: 34
ROW INDEX: 35
ROW

In [107]:
processed

Unnamed: 0,Province,City/Municipality,Barangay,Street/Subdivision,Vicinity,Classification,ZV/SQM
0,QUEZON,assessor and (2) the gross selling price/consi...,POBLACION (ZONE 1)D.O. No.037-2022,MAHARLIKA HIGHWAY,,CR,7800.0
1,QUEZON,assessor and (2) the gross selling price/consi...,POBLACION (ZONE 1)D.O. No.037-2022,MAHARLIKA HIGHWAY,,RR,6750.0
2,QUEZON,assessor and (2) the gross selling price/consi...,POBLACION (ZONE 1)D.O. No.037-2022,MAHARLIKA HIGHWAY,,GP,4000.0
3,QUEZON,assessor and (2) the gross selling price/consi...,POBLACION (ZONE 1)D.O. No.037-2022,MAHARLIKA HIGHWAY,,CL,3000.0
4,QUEZON,assessor and (2) the gross selling price/consi...,POBLACION (ZONE 1)D.O. No.037-2022,MAHARLIKA HIGHWAY,MENDOZA COMPOUND,RR,6000.0
...,...,...,...,...,...,...,...
2493,QUEZON,assessor and (2) the gross selling price/consi...,VICTORIA,ALL LOTS,,A4,117.5
2494,QUEZON,assessor and (2) the gross selling price/consi...,VICTORIA,ALL LOTS,,A5,70
2495,QUEZON,assessor and (2) the gross selling price/consi...,VICTORIA,ALL LOTS,,A6,225
2496,QUEZON,assessor and (2) the gross selling price/consi...,VICTORIA,ALL LOTS,,A10,35
