In [1]:
from bs4 import BeautifulSoup
import pandas as pd
from pathlib import Path
import re
from tqdm import tqdm

def extract_info_from_html(file_path):
    """Extract all relevant information from a single HTML file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    
    # Initialize data dictionary
    data = {}
    
    # Basic Information
    basic_fields = {
        'state': 'ctl00_ContentPlaceHolder1_lblstate',
        'district': 'ctl00_ContentPlaceHolder1_lbldistrict',
        'block': 'ctl00_ContentPlaceHolder1_lblblock',
        'panchayat': 'ctl00_ContentPlaceHolder1_lblpanchayat',
        'sa_start_date': 'ctl00_ContentPlaceHolder1_lblSA_start_dt',
        'sa_end_date': 'ctl00_ContentPlaceHolder1_lblSA_end_dt',
        'gram_sabha_date': 'ctl00_ContentPlaceHolder1_lblGramSabha_dt',
        'public_hearing_date': 'ctl00_ContentPlaceHolder1_lblPublic_Hearing_dt'
    }
    
    # Extract basic fields
    for key, id_value in basic_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None
    
    # Financial Information
    financial_fields = {
        'sa_period_from': 'ctl00_ContentPlaceHolder1_lblSA_Period_From_Date',
        'sa_period_to': 'ctl00_ContentPlaceHolder1_lblSA_Period_To_Date',
        'wage_exp': 'ctl00_ContentPlaceHolder1_lblWage_exp',
        'material_exp': 'ctl00_ContentPlaceHolder1_lblmat_exp',
        'total_exp': 'ctl00_ContentPlaceHolder1_lbltotal_expen',
        'wage_given': 'ctl00_ContentPlaceHolder1_lblwage_given',
        'material_given': 'ctl00_ContentPlaceHolder1_lblmat_given',
        'total_given': 'ctl00_ContentPlaceHolder1_lbltotal_record_given'
    }
    
    # Extract financial fields
    for key, id_value in financial_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None
    
    # Work Information
    work_fields = {
        'total_works': 'ctl00_ContentPlaceHolder1_lbltot_work',
        'total_households': 'ctl00_ContentPlaceHolder1_lbltot_hh',
        'works_verified': 'ctl00_ContentPlaceHolder1_lbltot_work_verified',
        'households_verified': 'ctl00_ContentPlaceHolder1_lbltot_hh_verified',
        'gram_sabha_participants': 'ctl00_ContentPlaceHolder1_lblno_of_ppl_participated_gs'
    }
    
    # Extract work fields
    for key, id_value in work_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None
    
    # Extract expenses
    expense_fields = {
        'printing_expense': 'ctl00_ContentPlaceHolder1_lblprinting_expense',
        'videography_expense': 'ctl00_ContentPlaceHolder1_lblvideography_expense',
        'tea_expense': 'ctl00_ContentPlaceHolder1_lbltea_expense',
        'vrp_training_expense': 'ctl00_ContentPlaceHolder1_lblvrp_training_expense',
        'vrp_travel_expense': 'ctl00_ContentPlaceHolder1_lblvrp_travel_expense',
        'photocopying_expense': 'ctl00_ContentPlaceHolder1_lblphotocopying_expense',
        'other_expense': 'ctl00_ContentPlaceHolder1_lblother_expense',
        'vrp_honorarium_expense': 'ctl00_ContentPlaceHolder1_lblvrp_honorium_expense',
        'stationary_expense': 'ctl00_ContentPlaceHolder1_lblstationary_expense',
        'publicity_expense': 'ctl00_ContentPlaceHolder1_lblpublicity_expense',
        'mic_expense': 'ctl00_ContentPlaceHolder1_lblmic_expense',
        'photography_expense': 'ctl00_ContentPlaceHolder1_lblphotography_expense',
        'shamiana_expense': 'ctl00_ContentPlaceHolder1_lblshamiana_expense',
        'total_expense': 'ctl00_ContentPlaceHolder1_lbltotal_expense'
    }
    
    # Extract expense fields
    for key, id_value in expense_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None
    
    # Extract checklist responses
    checklist_fields = {
        'job_cards_with_people': 'ctl00_ContentPlaceHolder1_Label1',
        'job_cards_updated': 'ctl00_ContentPlaceHolder1_Label3',
        'job_cards_renewed': 'ctl00_ContentPlaceHolder1_Label4',
        'demand_registration_process': 'ctl00_ContentPlaceHolder1_Label2',
        'unmet_demand': 'ctl00_ContentPlaceHolder1_Label29',
        'payment_agency_problems': 'ctl00_ContentPlaceHolder1_Label30'
    }
    
    # Extract checklist fields
    for key, id_value in checklist_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None
    
    # Add source file
    data['source_file'] = file_path.name
    
    return data

def process_html_folder(folder_path):
    """Process all HTML files in the given folder."""
    folder = Path(folder_path)
    html_files = list(folder.glob('*.html'))
    
    print(f"Found {len(html_files)} HTML files to process")
    results = []
    
    for file_path in tqdm(html_files, desc="Processing HTML files"):
        try:
            data = extract_info_from_html(file_path)
            results.append(data)
            
            # Save intermediate results every 100 files
            if len(results) % 100 == 0:
                pd.DataFrame(results).to_csv(f'intermediate_results_{len(results)}.csv', index=False)
                print(f"\nSaved intermediate results for {len(results)} files")
                
        except Exception as e:
            print(f"\nError processing {file_path}: {str(e)}")
    
    # Create final DataFrame
    df = pd.DataFrame(results)
    
    # Save final results
    output_file = 'final_audit_results.csv'
    df.to_csv(output_file, index=False)
    print(f"\nSaved final results to {output_file}")
    print(f"Total records processed: {len(df)}")
    
    return df

# Usage
if __name__ == "__main__":
    html_folder = "html"  # Replace with your folder path
    df = process_html_folder(html_folder)
    
    # Display summary
    print("\nDataFrame Summary:")
    print(df.info())

Found 527 HTML files to process


Processing HTML files:  19%|███▍              | 102/527 [00:05<00:23, 17.94it/s]


Saved intermediate results for 100 files


Processing HTML files:  39%|██████▉           | 203/527 [00:10<00:17, 18.20it/s]


Saved intermediate results for 200 files


Processing HTML files:  57%|██████████▎       | 303/527 [00:15<00:11, 18.77it/s]


Saved intermediate results for 300 files


Processing HTML files:  76%|█████████████▋    | 402/527 [00:20<00:07, 17.85it/s]


Saved intermediate results for 400 files


Processing HTML files:  95%|█████████████████▏| 503/527 [00:25<00:01, 19.20it/s]


Saved intermediate results for 500 files


Processing HTML files: 100%|██████████████████| 527/527 [00:26<00:00, 19.57it/s]



Saved final results to final_audit_results.csv
Total records processed: 527

DataFrame Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   state                        524 non-null    object
 1   district                     524 non-null    object
 2   block                        524 non-null    object
 3   panchayat                    524 non-null    object
 4   sa_start_date                524 non-null    object
 5   sa_end_date                  524 non-null    object
 6   gram_sabha_date              524 non-null    object
 7   public_hearing_date          524 non-null    object
 8   sa_period_from               524 non-null    object
 9   sa_period_to                 524 non-null    object
 10  wage_exp                     524 non-null    object
 11  material_exp                 524 non-null    object


In [2]:
df.head()

Unnamed: 0,state,district,block,panchayat,sa_start_date,sa_end_date,gram_sabha_date,public_hearing_date,sa_period_from,sa_period_to,...,photography_expense,shamiana_expense,total_expense,job_cards_with_people,job_cards_updated,job_cards_renewed,demand_registration_process,unmet_demand,payment_agency_problems,source_file
0,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Araku Valley,Chompi,15/12/2022,17/12/2022,17/12/2022,19/12/2022,01/04/2021,31/03/2022,...,0,0.0,4955.81,Greater than 75%,Yes,Yes,Yes,"Yes, Some Demand",No,02_0214_0203005_0203005002_2021-2022_12_17_202...
1,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Chintapalle,KUDUMUSARI,12/10/2023,15/10/2023,15/10/2023,21/10/2023,01/04/2022,31/03/2023,...,0,0.0,7733.0,Greater than 75%,No,Yes,Mostly,"Yes, Some Demand",No,02_0214_0203012_0203012003_2022-2023_10_15_202...
2,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Chintapalle,Lothugedda,13/03/2021,18/03/2021,19/03/2021,07/12/2020,01/04/2019,31/03/2020,...,0,294.11,3698.73,Greater than 75%,Yes,Yes,Some,"No, people get work when they want it",No,02_0214_0203012_0203012007_2019-2020_3_19_2021...
3,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Araku Valley,Madagada,31/12/2020,09/01/2021,09/01/2021,11/01/2021,01/04/2019,31/03/2020,...,0,0.0,4052.96,Greater than 75%,Yes,Yes,No,"No, people get work when they want it",No,02_0214_0203005_0203005009_2019-2020_1_9_2021_...
4,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Addateegala,THIMMAPURAM,29/08/2018,01/09/2018,02/09/2018,04/09/2018,01/04/2017,31/03/2018,...,0,0.0,6582.0,Greater than 75%,Yes,Yes,Yes,"Yes, Some Demand","Yes, Some Problems",02_0214_0204003_0204003019_2017-2018_9_2_2018_...
