In [1]:
import os
import gzip
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from bs4 import BeautifulSoup

In [2]:
def extract_info_from_html(file_path):
    """Extract all relevant information from a gzipped HTML file."""
    if file_path.name.endswith('.html.gz'):
        with gzip.open(file_path, 'rb') as f:
            html_bytes = f.read()
        html_content = html_bytes.decode('utf-8', errors='replace')
    else:
        # (This block won't be used in our case.)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    
    soup = BeautifulSoup(html_content, 'html.parser')
    
    data = {}
    
    basic_fields = {
        'state': 'ctl00_ContentPlaceHolder1_lblstate',
        'district': 'ctl00_ContentPlaceHolder1_lbldistrict',
        'block': 'ctl00_ContentPlaceHolder1_lblblock',
        'panchayat': 'ctl00_ContentPlaceHolder1_lblpanchayat',
        'sa_start_date': 'ctl00_ContentPlaceHolder1_lblSA_start_dt',
        'sa_end_date': 'ctl00_ContentPlaceHolder1_lblSA_end_dt',
        'gram_sabha_date': 'ctl00_ContentPlaceHolder1_lblGramSabha_dt',
        'public_hearing_date': 'ctl00_ContentPlaceHolder1_lblPublic_Hearing_dt'
    }
    for key, id_value in basic_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Financial Information
    financial_fields = {
        'sa_period_from': 'ctl00_ContentPlaceHolder1_lblSA_Period_From_Date',
        'sa_period_to': 'ctl00_ContentPlaceHolder1_lblSA_Period_To_Date',
        'wage_exp': 'ctl00_ContentPlaceHolder1_lblWage_exp',
        'material_exp': 'ctl00_ContentPlaceHolder1_lblmat_exp',
        'total_exp': 'ctl00_ContentPlaceHolder1_lbltotal_expen',
        'wage_given': 'ctl00_ContentPlaceHolder1_lblwage_given',
        'material_given': 'ctl00_ContentPlaceHolder1_lblmat_given',
        'total_given': 'ctl00_ContentPlaceHolder1_lbltotal_record_given'
    }
    for key, id_value in financial_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Work Information
    work_fields = {
        'total_works': 'ctl00_ContentPlaceHolder1_lbltot_work',
        'total_households': 'ctl00_ContentPlaceHolder1_lbltot_hh',
        'works_verified': 'ctl00_ContentPlaceHolder1_lbltot_work_verified',
        'households_verified': 'ctl00_ContentPlaceHolder1_lbltot_hh_verified',
        'gram_sabha_participants': 'ctl00_ContentPlaceHolder1_lblno_of_ppl_participated_gs'
    }
    for key, id_value in work_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Expense Information
    expense_fields = {
        'printing_expense': 'ctl00_ContentPlaceHolder1_lblprinting_expense',
        'videography_expense': 'ctl00_ContentPlaceHolder1_lblvideography_expense',
        'tea_expense': 'ctl00_ContentPlaceHolder1_lbltea_expense',
        'vrp_training_expense': 'ctl00_ContentPlaceHolder1_lblvrp_training_expense',
        'vrp_travel_expense': 'ctl00_ContentPlaceHolder1_lblvrp_travel_expense',
        'photocopying_expense': 'ctl00_ContentPlaceHolder1_lblphotocopying_expense',
        'other_expense': 'ctl00_ContentPlaceHolder1_lblother_expense',
        'vrp_honorarium_expense': 'ctl00_ContentPlaceHolder1_lblvrp_honorium_expense',
        'stationary_expense': 'ctl00_ContentPlaceHolder1_lblstationary_expense',
        'publicity_expense': 'ctl00_ContentPlaceHolder1_lblpublicity_expense',
        'mic_expense': 'ctl00_ContentPlaceHolder1_lblmic_expense',
        'photography_expense': 'ctl00_ContentPlaceHolder1_lblphotography_expense',
        'shamiana_expense': 'ctl00_ContentPlaceHolder1_lblshamiana_expense',
        'total_expense': 'ctl00_ContentPlaceHolder1_lbltotal_expense'
    }
    for key, id_value in expense_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Checklist Information
    checklist_fields = {
        'job_cards_with_people': 'ctl00_ContentPlaceHolder1_Label1',
        'job_cards_updated': 'ctl00_ContentPlaceHolder1_Label3',
        'job_cards_renewed': 'ctl00_ContentPlaceHolder1_Label4',
        'demand_registration_process': 'ctl00_ContentPlaceHolder1_Label2',
        'unmet_demand': 'ctl00_ContentPlaceHolder1_Label29',
        'payment_agency_problems': 'ctl00_ContentPlaceHolder1_Label30'
    }
    for key, id_value in checklist_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    data['source_file'] = file_path.name

    return data

def process_html_folder(folder_path):
    """
    Process all gzipped HTML files (.html.gz) in the given folder and return a DataFrame.
    """
    folder = Path(folder_path)
    html_files = list(folder.glob('*.html.gz'))
    
    print(f"Found {len(html_files)} HTML.gz files to process")
    results = []
    
    for file_path in tqdm(html_files, desc="Processing HTML.gz files"):
        try:
            data = extract_info_from_html(file_path)
            results.append(data)
        except Exception as e:
            print(f"\nError processing {file_path}: {str(e)}")
    
    df = pd.DataFrame(results)
    
    output_file = '../data/final_audit_results.csv'
    df.to_csv(output_file, index=False)
    print(f"\nSaved final results to {output_file}")
    print(f"Total records processed: {len(df)}")
    
    return df

In [3]:
html_folder = "../data/html"
df = process_html_folder(html_folder)

print(df.info())

Found 17535 HTML.gz files to process


Processing HTML.gz files: 100%|███████████| 17535/17535 [15:31<00:00, 18.83it/s]



Saved final results to ../data/final_audit_results.csv
Total records processed: 17535
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17535 entries, 0 to 17534
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   state                        17445 non-null  object
 1   district                     17445 non-null  object
 2   block                        17445 non-null  object
 3   panchayat                    17445 non-null  object
 4   sa_start_date                17445 non-null  object
 5   sa_end_date                  17445 non-null  object
 6   gram_sabha_date              17445 non-null  object
 7   public_hearing_date          17445 non-null  object
 8   sa_period_from               17445 non-null  object
 9   sa_period_to                 17445 non-null  object
 10  wage_exp                     17445 non-null  object
 11  material_exp                 17445 non-null  object
 12  t

In [4]:
df.head()

Unnamed: 0,state,district,block,panchayat,sa_start_date,sa_end_date,gram_sabha_date,public_hearing_date,sa_period_from,sa_period_to,...,photography_expense,shamiana_expense,total_expense,job_cards_with_people,job_cards_updated,job_cards_renewed,demand_registration_process,unmet_demand,payment_agency_problems,source_file
0,RAJASTHAN,BANSWARA,SAJJANGARH,खुन्दनी हाला,11/09/2023,16/09/2023,21/09/2023,21/09/2023,01/04/2022,31/03/2023,...,0,0,300,Greater than 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,27_2728_2728007_2728007279_2022-2023_9_21_2023...
1,ANDHRA PRADESH,ANAKAPALLI,Nakkapalli,Pedateenarla,28/09/2021,02/10/2021,02/10/2021,08/10/2021,01/04/2019,31/03/2020,...,0,15,1818,Greater than 75%,Yes,Yes,Yes,"No, people get work when they want it",No,02_0215_0203039_0203039031_2019-2020_10_2_2021...
2,RAJASTHAN,ALWAR,NEEMRANA,रोडवाल,09/08/2024,14/08/2024,16/08/2024,16/08/2024,01/04/2023,31/03/2024,...,0,0,0,Between 50% and 75%,Yes,Yes,Some,"Yes, Some Demand",No,27_2706_2706008_2706008272_2023-2024_8_16_2024...
3,RAJASTHAN,BARAN,CHHABARA,कोटडापार,14/09/2024,19/09/2024,20/09/2024,20/09/2024,01/04/2023,31/03/2024,...,0,0,3000,Between 50% and 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,27_2731_2731006_2731006164_2023-2024_9_20_2024...
4,RAJASTHAN,BARMER,BARMER,बलाऊ,01/03/2020,05/03/2020,05/03/2020,05/03/2020,01/04/2019,30/09/2019,...,0,0,1125,Greater than 75%,Yes,Yes,Yes,"No, people get work when they want it",No,27_2707_2717002_2717002120_2019-2020_3_5_2020_...


In [5]:
df.to_csv("../final_audit_results.csv", index = False)