In [21]:
import os
import gzip
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from bs4 import BeautifulSoup

In [19]:
def extract_info_from_html(file_path):
    """Extract all relevant information from a gzipped HTML file."""
    if file_path.name.endswith('.html.gz'):
        with gzip.open(file_path, 'rb') as f:
            html_bytes = f.read()
        html_content = html_bytes.decode('utf-8', errors='replace')
    else:
        # (This block won't be used in our case.)
        with open(file_path, 'r', encoding='utf-8') as f:
            html_content = f.read()
    
    soup = BeautifulSoup(html_content, 'html.parser')
    
    data = {}
    
    basic_fields = {
        'state': 'ctl00_ContentPlaceHolder1_lblstate',
        'district': 'ctl00_ContentPlaceHolder1_lbldistrict',
        'block': 'ctl00_ContentPlaceHolder1_lblblock',
        'panchayat': 'ctl00_ContentPlaceHolder1_lblpanchayat',
        'sa_start_date': 'ctl00_ContentPlaceHolder1_lblSA_start_dt',
        'sa_end_date': 'ctl00_ContentPlaceHolder1_lblSA_end_dt',
        'gram_sabha_date': 'ctl00_ContentPlaceHolder1_lblGramSabha_dt',
        'public_hearing_date': 'ctl00_ContentPlaceHolder1_lblPublic_Hearing_dt'
    }
    for key, id_value in basic_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Financial Information
    financial_fields = {
        'sa_period_from': 'ctl00_ContentPlaceHolder1_lblSA_Period_From_Date',
        'sa_period_to': 'ctl00_ContentPlaceHolder1_lblSA_Period_To_Date',
        'wage_exp': 'ctl00_ContentPlaceHolder1_lblWage_exp',
        'material_exp': 'ctl00_ContentPlaceHolder1_lblmat_exp',
        'total_exp': 'ctl00_ContentPlaceHolder1_lbltotal_expen',
        'wage_given': 'ctl00_ContentPlaceHolder1_lblwage_given',
        'material_given': 'ctl00_ContentPlaceHolder1_lblmat_given',
        'total_given': 'ctl00_ContentPlaceHolder1_lbltotal_record_given'
    }
    for key, id_value in financial_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Work Information
    work_fields = {
        'total_works': 'ctl00_ContentPlaceHolder1_lbltot_work',
        'total_households': 'ctl00_ContentPlaceHolder1_lbltot_hh',
        'works_verified': 'ctl00_ContentPlaceHolder1_lbltot_work_verified',
        'households_verified': 'ctl00_ContentPlaceHolder1_lbltot_hh_verified',
        'gram_sabha_participants': 'ctl00_ContentPlaceHolder1_lblno_of_ppl_participated_gs'
    }
    for key, id_value in work_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Expense Information
    expense_fields = {
        'printing_expense': 'ctl00_ContentPlaceHolder1_lblprinting_expense',
        'videography_expense': 'ctl00_ContentPlaceHolder1_lblvideography_expense',
        'tea_expense': 'ctl00_ContentPlaceHolder1_lbltea_expense',
        'vrp_training_expense': 'ctl00_ContentPlaceHolder1_lblvrp_training_expense',
        'vrp_travel_expense': 'ctl00_ContentPlaceHolder1_lblvrp_travel_expense',
        'photocopying_expense': 'ctl00_ContentPlaceHolder1_lblphotocopying_expense',
        'other_expense': 'ctl00_ContentPlaceHolder1_lblother_expense',
        'vrp_honorarium_expense': 'ctl00_ContentPlaceHolder1_lblvrp_honorium_expense',
        'stationary_expense': 'ctl00_ContentPlaceHolder1_lblstationary_expense',
        'publicity_expense': 'ctl00_ContentPlaceHolder1_lblpublicity_expense',
        'mic_expense': 'ctl00_ContentPlaceHolder1_lblmic_expense',
        'photography_expense': 'ctl00_ContentPlaceHolder1_lblphotography_expense',
        'shamiana_expense': 'ctl00_ContentPlaceHolder1_lblshamiana_expense',
        'total_expense': 'ctl00_ContentPlaceHolder1_lbltotal_expense'
    }
    for key, id_value in expense_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    # Checklist Information
    checklist_fields = {
        'job_cards_with_people': 'ctl00_ContentPlaceHolder1_Label1',
        'job_cards_updated': 'ctl00_ContentPlaceHolder1_Label3',
        'job_cards_renewed': 'ctl00_ContentPlaceHolder1_Label4',
        'demand_registration_process': 'ctl00_ContentPlaceHolder1_Label2',
        'unmet_demand': 'ctl00_ContentPlaceHolder1_Label29',
        'payment_agency_problems': 'ctl00_ContentPlaceHolder1_Label30'
    }
    for key, id_value in checklist_fields.items():
        element = soup.find('span', id=id_value)
        data[key] = element.text.strip() if element else None

    data['source_file'] = file_path.name

    return data

def process_html_folder(folder_path):
    """
    Process all gzipped HTML files (.html.gz) in the given folder and return a DataFrame.
    """
    folder = Path(folder_path)
    html_files = list(folder.glob('*.html.gz'))
    
    print(f"Found {len(html_files)} HTML.gz files to process")
    results = []
    
    for file_path in tqdm(html_files, desc="Processing HTML.gz files"):
        try:
            data = extract_info_from_html(file_path)
            results.append(data)
        except Exception as e:
            print(f"\nError processing {file_path}: {str(e)}")
    
    df = pd.DataFrame(results)
    
    output_file = '../data/final_audit_results.csv'
    df.to_csv(output_file, index=False)
    print(f"\nSaved final results to {output_file}")
    print(f"Total records processed: {len(df)}")
    
    return df

In [20]:
html_folder = "../data/html"
df = process_html_folder(html_folder)

print(df.info())

Found 5746 HTML.gz files to process


Processing HTML.gz files: 100%|█████████████| 5746/5746 [05:11<00:00, 18.43it/s]



Saved final results to final_audit_results.csv
Total records processed: 5746
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5746 entries, 0 to 5745
Data columns (total 42 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   state                        5735 non-null   object
 1   district                     5735 non-null   object
 2   block                        5735 non-null   object
 3   panchayat                    5735 non-null   object
 4   sa_start_date                5735 non-null   object
 5   sa_end_date                  5735 non-null   object
 6   gram_sabha_date              5735 non-null   object
 7   public_hearing_date          5735 non-null   object
 8   sa_period_from               5735 non-null   object
 9   sa_period_to                 5735 non-null   object
 10  wage_exp                     5735 non-null   object
 11  material_exp                 5735 non-null   object
 12  total_exp   

In [22]:
df.head()

Unnamed: 0,state,district,block,panchayat,sa_start_date,sa_end_date,gram_sabha_date,public_hearing_date,sa_period_from,sa_period_to,...,photography_expense,shamiana_expense,total_expense,job_cards_with_people,job_cards_updated,job_cards_renewed,demand_registration_process,unmet_demand,payment_agency_problems,source_file
0,ANDHRA PRADESH,ANAKAPALLI,Nakkapalli,Pedateenarla,28/09/2021,02/10/2021,02/10/2021,08/10/2021,01/04/2019,31/03/2020,...,0,15.0,1818.0,Greater than 75%,Yes,Yes,Yes,"No, people get work when they want it",No,02_0215_0203039_0203039031_2019-2020_10_2_2021...
1,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Ananthagiri,Pinakota,11/06/2022,16/06/2022,16/06/2022,25/06/2022,01/04/2020,31/03/2021,...,0,229.16,6618.92,Between 50% and 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,02_0214_0203006_0203006022_2020-2021_6_16_2022...
2,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Dumbriguda,Lygonda,28/10/2022,31/10/2022,31/10/2022,05/11/2022,01/09/2022,10/09/2022,...,0,0.0,0.0,Greater than 75%,Yes,Yes,Mostly,"Yes, Some Demand",No,02_0214_0203004_0203004017_2021-2022_10_31_202...
3,ANDHRA PRADESH,ALLURI SITHARAMA RAJU,Addateegala,DORAMAMIDI,15/05/2022,22/05/2022,23/05/2022,26/05/2022,01/04/2020,31/03/2021,...,0,229.16,6618.92,Between 50% and 75%,Yes,Yes,No,"Yes, Huge Demand",No,02_0214_0204003_0204003010_2020-2021_5_23_2022...
4,ANDHRA PRADESH,ANAKAPALLI,Madugula,Avuruvada,11/07/2022,18/07/2022,18/07/2022,03/09/2022,01/04/2019,31/03/2020,...,0,33.0,2231.0,Greater than 75%,Yes,Yes,Yes,"Yes, Some Demand",No,02_0215_0203009_0203009027_2019-2020_7_18_2022...


In [23]:
df[df['state']=='RAJASTHAN'].shape

(222, 42)

In [32]:
(pd.to_numeric(df['total_works']) - pd.to_numeric(df['works_verified'])).describe()

count    5735.000000
mean       23.531125
std        65.484617
min      -168.000000
25%         0.000000
50%         0.000000
75%        11.000000
max      1021.000000
dtype: float64

In [36]:
(pd.to_numeric(df['total_households']) - pd.to_numeric(df['households_verified'])).describe()

count    5735.000000
mean       55.769834
std       188.733926
min      -655.000000
25%         0.000000
50%         8.000000
75%        23.000000
max      3116.000000
dtype: float64