In [None]:
import json
import logging
import time
from collections import defaultdict

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoAlertPresentException,
    NoSuchElementException,
    TimeoutException,
    UnexpectedAlertPresentException,
    WebDriverException,
)
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By

## Declare the Node Class to Store Data

In [None]:
class Node:
    """One entry in the scraped hierarchy tree.

    Attributes:
        name: Label of the node; also the key under which a parent stores it.
        parent: Whatever the caller passes or assigns; it is not serialised.
        properties: Free-form dict of extra values (the scraper stores ``'id'``).
        children: Child nodes keyed by ``child.name``. Adding a second child
            with the same name replaces the first.
        completed: Flag set by :meth:`mark_complete`.
        level: Level label of the node, ``'Root'`` by default.
        remarks: Optional note, emitted under the ``"remarks"`` key.
    """

    def __init__(self, name, level='Root', parent=None):
        self.name = name
        self.parent = parent
        self.properties = {}
        self.children = {}
        self.completed = False
        self.level = level
        self.remarks = None

    @property
    def remrks(self):
        """Backward-compatible alias for the originally misspelled ``remarks``."""
        return self.remarks

    @remrks.setter
    def remrks(self, value):
        self.remarks = value

    def mark_complete(self):
        """Flag this node as completed."""
        self.completed = True

    def add_child(self, child_node):
        """Store ``child_node`` under its name, replacing any same-named child."""
        self.children[child_node.name] = child_node

    def find_child_by_name(self, name):
        """Return the direct child called ``name``, or ``None`` if there is none."""
        return self.children.get(name, None)

    def to_dict(self):
        """Return this node and its descendants as plain nested dicts.

        The ``"children"`` key is only present when the node has children;
        ``parent`` is never included.
        """
        node_dict = {
            "name": self.name,
            "level": self.level,
            "properties": self.properties,
            "completed": self.completed,
            "remarks": self.remarks
        }
        if self.children:
            node_dict["children"] = [child_node.to_dict() for child_node in self.children.values()]
        return node_dict

    def __str__(self):
        return self.name

In [None]:
# Route INFO-and-above records to a UTF-8 log file, one
# "timestamp - level - message" line per record.
logging.basicConfig(
    filename='Scrapping_UP.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    encoding='utf-8',
)

## Set WebDriver to Fetch All District Info (First Level)

In [None]:
# Set up the WebDriver (make sure to specify your path if needed)
driver = webdriver.ChromiumEdge()  # or webdriver.Firefox()

# (district_name, dist_code) pairs scraped from the landing page. Bound before
# the try block so the name exists (as an empty list) even when the page load
# fails before any row is read.
districts = []
try:
    # Open the main page
    driver.get("https://upbhulekh.gov.in/GeoDashboard/public_report/")

    # Fixed pause to give the page time to load
    time.sleep(5)

    # Dismiss a pop-up alert if one is showing
    try:
        Alert(driver).accept()
        logging.info("Alert accepted.")
    except (NoAlertPresentException, UnexpectedAlertPresentException):
        # Selenium signals "nothing to accept" with NoAlertPresentException;
        # that is an expected outcome here, not an error.
        logging.warning("No alert to accept.")
    except Exception as e:
        logging.error(f"Error while handling alert: {e}")

    # Collect every table body row on the page
    rows = driver.find_elements(By.CSS_SELECTOR, 'table tbody tr')

    # Extract district names and distCodes
    for row in rows:
        try:
            cells = row.find_elements(By.TAG_NAME, 'td')
            if cells:
                district_name = cells[1].text  # the district is in the second column
                dist_code_link = cells[1].find_element(By.TAG_NAME, 'a')
                # The code is whatever follows the last '=' in the link target
                dist_code = dist_code_link.get_attribute('href').split('=')[-1]
                districts.append((district_name, dist_code))
        except NoSuchElementException as e:
            logging.error(f'Error finding elements in row: {e}')
        except Exception as e:
            logging.error(f'Unexpected error processing row: {e}')
except WebDriverException as e:
    logging.critical(f'WebDriver error: {e}')
except Exception as e:
    logging.critical(f'Unexpected error during setup: {e}')

### Declare the Main Node -- Start Processing Tehsil Level / District (Second Level)

In [None]:
# Root of the district -> tehsil -> village tree
mainRoot = Node('MainRoot')
# (district_name, dist_code) -> [(tesh_code, tesh_name), ...]; read by the village-level cell
teshEntry = defaultdict(list)
try:
    # enumerate supplies the running position directly instead of searching
    # the list for the current entry on every iteration
    for distPosition, dist in enumerate(districts, start=1):
        distNode = Node(dist[0], 'district')
        distNode.properties['id'] = dist[1]
        mainRoot.add_child(distNode)
        logging.info(f'Start Processing {dist[0]} for Teshil Info {distPosition} / {len(districts)}')

        try:
            driver.get(f'https://upbhulekh.gov.in/GeoDashboard/public_report/?distCode={dist[1]}')
            time.sleep(2)  # fixed pause for the district page to render
            rowTesh = driver.find_elements(By.CSS_SELECTOR, 'table tbody tr')
            if not rowTesh:
                logging.warning(f'No Teshil rows found for district {dist[0]}, Code ({dist[1]})')
                # Nothing to parse; do not report the district as completed
                continue
            teshCounter = 1
            for rT in rowTesh:
                try:
                    tCell = rT.find_elements(By.TAG_NAME, 'td')
                    if tCell:
                        tesh_name = tCell[2].text  # the tehsil is read from the third column
                        tesh_code_link = tCell[2].find_element(By.TAG_NAME, 'a')
                        # The code is whatever follows the last '=' in the link target
                        tesh_code = tesh_code_link.get_attribute('href').split('=')[-1]
                        logging.info(f'District {dist[0]} --> Found Teshil: {tesh_name}, Code: {tesh_code}  --> {teshCounter} / {len(rowTesh)-1}')
                        teshCounter += 1
                        teshNode = Node(tesh_name, 'teshil')
                        teshNode.properties['id'] = tesh_code
                        teshNode.parent = dist[0]
                        distNode.add_child(teshNode)
                        teshEntry[(dist[0], dist[1])].append((tesh_code, tesh_name))
                except NoSuchElementException as e:
                    logging.error(f'Teshil info not found in row for district {dist[0]}: {e}')
                except Exception as e:
                    logging.error(f'Error processing row for district {dist[0]}: {e}')
            # teshCounter starts at 1, so it reaches len(rowTesh) once all rows but one
            # were parsed. NOTE(review): presumably one row is a header/total row - confirm.
            if teshCounter >= len(rowTesh):
                logging.info(f'Process Completed for District {dist[0]} at Teshil Level')
            else:
                logging.warning(f'Process not Completed for District {dist[0]} at Teshil Level -- Some issue is present for particular district')
        except TimeoutException as e:
            logging.error(f'Timeout while accessing Teshil for district {dist[0]}, Code ({dist[1]}): {e}')
        except Exception as e:
            logging.error(f'Error occurred during Teshil info retrieval for district {dist[0]}: {e}')
except Exception as e:
    logging.critical(f'Error Occured During Teshil Process {e}')

### Start Processing Village Level / Tehsil / District (Third Level) - Add All Village Data to the Respective Parent Node

In [None]:
# Visit every (district, tehsil) page recorded in teshEntry and attach one
# village node per table row to the matching tehsil node under mainRoot.
try:
    for key in teshEntry:
        # key is (district_name, dist_code)
        distMainNode = mainRoot.find_child_by_name(key[0])
        logging.info(f'Start Processing District - {key[0]} for Vill Info')
        for teshilCode in teshEntry[key]:
            # teshilCode is (tesh_code, tesh_name)
            try:
                #find district node and tesh node
                teshMainNode = distMainNode.find_child_by_name(teshilCode[1])
                driver.get(f'https://upbhulekh.gov.in/GeoDashboard/public_report/?distCode={key[1]}&tehsilCode={teshilCode[0]}')
                time.sleep(2)  # fixed pause for the tehsil page to render
                rowVill = driver.find_elements(By.CSS_SELECTOR, 'table tbody tr')
                if not rowVill:
                    logging.warning(f'No villages found for Tehsil {teshilCode[1]} in District {key[0]}.')
                villcounter = 1
                for vill in rowVill:
                    try:
                        villCell = vill.find_elements(By.TAG_NAME, 'td')
                        if villCell:
                            vill_name = villCell[3].text
                            vill_code = villCell[4].text
                            villNode = Node(vill_name, 'village')
                            villNode.properties['id'] = vill_code
                            # Use the tehsil being processed now, not a name left
                            # over in the kernel from an earlier cell's loop.
                            villNode.parent = teshilCode[1]
                            villNode.mark_complete()
                            teshMainNode.add_child(villNode)
                            logging.info(f'District {key[0]} --> Teshil: {teshilCode[1]} --> Village: {vill_name} Vill Code: {vill_code}  --> {villcounter} / {len(rowVill)-1}')
                            villcounter += 1
                    except NoSuchElementException as e:
                        logging.error(f'Element not found in village row for Tehsil {teshilCode[1]}: {e}')
                    except Exception as e:
                        logging.error(f'Error processing village row for Tehsil {teshilCode[1]}: {e}')
                teshMainNode.mark_complete()
                # villcounter starts at 1, so it reaches len(rowVill) once all rows but
                # one were parsed. NOTE(review): presumably one row is a header/total row - confirm.
                if villcounter >= len(rowVill):
                    logging.info(f'Process Completed for District {key[0]} at Teshil Level {teshilCode[1]}')
                else:
                    logging.warning(f'Process not Completed for District {key[0]} at Teshil Level {teshilCode[1]} -- Some issue is present for particular district')
            except TimeoutException as e:
                logging.error(f'Timeout while accessing Tehsil {teshilCode[1]}: {e}')
            except Exception as e:
                logging.error(f'Error occurred during retrieval for Tehsil {teshilCode[1]}: {e}')
        distMainNode.mark_complete()
except Exception as e:
    logging.critical(f'Error Occured During Village Process {e}')

#### Convert MainRoot to JSON, then JSON to a DataFrame

In [None]:
def processRoot(root, filename):
    """Write the direct children of ``root`` to ``filename`` as a JSON array.

    Args:
        root: Object whose ``children`` mapping holds nodes exposing ``to_dict()``.
        filename: Path of the JSON file to create or overwrite (UTF-8).

    Raises:
        Any exception raised by ``to_dict()``, ``open`` or ``json.dump`` is
        propagated to the caller.
    """
    # Serialise before opening the file: if a to_dict() call fails, the target
    # is never opened, so an existing export is not truncated.
    json_list = [district_node.to_dict() for district_node in root.children.values()]

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII names readable in the file
        json.dump(json_list, f, ensure_ascii=False, indent=4)

In [None]:
# Write every child of mainRoot to 'UP Scrap Test 2.json' as a JSON array.
# NOTE(review): the last cell of this notebook reads 'UP Scrap 2ndWebsite.json',
# not this file - confirm which file name is intended.
processRoot(mainRoot, 'UP Scrap Test 2.json')

In [None]:
def jsonToDf(jsonfilepath='checkpoint.json'):
    """Flatten an exported district/tehsil/village JSON file into a DataFrame.

    Each village becomes one row. A tehsil without children yields a single
    row whose village fields are "No Data Found". Districts that have no
    tehsil children produce no rows.

    Args:
        jsonfilepath: Path of the UTF-8 JSON file to read.

    Returns:
        DataFrame with the columns district, district_id, tehsil, tehsil_id,
        village and village_id.
    """
    with open(jsonfilepath, 'r', encoding='utf-8') as handle:
        district_entries = json.load(handle)

    records = []
    for district in district_entries:
        for tehsil_entry in district.get('children', []):
            # Fall back to one placeholder village when the tehsil has none
            village_entries = tehsil_entry.get('children') or [
                {"name": "No Data Found", "properties": {"id": "No Data Found"}}
            ]
            for village_entry in village_entries:
                records.append((
                    district['name'],
                    district['properties']['id'],
                    tehsil_entry['name'],
                    tehsil_entry['properties']['id'],
                    village_entry['name'],
                    village_entry['properties']['id'],
                ))

    column_names = ['district', 'district_id', 'tehsil', 'tehsil_id', 'village', 'village_id']
    return pd.DataFrame(records, columns=column_names)

In [None]:
# Flatten the exported JSON tree into a DataFrame, then save it as a UTF-8 CSV
# without the index column.
# NOTE(review): this reads 'UP Scrap 2ndWebsite.json', but the processRoot call
# earlier in this notebook writes 'UP Scrap Test 2.json' - confirm the intended input file.
resultUp  = jsonToDf('UP Scrap 2ndWebsite.json')
resultUp.to_csv('UP_Scrap_2ndWebsite.csv', encoding='utf-8', index=False)