In [None]:
import json
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoAlertPresentException
from selenium.webdriver.common.alert import Alert
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait, Select

## Declare the Node Class to Store Data

In [None]:
class Node:
    """A named entry in a tree of scraped locations.

    Attributes:
        name: Label of the node; also the key under which a parent stores it.
        level: Free-form tier label (defaults to 'Root').
        parent: Whatever the caller passes or assigns; not used by this class.
        properties: Dict of extra values attached to the node.
        children: Dict mapping child name -> child Node.
        completed: Flag flipped by mark_complete().
        remrks: Optional remark, exported under the "remarks" key by to_dict().
    """

    def __init__(self, name, level='Root', parent=None):
        self.name = name
        self.level = level
        self.parent = parent
        self.completed = False
        # Attribute spelling is kept as-is; to_dict() exposes it as "remarks".
        self.remrks = None
        self.properties = dict()
        self.children = dict()

    def mark_complete(self):
        """Flag this node as fully processed."""
        self.completed = True

    def add_child(self, child_node):
        """Store child_node under its name, replacing any child with the same name."""
        self.children.update({child_node.name: child_node})

    def find_child_by_name(self, name):
        """Return the direct child called `name`, or None if there is none."""
        return self.children.get(name)

    def to_dict(self):
        """Return this node and its descendants as nested plain dicts.

        The "children" key is present only when the node has at least one child.
        """
        node_dict = dict(
            name=self.name,
            level=self.level,
            properties=self.properties,
            completed=self.completed,
            remarks=self.remrks,
        )
        nested = [child.to_dict() for child in self.children.values()]
        if nested:
            node_dict["children"] = nested
        return node_dict

    def __str__(self):
        return self.name

## Set Up the WebDriver and Fetch All District Info (First Level)

In [None]:


# Start a browser session and navigate to the page that holds the district list.
# `driver` is intentionally left open: the cells below keep using this session.
driver = webdriver.ChromiumEdge()  # or webdriver.Firefox()
try:
    # Open the main page
    driver.get("https://upbhulekh.gov.in/")

    # Wait for the page to load
    time.sleep(5)

    # Dismiss a start-up alert if one is showing. Only the "no alert present"
    # case is ignored; any other failure (or a Ctrl-C) is no longer swallowed
    # and is reported by the outer handler instead.
    try:
        Alert(driver).accept()
    except NoAlertPresentException:
        pass

    # Wait for the specific element to be clickable
    real_time_link = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.XPATH, "//div[@class='feature-content']/h3[text()='रियल टाइम खतौनी की नक़ल देखे']"))
    )

    # Click the element
    real_time_link.click()

    time.sleep(2)

    # Read the captcha text from the value attribute of element #txtCaptcha1
    captcha_text = driver.find_element(By.ID, "txtCaptcha1").get_attribute("value")
    print(f"Captcha text: {captcha_text}")

    # Enter the captcha into the input field
    captcha_input = driver.find_element(By.ID, "Captcha1")
    captcha_input.send_keys(captcha_text)

    # Submit the form through the validation button
    submitButton = driver.find_element(By.ID,  "btnValid")
    submitButton.submit()

except Exception as e:
    # Report and continue so the notebook keeps running; the session stays open.
    print(f'Error - {e}')

In [None]:

# Declare the root node; one child node per district is attached to it below.
mainRoot = Node('MainRoot')
districts = driver.find_elements(By.XPATH, "//ul[@id='district']/li/a")

# Walk the district links, keeping each link's data-value as the node id.
for district in districts:
    district_name = district.find_element(By.CLASS_NAME, "text").text
    district_value = district.get_attribute("data-value")
    distNode = Node(name=district_name, level='district')
    distNode.properties.update(id=district_value)
    mainRoot.add_child(distNode)
    print(f"District: {district_name}, Data-Value: {district_value}")

### Start Processing Tehsil Level / District (Second Level)
#### Start Processing Village Level / Tehsil / District (Third Level) - Add All Village Data to the Respective Parent Node

In [None]:
# Second and third level: for every district, click it and record its tehsils,
# then click each tehsil and record its villages under that tehsil's node.
# Reuses `districts` (WebElements) and `mainRoot` (Node tree) built by an earlier cell.
# NOTE: a single try/except wraps the whole walk, so the first exception raised
# anywhere stops processing of all remaining districts; nodes that were already
# added stay in mainRoot, and their `completed` flags show how far the walk got.
try:
    for district in districts:
        district_name = district.find_element(By.CLASS_NAME, "text").text
        # Look up the district node by name. find_child_by_name returns None for an
        # unknown name, in which case the later add_child / mark_complete calls on
        # it raise AttributeError (caught by the outer handler).
        parentdisTrictRoot = mainRoot.find_child_by_name(district_name)
        district.click()
        # Fixed pause after the click — presumably to let the tehsil list refresh
        # (TODO confirm; no explicit wait condition is used).
        time.sleep(2)
        teshil = driver.find_elements(By.XPATH, "//ul[@id='tehsil']/li/a")
        for th in teshil:
            teshil_name = th.find_element(By.CLASS_NAME, "text").text
            teshil_value = th.get_attribute("data-value")
            print(f"Teshil: {teshil_name}, Data-Value: {teshil_value}")
            teshNode = Node(teshil_name, 'teshil')
            teshNode.properties['id'] = teshil_value
            # parent is stored as the district's name (a string), not as a Node.
            teshNode.parent = district_name
            parentdisTrictRoot.add_child(teshNode)

            # Now descend to the village level for this tehsil.
            th.click()
            # Fixed pause — presumably to let the village list refresh (TODO confirm).
            time.sleep(3)
            villages = driver.find_elements(By.XPATH, "//ul[@id='village']/li/a")
            # Record every village listed for this tehsil.
            for village in villages:
                village_name = village.find_element(By.CLASS_NAME, "text").text
                # Unlike district/tehsil ids (read from data-value), the village id is
                # the text of the SECOND element with class "text" inside the link.
                # NOTE(review): raises IndexError if a link has fewer than two such
                # elements — confirm against the page markup.
                village_value = village.find_elements(By.CLASS_NAME, "text")[1].text
                print(f"Village: {village_name}, Data-Value: {village_value}")
                villNode = Node(village_name, 'village')
                villNode.properties['id'] = village_value
                # parent is stored as the tehsil's name (a string), not as a Node.
                villNode.parent = teshil_name
                # Children are keyed by name, so a repeated village name within one
                # tehsil replaces the earlier entry.
                teshNode.add_child(villNode)
                villNode.mark_complete()
            # All villages of this tehsil were read without an exception.
            teshNode.mark_complete()
        # All tehsils of this district were read without an exception.
        parentdisTrictRoot.mark_complete()
except Exception as e:
    print(f'Error Occured During Whole Process {e}')

#### Convert MainRoot to JSON, then JSON to a DataFrame

In [None]:
def processRoot(root, filename):
    """Write the direct children of `root` to `filename` as a JSON array.

    Each child is converted with its ``to_dict()`` method; the root itself is
    not included in the output. Non-ASCII text is written as-is (UTF-8) and the
    array is indented by 4 spaces.

    The JSON text is built completely before the file is opened, so a
    conversion or serialization error leaves an existing file untouched
    instead of truncating it or leaving it half-written.

    Args:
        root: Object whose ``children`` attribute is a dict of nodes, each
            providing ``to_dict()``.
        filename: Path of the JSON file to create or overwrite.

    Raises:
        TypeError: If a converted child contains a value json cannot serialize.
        OSError: If the file cannot be opened or written.
    """
    # One dict per top-level child, in the dict's insertion order.
    json_list = [child.to_dict() for child in root.children.values()]

    # Serialize first; only touch the file once the payload is known to be valid.
    payload = json.dumps(json_list, ensure_ascii=False, indent=4)

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)

In [None]:
# Dump every district subtree collected under mainRoot to a JSON file.
processRoot(root=mainRoot, filename='UP Scrap Test.json')

In [None]:
def jsonToDf(jsonfilepath='checkpoint.json'):
    """Load a district/tehsil/village JSON tree and flatten it to a DataFrame.

    The file must hold a list of district dicts, each with "name",
    "properties" -> "id" and optionally "children" (tehsils); tehsils follow the
    same shape with villages as their children.

    One row is produced per village. A tehsil with no (or empty) "children"
    yields a single row whose village columns read "No Data Found". A district
    without tehsils yields no rows.

    Args:
        jsonfilepath: Path of the UTF-8 JSON file to read.

    Returns:
        DataFrame with columns district, district_id, tehsil, tehsil_id,
        village, village_id.
    """
    with open(jsonfilepath, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Stand-in village used when a tehsil lists no villages.
    missing = {"name": "No Data Found", "properties": {"id": "No Data Found"}}

    rows = []
    for dist in data:
        for tehsil in dist.get('children', []):
            village_list = tehsil.get('children') or [missing]
            for village in village_list:
                rows.append((
                    dist['name'],
                    dist['properties']['id'],
                    tehsil['name'],
                    tehsil['properties']['id'],
                    village['name'],
                    village['properties']['id'],
                ))

    header = ['district', 'district_id', 'tehsil', 'tehsil_id', 'village', 'village_id']
    df = pd.DataFrame(rows, columns=header)

    return df

In [None]:
# Read back the same JSON file that the processRoot(...) cell writes, so the
# notebook runs end-to-end on a fresh kernel, then export the flat table to CSV.
resultUp = jsonToDf('UP Scrap Test.json')
resultUp.to_csv('UP_Scrap_1stWebsite.csv', encoding='utf-8', index=False)