In [1]:
!pip install requests beautifulsoup4 selenium webdriver-manager

Collecting selenium
  Using cached selenium-4.33.0-py3-none-any.whl.metadata (7.5 kB)
Collecting webdriver-manager
  Using cached webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Using cached urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting trio~=0.30.0 (from selenium)
  Using cached trio-0.30.0-py3-none-any.whl.metadata (8.5 kB)
Collecting trio-websocket~=0.12.2 (from selenium)
  Using cached trio_websocket-0.12.2-py3-none-any.whl.metadata (5.1 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Collecting typing_extensions~=4.13.2 (from selenium)
  Using cached typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting attrs>=23.2.0 (from trio~=0.30.0->selenium)
  Using cached attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.30.0->selenium)
  Using cached outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (

In [2]:
import requests
from bs4 import BeautifulSoup
import time
import json
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import csv
import pandas as pd

In [3]:
class OdishaRERAScraper:
    def __init__(self):
        self.base_url = "https://rera.odisha.gov.in"
        self.projects_url = "https://rera.odisha.gov.in/projects/project-list"
        self.session = requests.Session()
        self.driver = None
        
    def setup_driver(self):
        """Setup Chrome WebDriver with options"""
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in background
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        
        service = Service(ChromeDriverManager().install())
        self.driver = webdriver.Chrome(service=service, options=chrome_options)
        return self.driver
    
    def get_project_list(self):
        """Get the list of first 6 projects from the main page"""
        try:
            self.setup_driver()
            self.driver.get(self.projects_url)
            
            # Wait for the page to load
            wait = WebDriverWait(self.driver, 20)
            
            # Look for project table or list
            try:
                # Wait for table to load (adjust selector based on actual HTML structure)
                table = wait.until(EC.presence_of_element_located((By.TAG_NAME, "table")))
                
                # Find all "View Details" links or project rows
                view_details_links = self.driver.find_elements(By.XPATH, "//a[contains(text(), 'View Details')]")
                
                projects = []
                for i, link in enumerate(view_details_links[:6]):  # Get first 6 projects
                    try:
                        project_url = link.get_attribute('href')
                        # Get RERA registration number from the same row
                        row = link.find_element(By.XPATH, "./ancestor::tr")
                        cells = row.find_elements(By.TAG_NAME, "td")
                        
                        rera_no = cells[1].text.strip() if len(cells) > 1 else "N/A"
                        project_name = cells[2].text.strip() if len(cells) > 2 else "N/A"
                        
                        projects.append({
                            'rera_no': rera_no,
                            'project_name': project_name,
                            'detail_url': project_url
                        })
                        
                    except Exception as e:
                        print(f"Error extracting project {i+1}: {str(e)}")
                        continue
                
                return projects
                
            except Exception as e:
                print(f"Error finding project table: {str(e)}")
                # Try alternative approach - look for any links containing project details
                page_source = self.driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                
                # Look for project links (adjust based on actual HTML structure)
                project_links = soup.find_all('a', href=True)
                projects = []
                
                for link in project_links[:6]:
                    if 'project' in link.get('href', '').lower():
                        projects.append({
                            'rera_no': 'N/A',
                            'project_name': link.text.strip(),
                            'detail_url': self.base_url + link['href'] if not link['href'].startswith('http') else link['href']
                        })
                
                return projects
                
        except Exception as e:
            print(f"Error getting project list: {str(e)}")
            return []
    
    def get_project_details(self, project_url):
        """Extract detailed information from project detail page"""
        try:
            self.driver.get(project_url)
            wait = WebDriverWait(self.driver, 15)
            
            # Wait for page to load
            time.sleep(3)
            
            details = {}
            
            # Try to find and click "Promoter Details" tab
            try:
                promoter_tab = self.driver.find_element(By.XPATH, "//a[contains(text(), 'Promoter Details') or contains(text(), 'Promoter')]")
                promoter_tab.click()
                time.sleep(2)
            except:
                print("Promoter Details tab not found, continuing...")
            
            # Get page source and parse with BeautifulSoup
            page_source = self.driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')
            
            # Extract promoter company name
            company_elements = soup.find_all(text=lambda text: text and 'company' in text.lower())
            details['promoter_name'] = "N/A"
            for elem in company_elements:
                parent = elem.parent
                if parent and parent.next_sibling:
                    details['promoter_name'] = parent.next_sibling.strip()
                    break
            
            # Extract registered office address
            address_keywords = ['registered office', 'address', 'office address']
            details['promoter_address'] = "N/A"
            
            for keyword in address_keywords:
                address_elements = soup.find_all(text=lambda text: text and keyword in text.lower())
                for elem in address_elements:
                    parent = elem.parent
                    if parent and parent.next_sibling:
                        details['promoter_address'] = parent.next_sibling.strip()
                        break
                if details['promoter_address'] != "N/A":
                    break
            
            # Extract GST number
            gst_elements = soup.find_all(text=lambda text: text and 'gst' in text.lower())
            details['gst_no'] = "N/A"
            for elem in gst_elements:
                parent = elem.parent
                if parent and parent.next_sibling:
                    gst_text = parent.next_sibling.strip()
                    if len(gst_text) == 15 and gst_text.isalnum():  # GST format check
                        details['gst_no'] = gst_text
                        break
            
            # Alternative approach: look for table data
            tables = soup.find_all('table')
            for table in tables:
                rows = table.find_all('tr')
                for row in rows:
                    cells = row.find_all(['td', 'th'])
                    if len(cells) >= 2:
                        label = cells[0].text.strip().lower()
                        value = cells[1].text.strip()
                        
                        if 'company' in label or 'promoter' in label:
                            details['promoter_name'] = value
                        elif 'address' in label and 'office' in label:
                            details['promoter_address'] = value
                        elif 'gst' in label:
                            details['gst_no'] = value
            
            return details
            
        except Exception as e:
            print(f"Error getting project details from {project_url}: {str(e)}")
            return {
                'promoter_name': 'N/A',
                'promoter_address': 'N/A',
                'gst_no': 'N/A'
            }
    
    def scrape_projects(self):
        """Main method to scrape all project data"""
        try:
            print("Starting to scrape Odisha RERA projects...")
            
            # Get list of projects
            projects = self.get_project_list()
            print(f"Found {len(projects)} projects")
            
            if not projects:
                print("No projects found. The website structure might have changed.")
                return []
            
            # Get detailed information for each project
            detailed_projects = []
            
            for i, project in enumerate(projects, 1):
                print(f"Processing project {i}/{len(projects)}: {project['project_name']}")
                
                details = self.get_project_details(project['detail_url'])
                
                complete_project = {
                    'rera_regd_no': project['rera_no'],
                    'project_name': project['project_name'],
                    'promoter_name': details['promoter_name'],
                    'promoter_address': details['promoter_address'],
                    'gst_no': details['gst_no']
                }
                
                detailed_projects.append(complete_project)
                
                # Add delay to be respectful to the server
                time.sleep(2)
            
            return detailed_projects
            
        except Exception as e:
            print(f"Error in scraping process: {str(e)}")
            return []
        
        finally:
            if self.driver:
                self.driver.quit()

In [4]:
scraper = OdishaRERAScraper()
projects = scraper.scrape_projects()

Starting to scrape Odisha RERA projects...
Error finding project table: Message: 
Stacktrace:
	GetHandleVerifier [0x003CFC03+61635]
	GetHandleVerifier [0x003CFC44+61700]
	(No symbol) [0x001F05D3]
	(No symbol) [0x0023899E]
	(No symbol) [0x00238D3B]
	(No symbol) [0x00280E12]
	(No symbol) [0x0025D2E4]
	(No symbol) [0x0027E61B]
	(No symbol) [0x0025D096]
	(No symbol) [0x0022C840]
	(No symbol) [0x0022D6A4]
	GetHandleVerifier [0x00654523+2701795]
	GetHandleVerifier [0x0064FCA6+2683238]
	GetHandleVerifier [0x0066A9EE+2793134]
	GetHandleVerifier [0x003E68C5+155013]
	GetHandleVerifier [0x003ECFAD+181357]
	GetHandleVerifier [0x003D7458+92440]
	GetHandleVerifier [0x003D7600+92864]
	GetHandleVerifier [0x003C1FF0+5296]
	BaseThreadInitThunk [0x758F7BA9+25]
	RtlInitializeExceptionChain [0x7767C0CB+107]
	RtlClearBits [0x7767C04F+191]

Found 0 projects
No projects found. The website structure might have changed.


In [5]:
if projects:
    print(f"\nSuccessfully scraped {len(projects)} projects:")
    print("-" * 80)
    
    for i, project in enumerate(projects, 1):
        print(f"{i}. RERA No: {project['rera_regd_no']}")
        print(f"   Project Name: {project['project_name']}")
        print(f"   Promoter Name: {project['promoter_name']}")
        print(f"   Promoter Address: {project['promoter_address']}")
        print(f"   GST No: {project['gst_no']}")
        print("-" * 80)
    
    # Convert to DataFrame for better display
    df = pd.DataFrame(projects)
    display(df)
    
    # Save to CSV
    df.to_csv('rera_projects_jupyter.csv', index=False)
    print("\nData saved to 'rera_projects_jupyter.csv'")
else:
    print("No projects were scraped successfully.")

No projects were scraped successfully.
