## Scraping details from each property's webpage

The data obtained from the resultLists scraping is incomplete. However, it contains the URL of each property, so we can use these URLs to scrape the full details of each property from its own webpage.

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
import json
import os
import pandas as pd
import time

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"  # prefix joined with each row's `url` value
output_directory = "../Data/propertyDetails"  # one "<property id>.json" file is written here per property

# Load DataFrame (the scraping loop below reads its `id` and `url` columns)
df = pd.read_csv("../Data/idsAndUrls.csv")

# Ensure output directory exists.
# exist_ok=True makes this idempotent and avoids the gap between a separate
# existence check and the actual creation.
os.makedirs(output_directory, exist_ok=True)

In [3]:
# Scrape the preloaded JSON state of every property in one batch of `df` rows
# and save it as "<property id>.json" in `output_directory`. IDs that fail are
# recorded in the error-id file so they can be retried later.

# Initialize WebDriver
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

error_ids_path = '../Data/scraping_error_ids2.txt'

# Load previously recorded failures BEFORE launching Chrome, so a problem with
# this file cannot leave an orphaned browser process behind. A missing file
# simply means there are no earlier failures. Blank lines are ignored.
if os.path.exists(error_ids_path):
    with open(error_ids_path, 'r') as f:
        error_ids = [line.strip() for line in f if line.strip()]
else:
    error_ids = []

# Rows of `df` handled by this run; the end is clamped so the loop never
# indexes past the last row.
batch_start, batch_end = 26000, 28000
batch_end = min(batch_end, len(df))

driver = webdriver.Chrome(options=options)

try:
    # The wait object only holds the driver and the timeout, so one instance
    # can be reused for every page.
    wait = WebDriverWait(driver, 10)

    for i in range(batch_start, batch_end):
        # Reset per iteration so the except-branch never reports values left
        # over from the previous property (or hits an unbound name).
        property_id = None
        full_url = None
        try:
            # Get property details
            row = df.iloc[i]
            property_id = row['id']
            property_url = row['url']
            full_url = f"{base_url}{property_url}"

            # Navigate to the URL
            driver.get(full_url)

            # Wait (up to 10 s) until the JavaScript variable is truthy
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            # Short pause between requests
            time.sleep(1)

            # Save the JSON data to a file
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

        except Exception as e:
            # Only record an ID we actually managed to read from the DataFrame.
            if property_id is not None:
                error_ids.append(property_id)
            print(f"Error processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n\n")

finally:
    # Close the driver
    driver.quit()
    print(f"Total Error Ids: {len(error_ids)}\nError Ids: {error_ids}")
    # `error_ids` already contains everything that was in the file, so the
    # file is rewritten rather than appended to. Appending the whole list
    # would duplicate every old ID and could fuse the file's last line with
    # the first newly written ID.
    error_ids = [str(i) for i in error_ids]
    with open(error_ids_path, 'w') as f:
        f.write('\n'.join(error_ids))
        if error_ids:
            f.write('\n')

Successfully saved data for property ID: 78084989 with iter number: 26000
Successfully saved data for property ID: 78085529 with iter number: 26001
Successfully saved data for property ID: 78085711 with iter number: 26002
Successfully saved data for property ID: 78085785 with iter number: 26003
Successfully saved data for property ID: 78085909 with iter number: 26004
Successfully saved data for property ID: 78086249 with iter number: 26005
Successfully saved data for property ID: 78086261 with iter number: 26006
Successfully saved data for property ID: 78086421 with iter number: 26007
Successfully saved data for property ID: 78086501 with iter number: 26008
Successfully saved data for property ID: 78086513 with iter number: 26009
Successfully saved data for property ID: 78086547 with iter number: 26010
Successfully saved data for property ID: 78086673 with iter number: 26011
Successfully saved data for property ID: 78086771 with iter number: 26012
Successfully saved data for property I

## Final Scraping re-run for error_ids

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import json

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"  # prefix joined with each row's `url` value
output_directory = "../Data/propertyDetails"  # one "<property id>.json" file is written here per property

# Load DataFrame, indexed by property id so a URL can be looked up with df.loc[<id>]
df = pd.read_csv("../Data/idsAndUrls.csv", index_col='id')

# Ensure output directory exists.
# exist_ok=True makes this idempotent and avoids the gap between a separate
# existence check and the actual creation.
os.makedirs(output_directory, exist_ok=True)

In [18]:

# Initialize WebDriver
options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# IDs recorded as failed by earlier runs. Blank lines are skipped so that a
# stray empty line in the file cannot make int() raise.
with open('../Data/scraping_error_ids2.txt', 'r') as f:
    error_ids_old = [int(line) for line in f.read().splitlines() if line.strip()]

# De-duplicate; sorting gives a reproducible processing order.
error_ids_old = sorted(set(error_ids_old))

# IDs that already have a saved "<id>.json" file. Entries that do not follow
# that naming pattern (hidden files, checkpoint folders, ...) are ignored
# instead of crashing int().
actual_scraped_ids = [
    int(file_name.removesuffix('.json'))
    for file_name in os.listdir(output_directory)
    if file_name.endswith('.json') and file_name.removesuffix('.json').isdigit()
]
scraped_id_lookup = set(actual_scraped_ids)  # set => O(1) membership tests

# Only IDs that failed before AND still have no saved file need another attempt.
error_ids = [property_id for property_id in error_ids_old if property_id not in scraped_id_lookup]
error_ids_new = []

In [19]:
# Display how many entries are in `error_ids`.
len(error_ids)

3

In [20]:
# Retry every property ID in `error_ids`: open its page, skip it if the site
# served an error page, otherwise save the preloaded JSON state to
# "<property id>.json" in `output_directory`. IDs that raise are collected in
# `error_ids_new` and written back to the error-id file at the end.

# Start from an empty list so re-running this cell does not carry over IDs
# collected by a previous execution.
error_ids_new = []

driver = webdriver.Chrome(options=options)

try:
    # The wait object only holds the driver and the timeout, so one instance
    # can be reused for every page.
    wait = WebDriverWait(driver, 10)

    for i, property_id in enumerate(error_ids):
        # Reset per iteration so the except-branch never prints the previous
        # property's URL (or hits an unbound name if the lookup below fails).
        full_url = None
        try:
            # Get property details
            property_url = df.loc[property_id, 'url']
            full_url = f"{base_url}{property_url}"

            # Navigate to the URL
            driver.get(full_url)

            wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Parse the page source with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Check if the body tag has class "error"
            body_tag = soup.find('body')
            if body_tag and 'error' in body_tag.get('class', []):
                print(f"\nError page detected for id: {property_id} and iter number: {i}\n")
                # NOTE(review): these IDs are neither saved nor added to
                # `error_ids_new`, so they disappear from the error-id file
                # when it is rewritten below — confirm this is intended.
                continue  # Skip further processing for this URL

            # Wait (up to 10 s) until the JavaScript variable is truthy
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            # Save the JSON data to a file
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

        except Exception as e:
            error_ids_new.append(property_id)
            print(f"\nError processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n")

finally:
    # Close the driver
    driver.quit()
    print(f"Total Error Ids: {len(error_ids_new)}\nError Ids: {error_ids_new}")
    # Overwrite the error-id file with only the IDs that failed in this run.
    with open('../Data/scraping_error_ids2.txt', 'w') as f:
        error_ids_new = [str(i) for i in error_ids_new]
        f.write('\n'.join(error_ids_new))



Error page detected for id: 65319433 and iter number: 0


Error page detected for id: 77165277 and iter number: 1


Error page detected for id: 75570319 and iter number: 2

Total Error Ids: 0
Error Ids: []
