## Scraping details from each property's webpage

The data obtained from the resultLists scraping is incomplete. However, it contains url for each property. We can use the url to scrape the details of each property.

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
import json
import os
import pandas as pd
import time

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"
output_directory = "../Data/propertyDetails"

# Load DataFrame
df = pd.read_csv("../Data/idsAndUrls.csv")

# Ensure output directory exists
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

In [8]:
# Initialize WebDriver
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)

with open('../Data/scraping_error_ids3.txt', 'r') as f:
    error_ids = f.read().splitlines()

try:
    for i in range(23839, 24000):
        try:
            # Get property details
            property_id = df.iloc[i]['id']
            property_url = df.iloc[i]['url']
            full_url = f"{base_url}{property_url}"

            # Navigate to the URL
            driver.get(full_url)

            # Wait for the JavaScript variable to load
            wait = WebDriverWait(driver, 10)
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            time.sleep(1)

            # Save the JSON data to a file
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

            # time.sleep(1)

        except Exception as e:
            error_ids.append(property_id)
            print(f"Error processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n\n")

finally:
    # Close the driver
    driver.quit()
    print(f"Total Error Ids: {len(error_ids)}\nError Ids: {error_ids}")
    with open('../Data/scraping_error_ids3.txt', 'a') as f:
        error_ids = [str(i) for i in error_ids]
        f.write('\n'.join(error_ids))

Successfully saved data for property ID: 77724291 with iter number: 23839
Successfully saved data for property ID: 77724579 with iter number: 23840
Error processing iteration: 23841 with property ID 77725075: Message: 

URL: https://www.magicbricks.com/propertyDetails/2-BHK-1336-Sq-ft-Multistorey-Apartment-FOR-Sale-Wagholi-in-Pune&id=4d423737373235303735


Successfully saved data for property ID: 77725327 with iter number: 23842
Successfully saved data for property ID: 77725661 with iter number: 23843
Successfully saved data for property ID: 77725699 with iter number: 23844
Successfully saved data for property ID: 77725897 with iter number: 23845
Successfully saved data for property ID: 77726033 with iter number: 23846
Successfully saved data for property ID: 77726081 with iter number: 23847
Successfully saved data for property ID: 77726339 with iter number: 23848
Successfully saved data for property ID: 77726415 with iter number: 23849
Successfully saved data for property ID: 77726533

## Final Scraping re-run for error_ids

In [13]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import json

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"
output_directory = "../Data/propertyDetails"

# Load DataFrame
df = pd.read_csv("../Data/idsAndUrls.csv", index_col='id')

# Ensure output directory exists
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

In [14]:

# Initialize WebDriver
options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

with open('../Data/scraping_error_ids3.txt', 'r') as f:
    error_ids_old = f.read().splitlines()

error_ids_old = [int(i) for i in error_ids_old]
error_ids_old = list(set(error_ids_old))

actual_scraped_ids = [int(id.removesuffix('.json')) for id in os.listdir('../Data/propertyDetails/')]

error_ids = [id for id in error_ids_old if id not in actual_scraped_ids]
error_ids_new = []

In [15]:
len(error_ids)

100

In [16]:

driver = webdriver.Chrome(options=options)

try:
    for i, property_id in enumerate(error_ids):
        try:
            # Get property details
            property_url = df.loc[property_id]['url']
            full_url = f"{base_url}{property_url}"

            # Navigate to the URL
            driver.get(full_url)

            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Parse the page source with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Check if the body tag has class "error"
            body_tag = soup.find('body')
            if body_tag and 'error' in body_tag.get('class', []):
                print(f"\nError page detected for id: {property_id} and iter number: {i}\n")
                continue  # Skip further processing for this URL

            # Wait for the JavaScript variable to load
            wait = WebDriverWait(driver, 10)
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            # Save the JSON data to a file
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

        except Exception as e:
            error_ids_new.append(property_id)
            print(f"\nError processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n")

finally:
    # Close the driver
    driver.quit()
    print(f"Total Error Ids: {len(error_ids_new)}\nError Ids: {error_ids_new}")
    with open('../Data/scraping_error_ids3.txt', 'w') as f:
        error_ids_new = [str(i) for i in error_ids_new]
        f.write('\n'.join(error_ids_new))



Error page detected for id: 77536257 and iter number: 0


Error page detected for id: 77607429 and iter number: 1


Error page detected for id: 77334549 and iter number: 2


Error page detected for id: 77578773 and iter number: 3


Error page detected for id: 77606433 and iter number: 4


Error page detected for id: 77565481 and iter number: 5


Error page detected for id: 77591595 and iter number: 6


Error page detected for id: 77583917 and iter number: 7


Error page detected for id: 77588527 and iter number: 8


Error page detected for id: 77581361 and iter number: 9


Error page detected for id: 77636151 and iter number: 10


Error page detected for id: 77623865 and iter number: 11


Error page detected for id: 77614649 and iter number: 12


Error page detected for id: 77612097 and iter number: 13


Error page detected for id: 77432907 and iter number: 14

Successfully saved data for property ID: 77562957 with iter number: 15

Error page detected for id: 77636685 and iter number: