### Code to extract query parameters from URL

In Chrome, I opened the MagicBricks site and searched for properties with the desired filters (city, budget, sorting by price, etc.). Then, by inspecting the Network tab in DevTools, I found the API the site uses to request property details in JSON form. I used the same parameters to make requests to that API.

In [1]:
from urllib.parse import urlparse, parse_qs

# Search URL captured from the browser's network tab
url = "https://www.magicbricks.com/mbsrp/propertySearch.html?editSearch=Y&category=S&propertyType=10002&budgetMax=45000000&bedrooms=11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710&city=4378&page=2&sortBy=Highest_Price&postedSince=-1&pType=10002&isNRI=N&multiLang=en"

# Break the URL into its components; only the query string is needed below
parsed_url = urlparse(url)

# parse_qs maps every key to a *list* of values, so unwrap the lists that
# hold exactly one value and keep genuine multi-value lists untouched
query_params = {}
for name, values in parse_qs(parsed_url.query).items():
    query_params[name] = values if len(values) != 1 else values[0]

query_params


{'editSearch': 'Y',
 'category': 'S',
 'propertyType': '10002',
 'budgetMax': '45000000',
 'bedrooms': '11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710',
 'city': '4378',
 'page': '2',
 'sortBy': 'Highest_Price',
 'postedSince': '-1',
 'pType': '10002',
 'isNRI': 'N',
 'multiLang': 'en'}

## Scraping resultLists to get property details in bulk

Once the result lists are scraped, the URL of each unique property is collected into a list. That list is then used to scrape each property's details page separately.

In [1]:
import os
import time
import json
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

# Append scraping errors (ERROR level and above only) to a log file
log_settings = {
    'filename': 'scraping_errors.log',
    'filemode': 'a',
    'level': logging.ERROR,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**log_settings)

# Chrome launch configuration. Headless mode is deliberately NOT enabled
# here; add "--headless" to the flags below to run without a browser window.
chrome_options = Options()
for chrome_flag in ('--no-sandbox', '--disable-gpu'):
    chrome_options.add_argument(chrome_flag)
# Use Selenium's "eager" page-load strategy for every driver.get() call
chrome_options.page_load_strategy = "eager"


In [8]:
# Search endpoint and the query parameters sent with every request.
# HTTPS is used so the requests match the URL captured from the browser
# instead of relying on an http -> https redirect.
base_url = "https://www.magicbricks.com/mbsrp/propertySearch.html"

query_params = {
    'editSearch': 'Y',
    'category': 'S',
    'propertyType': '10002',
    'budgetMax': '10,00,000',  # written with digit grouping for readability
    'bedrooms': '11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710',
    'city': '4378',
    'page': '1',
    'sortBy': 'Highest_Price',
    'postedSince': '-1',
    'pType': '10002',
    'isNRI': 'N',
    'multiLang': 'en',
}
# The captured request sends budgetMax as a plain number, so strip the
# grouping commas before the value is used in a URL
query_params['budgetMax'] = query_params['budgetMax'].replace(',', '')
query_params

{'editSearch': 'Y',
 'category': 'S',
 'propertyType': '10002',
 'budgetMax': '1000000',
 'bedrooms': '11700,11701,11702,11703,11704,11705,11706,11707,11708,11709,11710',
 'city': '4378',
 'page': '1',
 'sortBy': 'Highest_Price',
 'postedSince': '-1',
 'pType': '10002',
 'isNRI': 'N',
 'multiLang': 'en'}

In [None]:
# Make sure the folder that receives the result-list JSON files exists
result_lists_dir = '../Data/resultLists'
os.makedirs(result_lists_dir, exist_ok=True)

In [None]:
from urllib.parse import urlencode

# Error tracking
failed_attempts = 0
MAX_FAILED_ATTEMPTS = 5  # consecutive failures tolerated before giving up
MAX_PAGES = 50  # upper bound on the number of requests made in one run

# Page number requested next; only advanced after a successful save, so a
# failed page is retried on the following iteration
page = 1

# Initialize the WebDriver
driver = webdriver.Chrome(options=chrome_options)

# The outer try/finally guarantees the browser is closed even if the loop is
# interrupted (e.g. KeyboardInterrupt) or fails in an unexpected way.
try:
    for _ in range(MAX_PAGES):
        try:
            # Update page number in query parameters
            query_params['page'] = str(page)

            # Build the query string with proper escaping; commas are kept
            # literal because the captured request sends them unescaped
            # (e.g. in the 'bedrooms' list)
            url_with_params = f"{base_url}?{urlencode(query_params, safe=',')}"

            # Navigate to the URL
            driver.get(url_with_params)

            # Give the page a moment to finish rendering the JSON body
            time.sleep(3)  # Adjust the sleep time if necessary

            # The endpoint returns raw JSON, which Chrome renders as the
            # page's body text
            json_content = driver.execute_script("return document.body.innerText;")

            # Parse the JSON content (raises on a non-JSON page, which is
            # handled below as a failed attempt)
            response_json = json.loads(json_content)

            # Save JSON data
            file_name = f"../Data/resultLists/list10LakhMinus{page}.json"
            with open(file_name, 'w', encoding='utf-8') as json_file:
                json.dump(response_json, json_file, ensure_ascii=False, indent=4)

            print(f"Saved: {file_name}, with page={page}")

            # Success: reset the consecutive-failure counter and move on
            failed_attempts = 0
            page += 1

        except Exception as e:
            # Log the error and count it towards the consecutive-failure limit
            logging.error(f"Failed for page={page} with error: {e}")
            failed_attempts += 1

            # If too many failures in a row, stop scraping
            if failed_attempts >= MAX_FAILED_ATTEMPTS:
                print("Too many consecutive failures. Stopping the scraping process.")
                break
finally:
    # Close the WebDriver
    driver.quit()

Saved: Data/resultLists/list10LakhMinus1.json, with page=1
Saved: Data/resultLists/list10LakhMinus2.json, with page=2
Saved: Data/resultLists/list10LakhMinus3.json, with page=3
Saved: Data/resultLists/list10LakhMinus4.json, with page=4
Saved: Data/resultLists/list10LakhMinus5.json, with page=5
Saved: Data/resultLists/list10LakhMinus6.json, with page=6
Saved: Data/resultLists/list10LakhMinus7.json, with page=7
Saved: Data/resultLists/list10LakhMinus8.json, with page=8
Saved: Data/resultLists/list10LakhMinus9.json, with page=9
Saved: Data/resultLists/list10LakhMinus10.json, with page=10
Saved: Data/resultLists/list10LakhMinus11.json, with page=11
Saved: Data/resultLists/list10LakhMinus12.json, with page=12
Saved: Data/resultLists/list10LakhMinus13.json, with page=13
Saved: Data/resultLists/list10LakhMinus14.json, with page=14
Saved: Data/resultLists/list10LakhMinus15.json, with page=15
Saved: Data/resultLists/list10LakhMinus16.json, with page=16
Saved: Data/resultLists/list10LakhMinus17.

### Code to check for number of unique listings

In [None]:
import os
import json

# Directory where JSON files are saved
data_directory = '../Data/resultLists'

# A full result page is expected to hold this many listings
EXPECTED_LISTINGS_PER_PAGE = 30

# Initialize variables
all_ids = []  # Unique property IDs, in first-seen order
file_issues = []  # (file name, listing count or error text) for odd files
duplicate_ids = set()  # IDs seen more than once

all_urls = []  # Unique property URLs, in first-seen order
duplicate_urls = set()  # URLs seen more than once

# Sets mirroring all_ids / all_urls so each membership test is O(1) instead
# of a scan over the whole list
seen_ids = set()
seen_urls = set()

# Number of .json files actually visited (other directory entries are ignored)
json_files_processed = 0

# Process each JSON file
for file_name in sorted(os.listdir(data_directory)):
    if not file_name.endswith('.json'):
        continue

    json_files_processed += 1
    file_path = os.path.join(data_directory, file_name)

    try:
        # Read the JSON file
        with open(file_path, 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)

        # Extract resultList
        result_list = data.get('resultList', [])

        # Record pages that are not completely filled
        if len(result_list) != EXPECTED_LISTINGS_PER_PAGE:
            file_issues.append((file_name, len(result_list)))

        # Only listings that carry a price are considered
        priced_listings = [
            property_data for property_data in result_list
            if property_data.get('price') is not None
        ]

        # Extract IDs and check for duplicates
        ids_in_file = [property_data['id'] for property_data in priced_listings]
        for property_id in ids_in_file:
            if property_id in seen_ids:
                duplicate_ids.add(property_id)
            else:
                seen_ids.add(property_id)
                all_ids.append(property_id)

        # Extract URLs and check for duplicates
        urls_in_file = [property_data['url'] for property_data in priced_listings]
        for property_url in urls_in_file:
            if property_url in seen_urls:
                duplicate_urls.add(property_url)
            else:
                seen_urls.add(property_url)
                all_urls.append(property_url)

    except Exception as e:
        # Best-effort report: note the problem and carry on with other files
        file_issues.append((file_name, f"Error reading file: {e}"))

# Check for duplicates
duplicate_ids_list = list(duplicate_ids)
duplicate_urls_list = list(duplicate_urls)

# Report results
print(f"Total JSON files processed: {json_files_processed}")
print(f"Files with issues (not 30 listings or errors): {file_issues}")
print(f"Total unique IDs collected: {len(all_ids)}")
print(f"Duplicate IDs found: {len(duplicate_ids_list)}")

print(f"Total unique URLs collected: {len(all_urls)}")
print(f"Duplicate URLs found: {len(duplicate_urls_list)}")

if duplicate_ids_list:
    print("Duplicate IDs:")
    # print(duplicate_ids_list)  # uncomment to list them
else:
    print("No duplicate IDs found.")

if duplicate_urls_list:
    print("Duplicate URLs:")
    # print(duplicate_urls_list)  # uncomment to list them
else:
    print("No duplicate URLs found.")


Total JSON files processed: 1275
Files with issues (not 30 listings or errors): [('list10LakhMinus10.json', 16), ('list10LakhMinus11.json', 0), ('list10LakhMinus12.json', 0), ('list10LakhMinus13.json', 0), ('list10LakhMinus14.json', 0), ('list10LakhMinus15.json', 0), ('list10LakhMinus16.json', 0), ('list10LakhMinus17.json', 0), ('list10LakhMinus18.json', 0), ('list10LakhMinus19.json', 0), ('list10LakhMinus20.json', 0), ('list10LakhMinus21.json', 0), ('list10LakhMinus22.json', 0), ('list10LakhMinus23.json', 0), ('list10LakhMinus24.json', 0), ('list10LakhMinus25.json', 0), ('list10LakhMinus26.json', 0), ('list10LakhMinus27.json', 0), ('list10LakhMinus28.json', 0), ('list10LakhMinus29.json', 0), ('list10LakhMinus30.json', 0), ('list10LakhMinus31.json', 0), ('list10LakhMinus32.json', 0), ('list10LakhMinus33.json', 0), ('list10LakhMinus34.json', 0), ('list10LakhMinus35.json', 0), ('list10LakhMinus36.json', 0), ('list10LakhMinus37.json', 0), ('list10LakhMinus38.json', 0), ('list10LakhMinus39

### Saving unique listings separately

In [None]:
import os
import json

# Directories
input_directory = '../Data/resultLists'  # Replace with your input directory path
unique_json_directory = '../Data/uniqueResults'  # Directory to save unique JSONs
# NOTE(review): this path is defined but nothing in this cell writes to it
combined_json_file = '../Data/uniqueResultsCombined.json'  # Path for combined JSON

# Ensure output directories exist
os.makedirs(unique_json_directory, exist_ok=True)

# IDs of the properties already written, in first-seen order
unique_properties = []
# Same IDs as a set, so the "already saved?" test is O(1) per listing
seen_property_ids = set()

# Process each JSON file in the input directory. The listing is sorted so
# that, when an ID occurs in several files, the copy that gets saved is the
# same on every run.
for filename in sorted(os.listdir(input_directory)):
    if not filename.endswith('.json'):
        continue

    file_path = os.path.join(input_directory, filename)
    with open(file_path, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {filename}: {e}")
            continue

    for listing in data.get('resultList', []):
        property_id = listing.get('id')
        # Skip listings without a usable ID and ones already saved
        if not property_id or property_id in seen_property_ids:
            continue

        seen_property_ids.add(property_id)
        unique_properties.append(property_id)

        # Save this property as a separate JSON file named after its ID
        unique_file_path = os.path.join(unique_json_directory, f"{property_id}.json")
        with open(unique_file_path, 'w', encoding='utf-8') as unique_file:
            json.dump(listing, unique_file, indent=4)

print(f"Processing complete. Unique JSONs saved to {unique_json_directory}.")


Processing complete. Unique JSONs saved to Data/uniqueResults.


### Extracting list of property ids and urls for further scraping

In [None]:
import json
import pandas as pd

# Folder holding one JSON file per unique property, and the CSV to produce
input_directory = '../Data/uniqueResults'
csv_output_file = '../Data/idsAndUrls.csv'  # Replace with the desired CSV file path

# One {'id', 'url'} record per property file
csv_data = []
json_names = [name for name in os.listdir(input_directory) if name.endswith('.json')]
for json_name in json_names:
    with open(os.path.join(input_directory, json_name), 'r', encoding='utf-8') as handle:
        listing = json.load(handle)
    csv_data.append({key: listing[key] for key in ('id', 'url')})

# Write the records out as a two-column CSV (no index column)
df = pd.DataFrame(csv_data)
df.to_csv(csv_output_file, index=False)

print(f"CSV file with id and url has been created at {csv_output_file}.")


CSV file with id and url has been created at Data/idsAndUrls.csv.


## Scraping details from each property's webpage

The data obtained from the resultLists scraping is incomplete. However, it contains the URL of each property, which we can use to scrape that property's full details.

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
import json
import os
import pandas as pd
import time

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"
output_directory = "../Data/propertyDetails"

# Load the id/url table (one row per property)
df = pd.read_csv("../Data/idsAndUrls.csv")

# Ensure output directory exists. exist_ok=True creates it only when missing
# and avoids the race between a separate existence check and the creation.
os.makedirs(output_directory, exist_ok=True)

In [13]:
# Initialize WebDriver options
options = webdriver.ChromeOptions()
# options.add_argument("--headless")  # Run in headless mode
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

ERROR_IDS_FILE = '../Data/scraping_error_ids1.txt'
START_INDEX = 28000  # Row of df to start from (resume point of this run)

# IDs that failed in earlier runs. The file does not exist before the first
# failure has been recorded, so a missing file simply means "no errors yet".
# This is read before the browser is started so a read problem cannot leave
# a running driver behind.
try:
    with open(ERROR_IDS_FILE, 'r') as f:
        error_ids = [line.strip() for line in f if line.strip()]
except FileNotFoundError:
    error_ids = []

driver = webdriver.Chrome(options=options)

try:
    # One wait object is enough; it is reused for every page
    wait = WebDriverWait(driver, 10)

    # Valid positional indices are 0 .. len(df) - 1
    for i in range(START_INDEX, len(df)):
        # Resolved outside the inner try so the error handler always reports
        # the ID and URL of the row that actually failed
        property_id = df.iloc[i]['id']
        property_url = df.iloc[i]['url']
        full_url = f"{base_url}{property_url}"

        try:
            # Navigate to the URL
            driver.get(full_url)

            # Poll until the page's preloaded-state JavaScript variable holds
            # a truthy value (raises a timeout error after 10 seconds)
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            # Save the JSON data to a file named after the property ID
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            # Short pause between requests
            time.sleep(0.5)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

        except Exception as e:
            error_ids.append(property_id)
            print(f"Error processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n\n")

finally:
    # Close the driver
    driver.quit()
    print(f"Total Error Ids: {len(error_ids)}\nError Ids: {error_ids}")
    # error_ids already contains the previously recorded IDs plus the new
    # ones, so the file is rewritten (not appended to). Appending the whole
    # list would duplicate the old IDs and could fuse the file's last line
    # with the first ID written. Duplicates are dropped, order is kept.
    error_ids = list(dict.fromkeys(str(error_id) for error_id in error_ids))
    with open(ERROR_IDS_FILE, 'w') as f:
        f.write('\n'.join(error_ids))

Successfully saved data for property ID: 78486779 with iter number: 28000
Successfully saved data for property ID: 78486793 with iter number: 28001
Successfully saved data for property ID: 78486801 with iter number: 28002
Successfully saved data for property ID: 78486857 with iter number: 28003
Successfully saved data for property ID: 78486871 with iter number: 28004
Successfully saved data for property ID: 78486977 with iter number: 28005
Successfully saved data for property ID: 78486999 with iter number: 28006
Successfully saved data for property ID: 78487059 with iter number: 28007
Successfully saved data for property ID: 78487067 with iter number: 28008
Successfully saved data for property ID: 78487131 with iter number: 28009
Successfully saved data for property ID: 78487255 with iter number: 28010
Successfully saved data for property ID: 78487271 with iter number: 28011
Successfully saved data for property ID: 78487275 with iter number: 28012
Successfully saved data for property I

## Final Scraping re-run for error_ids

In [38]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import os
import json

# pandas is used below but is not among this section's imports; importing it
# here lets the section run on a fresh kernel
import pandas as pd

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"
output_directory = "../Data/propertyDetails"

# Load the id/url table, indexed by property ID so a URL can be looked up
# directly from an ID
df = pd.read_csv("../Data/idsAndUrls.csv", index_col='id')

# Ensure output directory exists. exist_ok=True creates it only when missing
# and avoids the race between a separate existence check and the creation.
os.makedirs(output_directory, exist_ok=True)

In [39]:

# Initialize WebDriver options
options = webdriver.ChromeOptions()

# options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")

# IDs recorded as failed by earlier runs: one ID per line. Blank lines are
# skipped (int('') would raise) and repeated IDs are collapsed.
with open('../Data/scraping_error_ids1.txt', 'r') as f:
    error_ids_old = f.read().splitlines()

error_ids_old = [int(line) for line in error_ids_old if line.strip()]
error_ids_old = list(set(error_ids_old))

# IDs that already have a saved details file ("<id>.json"); any other entry
# in the directory is ignored instead of breaking the int() conversion
actual_scraped_ids = [
    int(file_name.removesuffix('.json'))
    for file_name in os.listdir('../Data/propertyDetails/')
    if file_name.endswith('.json')
]
# Set copy for O(1) membership tests in the filter below
scraped_id_set = set(actual_scraped_ids)

# Failed IDs that still have no details file: these are retried
error_ids = [property_id for property_id in error_ids_old if property_id not in scraped_id_set]
error_ids_new = []

In [40]:
# Number of previously failed IDs that still have no saved details file
len(error_ids)

3

In [41]:

driver = webdriver.Chrome(options=options)

try:
    # One wait object is enough; it is reused for every page
    wait = WebDriverWait(driver, 10)

    for i, property_id in enumerate(error_ids):
        # Reset per iteration so the error handler never reports the URL of a
        # previous property (or hits an unbound name) when the lookup fails
        full_url = None
        try:
            # Look up the property's URL by ID (df is indexed by 'id')
            property_url = df.loc[property_id]['url']
            full_url = f"{base_url}{property_url}"

            # Navigate to the URL
            driver.get(full_url)

            # Wait until the <body> element is present in the DOM
            wait.until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )

            # Parse the page source with BeautifulSoup
            page_source = driver.page_source
            soup = BeautifulSoup(page_source, 'html.parser')

            # Check if the body tag has class "error"
            body_tag = soup.find('body')
            if body_tag and 'error' in body_tag.get('class', []):
                print(f"\nError page detected for id: {property_id} and iter number: {i}\n")
                # NOTE(review): error-page IDs are not added to error_ids_new,
                # so they disappear from the error file when it is rewritten
                # below - confirm this is intended
                continue  # Skip further processing for this URL

            # Poll until the page's preloaded-state JavaScript variable holds
            # a truthy value (raises a timeout error after 10 seconds)
            property_details = wait.until(
                lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
            )

            # Save the JSON data to a file named after the property ID
            output_file = os.path.join(output_directory, f"{property_id}.json")
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(property_details, f, indent=4)

            print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

        except Exception as e:
            error_ids_new.append(property_id)
            print(f"\nError processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n")

finally:
    # Close the driver
    driver.quit()
    print(f"Total Error Ids: {len(error_ids_new)}\nError Ids: {error_ids_new}")
    # Replace the error file's content with the IDs that failed in this run
    with open('../Data/scraping_error_ids1.txt', 'w') as f:
        error_ids_new = [str(i) for i in error_ids_new]
        f.write('\n'.join(error_ids_new))



Error page detected for id: 78062815 and iter number: 0


Error page detected for id: 47400679 and iter number: 1


Error page detected for id: 76571407 and iter number: 2

Total Error Ids: 0
Error Ids: []


In [1]:
import json

# Load the GeoJSON file
input_file = r"Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\Pune_Landmarks_and_Localities.geojson"  # Replace with your file name
with open(input_file, 'r', encoding='utf-8') as file:
    geojson_data = json.load(file)

# Initialize separate lists for points and polygons
points = []
polygons = []

# Sort each feature by geometry type. Features of any other type (or with no
# geometry at all) are left out of both outputs.
for feature in geojson_data['features']:
    # GeoJSON allows "geometry": null for unlocated features, so fall back to
    # an empty mapping instead of indexing into None
    geometry = feature.get('geometry') or {}
    geometry_type = geometry.get('type')
    if geometry_type == "Point":
        points.append(feature)
    elif geometry_type in ("Polygon", "MultiPolygon"):
        polygons.append(feature)

# Create GeoJSON structures for points and polygons
points_geojson = {
    "type": "FeatureCollection",
    "features": points
}

polygons_geojson = {
    "type": "FeatureCollection",
    "features": polygons
}

# Save the points and polygons GeoJSON to separate files
points_output_file = r"Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPoints.json"
polygons_output_file = r"Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPolygons.json"

# Written with the same explicit encoding that is used for reading
with open(points_output_file, 'w', encoding='utf-8') as file:
    json.dump(points_geojson, file, indent=4)

with open(polygons_output_file, 'w', encoding='utf-8') as file:
    json.dump(polygons_geojson, file, indent=4)

print(f"Points saved to {points_output_file}")
print(f"Polygons saved to {polygons_output_file}")


Points saved to Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPoints.json
Polygons saved to Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPolygons.json
