### Code to extract query parameters from URL

In Chrome, I opened the magicbricks site and searched for the properties with desired filters like city, budget, sorting by price, etc. Then, after inspecting the network tab in dev tools, I found the API they use to request property details in JSON form. I used the same parameters to make requests to the API.

In [None]:
from urllib.parse import urlparse, parse_qs

# Search URL captured from the browser's network tab
url = "https://www.magicbricks.com/mbsrp/propertySearch.html?editSearch=Y&category=S&propertyType=10002,10003,10021,10022&budgetMax=45000000&city=4378&page=2&sortBy=Highest_Price&postedSince=-1&pType=10002,10003,10021,10022&isNRI=N&multiLang=en"

# Break the URL into its components; only the query string is used below
parsed_url = urlparse(url)

# parse_qs maps every key to a *list* of values
raw_params = parse_qs(parsed_url.query)

# Unwrap single-element lists so ordinary parameters become plain strings;
# keys that occur more than once keep their full list of values
query_params = dict(
    (name, values[0] if len(values) == 1 else values)
    for name, values in raw_params.items()
)

# Show the resulting dictionary as the cell output
query_params


{'editSearch': 'Y',
 'category': 'S',
 'propertyType': '10002,10003,10021,10022',
 'budgetMax': '45000000',
 'city': '4378',
 'page': '2',
 'sortBy': 'Highest_Price',
 'postedSince': '-1',
 'pType': '10002,10003,10021,10022',
 'isNRI': 'N',
 'multiLang': 'en'}

## Scraping resultLists to get property details in bulk

After this data is scraped, the urls for each unique property are stored in a list. The list is then used to scrape the property details for each property separately.

In [111]:
import os
import time
import json
import logging
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

# Append ERROR-level (and higher) records to a log file
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='scraping_errors.log',
    filemode='a',
)

# Chrome start-up flags; headless mode is left disabled here
chrome_options = Options()
# chrome_options.add_argument("--headless")  # Runs Chrome in headless mode.
for chrome_flag in (
    '--no-sandbox',   # Bypass OS security model
    '--disable-gpu',  # Applicable to Windows OS only
):
    chrome_options.add_argument(chrome_flag)

# Start Chrome using the driver path returned by ChromeDriverManager().install()
driver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=driver_service, options=chrome_options)


In [112]:
# Search endpoint and query parameters.
# HTTPS is used so the requests match the URL captured from the site itself
# (and the property-details URL used later) instead of going over plain HTTP.
base_url = "https://www.magicbricks.com/mbsrp/propertySearch.html"

# Parameters sent with every request. 'page' is only a starting value: the
# scraping loop overwrites it before each request.
query_params = {
    'editSearch': 'Y',
    'category': 'S',
    'propertyType': '10002,10003,10021,10022',
    'budgetMax': '850000',
    'city': '4378',
    'page': '1',
    'sortBy': 'Highest_Price',
    'postedSince': '-1',
    'pType': '10002,10003,10021,10022',
    'isNRI': 'N',
    'multiLang': 'en',
}

In [None]:
# Create the directory the scraping loop actually writes its JSON files into
# (files are saved as Data/resultLists/list8LacMinus<page>.json)
os.makedirs('Data/resultLists', exist_ok=True)

In [None]:
# Stop after this many consecutive failed requests
MAX_FAILED_ATTEMPTS = 5
# Hard cap on the number of requests (successful or failed) in one run
MAX_REQUESTS = 100
# Directory the result-list JSON files are written to
RESULT_LISTS_DIRECTORY = 'Data/resultLists'


def build_search_url(endpoint, params):
    """Return *endpoint* followed by *params* as a ``key=value&...`` query string.

    Keys and values are inserted verbatim (no percent-encoding), in the
    iteration order of *params*.
    """
    return f"{endpoint}?" + "&".join(f"{k}={v}" for k, v in params.items())


def is_empty_page(payload):
    """Return True when *payload* is a dict without any ``resultList`` entries.

    A missing, ``None`` or empty ``resultList`` all count as "no listings".
    Payloads that are not dicts are never reported as empty.
    """
    return isinstance(payload, dict) and not payload.get('resultList')


if __name__ == "__main__":
    # Consecutive-failure counter; reset after every successful page
    failed_attempts = 0

    # Page number requested next
    page = 1

    # The files are written into this directory, so make sure it exists
    os.makedirs(RESULT_LISTS_DIRECTORY, exist_ok=True)

    try:
        for _ in range(MAX_REQUESTS):
            try:
                # Update page number in query parameters
                query_params['page'] = str(page)

                # Construct the URL and load it in the browser
                url_with_params = build_search_url(base_url, query_params)
                driver.get(url_with_params)

                # Wait for a moment to ensure the page has loaded
                time.sleep(5)  # Adjust the sleep time if necessary

                # The endpoint answers with JSON, which the browser shows as
                # the text of the page body
                json_content = driver.execute_script("return document.body.innerText;")
                response_json = json.loads(json_content)

                # Past the last page the response carries no listings; stop
                # instead of requesting and saving empty pages until the cap
                if is_empty_page(response_json):
                    print(f"No listings returned for page={page}. Stopping the scraping process.")
                    break

                # Save JSON data
                file_name = f"{RESULT_LISTS_DIRECTORY}/list8LacMinus{page}.json"
                with open(file_name, 'w', encoding='utf-8') as json_file:
                    json.dump(response_json, json_file, ensure_ascii=False, indent=4)

                print(f"Saved: {file_name}, with page={page}")

                # Success: reset the failure counter and move to the next page
                failed_attempts = 0
                page += 1

                # Wait between requests
                time.sleep(2)

            except Exception as e:
                # 'page' is not advanced, so the same page is retried on the
                # next iteration
                logging.error("Failed for page=%s with error: %s", page, e)
                failed_attempts += 1

                # If too many failures, exit the loop
                if failed_attempts >= MAX_FAILED_ATTEMPTS:
                    print("Too many consecutive failures. Stopping the scraping process.")
                    break
    finally:
        # Close the browser even when the run is interrupted
        driver.quit()

### Code to check for number of unique listings

In [None]:
import os
import json

# Directory where JSON files are saved
data_directory = '../Data/resultLists'


def scan_result_lists(directory, expected_count=30):
    """Collect listing IDs and URLs from every ``.json`` file in *directory*.

    Files are processed in sorted file-name order. Only listings whose
    ``price`` is not ``None`` are considered. A file is recorded in
    ``file_issues`` when its ``resultList`` does not hold exactly
    *expected_count* entries, or when reading/processing it raises.

    Returns a dict with the keys:
        files_processed -- number of ``.json`` files visited
        file_issues     -- list of ``(file_name, count_or_error_message)``
        ids, urls       -- unique values in first-seen order
        duplicate_ids, duplicate_urls -- sets of values seen more than once
    """
    all_ids = []
    all_urls = []
    # Sets mirror the lists so membership tests stay O(1) for large crawls
    seen_ids = set()
    seen_urls = set()
    duplicate_ids = set()
    duplicate_urls = set()
    file_issues = []

    json_files = sorted(
        name for name in os.listdir(directory) if name.endswith('.json')
    )

    for file_name in json_files:
        file_path = os.path.join(directory, file_name)

        try:
            with open(file_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)

            # A missing or null resultList is treated as an empty page
            result_list = data.get('resultList') or []

            if len(result_list) != expected_count:
                file_issues.append((file_name, len(result_list)))

            priced = [item for item in result_list if item.get('price') is not None]

            # Extract IDs and check for duplicates
            for property_id in [item['id'] for item in priced]:
                if property_id in seen_ids:
                    duplicate_ids.add(property_id)
                else:
                    seen_ids.add(property_id)
                    all_ids.append(property_id)

            # Extract URLs and check for duplicates
            for property_url in [item['url'] for item in priced]:
                if property_url in seen_urls:
                    duplicate_urls.add(property_url)
                else:
                    seen_urls.add(property_url)
                    all_urls.append(property_url)

        except Exception as e:
            # Best-effort report: note the failing file and carry on
            file_issues.append((file_name, f"Error reading file: {e}"))

    return {
        'files_processed': len(json_files),
        'file_issues': file_issues,
        'ids': all_ids,
        'urls': all_urls,
        'duplicate_ids': duplicate_ids,
        'duplicate_urls': duplicate_urls,
    }


if __name__ == "__main__":
    report = scan_result_lists(data_directory)

    # Keep the results available under their usual notebook-level names
    all_ids = report['ids']
    all_urls = report['urls']
    file_issues = report['file_issues']
    duplicate_ids = report['duplicate_ids']
    duplicate_urls = report['duplicate_urls']
    duplicate_ids_list = list(duplicate_ids)
    duplicate_urls_list = list(duplicate_urls)

    # Report results
    print(f"Total JSON files processed: {report['files_processed']}")
    print(f"Files with issues (not 30 listings or errors): {file_issues}")
    print(f"Total unique IDs collected: {len(all_ids)}")
    print(f"Duplicate IDs found: {len(duplicate_ids_list)}")

    print(f"Total unique URLs collected: {len(all_urls)}")
    print(f"Duplicate URLs found: {len(duplicate_urls_list)}")

    if duplicate_ids_list:
        print("Duplicate IDs:")
        # print(duplicate_ids_list)
    else:
        print("No duplicate IDs found.")

    if duplicate_urls_list:
        print("Duplicate URLs:")
        # print(duplicate_urls_list)
    else:
        print("No duplicate URLs found.")


Total JSON files processed: 2555
Files with issues (not 30 listings or errors): [('list15CrPlus3.json', 2), ('list540LacPlus27.json', 13), ('list8LacMinus100.json', 0), ('list8LacMinus15.json', 6), ('list8LacMinus16.json', 0), ('list8LacMinus17.json', 0), ('list8LacMinus18.json', 0), ('list8LacMinus19.json', 0), ('list8LacMinus20.json', 0), ('list8LacMinus21.json', 0), ('list8LacMinus22.json', 0), ('list8LacMinus23.json', 0), ('list8LacMinus24.json', 0), ('list8LacMinus25.json', 0), ('list8LacMinus26.json', 0), ('list8LacMinus27.json', 0), ('list8LacMinus28.json', 0), ('list8LacMinus29.json', 0), ('list8LacMinus30.json', 0), ('list8LacMinus31.json', 0), ('list8LacMinus32.json', 0), ('list8LacMinus33.json', 0), ('list8LacMinus34.json', 0), ('list8LacMinus35.json', 0), ('list8LacMinus36.json', 0), ('list8LacMinus37.json', 0), ('list8LacMinus38.json', 0), ('list8LacMinus39.json', 0), ('list8LacMinus40.json', 0), ('list8LacMinus41.json', 0), ('list8LacMinus42.json', 0), ('list8LacMinus43.j

### Saving unique listings separately

In [None]:
import os
import json

# Directories
input_directory = '../Data/resultLists'  # Replace with your input directory path
unique_json_directory = '../Data/uniqueResults'  # Directory to save unique JSONs
combined_json_file = '../Data/uniqueResultsCombined.json'  # Path for combined JSON


def save_unique_listings(input_dir, unique_dir, combined_file):
    """Write each distinct listing from *input_dir* to its own JSON file.

    Every ``.json`` file in *input_dir* is read in sorted file-name order and
    its ``resultList`` entries are de-duplicated by ``id``; the first
    occurrence wins. Each unique listing is saved as ``<id>.json`` inside
    *unique_dir* (created if needed), and the whole ``{id: listing}`` mapping
    is written to *combined_file*. Listings with a missing or falsy ``id`` are
    skipped; files that are not valid JSON are reported and skipped.

    Returns the ``{id: listing}`` dictionary.
    """
    # Ensure output directory exists
    os.makedirs(unique_dir, exist_ok=True)

    unique = {}

    # Sorted so that the listing kept for a duplicated ID does not depend on
    # the order in which the filesystem happens to list the files
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(input_dir, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in file {filename}: {e}")
            continue

        # A missing or null resultList is treated as an empty page
        for listing in data.get('resultList') or []:
            listing_id = listing.get('id')
            if listing_id and listing_id not in unique:
                unique[listing_id] = listing

                # Save this property as a separate JSON file
                unique_file_path = os.path.join(unique_dir, f"{listing_id}.json")
                with open(unique_file_path, 'w', encoding='utf-8') as unique_file:
                    json.dump(listing, unique_file, indent=4)

    # Save the combined JSON
    with open(combined_file, 'w', encoding='utf-8') as combined_handle:
        json.dump(unique, combined_handle, indent=4)

    return unique


if __name__ == "__main__":
    unique_properties = save_unique_listings(
        input_directory, unique_json_directory, combined_json_file
    )

    print(f"Processing complete. Unique JSONs saved to {unique_json_directory}. Combined JSON saved to {combined_json_file}.")


Processing complete. Unique JSONs saved to scraped_data2. Combined JSON saved to combined.json.


### Extracting list of property ids and urls for further scraping

In [None]:
import json
import pandas as pd

# Path to the combined JSON file and output CSV file
combined_json_file = '../Data/uniqueResultsCombined.json'  # Replace with the actual path
csv_output_file = '../Data/idsAndUrls.csv'  # Replace with the desired CSV file path


def build_id_url_table(combined_data):
    """Return a DataFrame with one ``id``/``url`` row per property.

    *combined_data* maps a property ID to that property's dict; ``url`` is
    ``None`` for properties without a ``url`` key. The two columns are fixed
    explicitly so that an empty mapping still yields an ``id,url`` header
    rather than a column-less frame.
    """
    rows = [
        {'id': prop_id, 'url': prop_data.get('url')}
        for prop_id, prop_data in combined_data.items()
    ]
    return pd.DataFrame(rows, columns=['id', 'url'])


if __name__ == "__main__":
    # Load the combined JSON file
    with open(combined_json_file, 'r', encoding='utf-8') as file:
        combined_data = json.load(file)

    # Create a DataFrame and save it as a CSV
    df = build_id_url_table(combined_data)
    df.to_csv(csv_output_file, index=False)

    print(f"CSV file with id and url has been created at {csv_output_file}.")


CSV file with id and url has been created at properties.csv.


## Scraping details from each property's webpage

The data obtained from the resultLists scraping is incomplete. However, it contains the URL of each property, so we can use that URL to scrape the full details from each property's own page.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import json
import os
import pandas as pd
import time

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"  # prefix for each property's relative URL
output_directory = "../Data/propertyDetails"  # one <id>.json file per property is written here

# Load the id/url table produced from the result lists
df = pd.read_csv("../Data/idsAndUrls.csv")

# Ensure output directory exists (exist_ok avoids a separate existence check)
os.makedirs(output_directory, exist_ok=True)

In [None]:
# File that accumulates the IDs of properties that could not be scraped
ERROR_IDS_FILE = 'error_ids.txt'

# Slice of the id/url table handled by this run (end index is exclusive)
START_INDEX = 29999
STOP_INDEX = 33889


def read_error_ids(path):
    """Return the IDs stored one per line in *path* as strings.

    Blank lines are ignored. A missing file yields an empty list, so the very
    first run does not need the file to exist.
    """
    try:
        with open(path, 'r') as f:
            lines = f.read().splitlines()
    except FileNotFoundError:
        return []
    return [line.strip() for line in lines if line.strip()]


def write_error_ids(path, ids):
    """Overwrite *path* with *ids*, one per line, and return what was written.

    IDs are converted to ``str`` and de-duplicated in first-seen order. The
    file is rewritten rather than appended to: *ids* already contains the
    previously stored entries, and appending would duplicate them and glue
    the first new ID onto the last old line.
    """
    unique_ids = list(dict.fromkeys(str(i) for i in ids))
    with open(path, 'w') as f:
        f.write('\n'.join(unique_ids))
    return unique_ids


if __name__ == "__main__":
    # Read earlier failures before starting the browser, so a problem with
    # the file cannot leave a browser process behind
    error_ids = read_error_ids(ERROR_IDS_FILE)

    # Initialize WebDriver
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Run in headless mode
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    try:
        # Waits up to 10 seconds for the page's preloaded state to appear
        wait = WebDriverWait(driver, 10)

        # Never index past the end of the table
        for i in range(START_INDEX, min(STOP_INDEX, len(df))):
            # Looked up outside the inner try so both names are always bound
            # when the handler below reports a failure
            row = df.iloc[i]
            property_id = row['id']
            property_url = row['url']
            full_url = f"{base_url}{property_url}"

            try:
                # Navigate to the URL
                driver.get(full_url)

                # Poll until the JavaScript variable holds a truthy value
                property_details = wait.until(
                    lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
                )

                # Save the JSON data to a file
                output_file = os.path.join(output_directory, f"{property_id}.json")
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(property_details, f, indent=4)

                print(f"Successfully saved data for property ID: {property_id} with iter number: {i}")

                # time.sleep(1)

            except Exception as e:
                error_ids.append(property_id)
                print(f"Error processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n\n")

    finally:
        try:
            # Close the driver
            driver.quit()
        finally:
            # Persist the failures even if closing the browser itself fails
            error_ids = write_error_ids(ERROR_IDS_FILE, error_ids)
            print(f"Total Error Ids: {len(error_ids)}\nError Ids: {error_ids}")

## Final Scraping re-run for error_ids

In [None]:
import json
import os

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Base variables
base_url = "https://www.magicbricks.com/propertyDetails/"  # prefix for each property's relative URL
# Same directory as the first scraping pass, and the same "../Data" root the
# CSV below is read from, so re-scraped files land next to the earlier ones
output_directory = "../Data/propertyDetails"

# Load the id/url table, indexed by property id for direct lookups
df = pd.read_csv("../Data/idsAndUrls.csv", index_col='id')

# Ensure output directory exists (exist_ok avoids a separate existence check)
os.makedirs(output_directory, exist_ok=True)

In [None]:

def load_retry_ids(path):
    """Return the integer IDs stored one per line in *path*.

    Blank lines are skipped and duplicates are dropped, keeping the first
    occurrence. Raises ``FileNotFoundError`` if *path* does not exist and
    ``ValueError`` if a non-blank line is not an integer.
    """
    with open(path, 'r') as f:
        lines = f.read().splitlines()
    return list(dict.fromkeys(int(line) for line in lines if line.strip()))


def save_retry_ids(path, ids):
    """Overwrite *path* with *ids*, one per line."""
    with open(path, 'w') as f:
        f.write('\n'.join(str(i) for i in ids))


if __name__ == "__main__":
    # Read the pending IDs before starting the browser, so a missing or
    # malformed file cannot leave a browser process behind
    error_ids = load_retry_ids('error_ids.txt')

    # Initialize WebDriver
    options = webdriver.ChromeOptions()

    # options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    # IDs whose details were saved during this run
    saved_ids = set()

    try:
        # Waits up to 10 seconds for the page's preloaded state to appear
        wait = WebDriverWait(driver, 10)

        for i, property_id in enumerate(error_ids):
            # Reset per iteration so the handler never reports a stale URL
            full_url = None
            try:
                # Get property details
                property_url = df.loc[property_id]['url']
                full_url = f"{base_url}{property_url}"

                # Navigate to the URL
                driver.get(full_url)

                # Parse the page source with BeautifulSoup
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')

                # Check if the body tag has class "error"
                body_tag = soup.find('body')
                if body_tag and 'error' in body_tag.get('class', []):
                    print(f"Error page detected for id: {property_id} and iter number: {i}")
                    continue  # Skip further processing for this URL

                # Poll until the JavaScript variable holds a truthy value
                property_details = wait.until(
                    lambda d: d.execute_script("return window.SERVER_PRELOADED_STATE_DETAILS")
                )

                # Save the JSON data to a file
                output_file = os.path.join(output_directory, f"{property_id}.json")
                with open(output_file, 'w', encoding='utf-8') as f:
                    json.dump(property_details, f, indent=4)

                saved_ids.add(property_id)
                print(f"\nSuccessfully saved data for property ID: {property_id} with iter number: {i}\n")

            except Exception as e:
                # The ID stays in error_ids and is therefore written back below
                print(f"Error processing iteration: {i} with property ID {property_id}: {e}\nURL: {full_url}\n\n")

    finally:
        try:
            # Close the driver
            driver.quit()
        finally:
            # Keep only the IDs that still lack saved details: failures,
            # error pages, and anything not reached before an interruption
            error_ids = [pid for pid in error_ids if pid not in saved_ids]
            print(f"Total Error Ids: {len(error_ids)}\nError Ids: {error_ids}")
            save_retry_ids('error_ids.txt', error_ids)


In [1]:
import json

# Input GeoJSON file and the two output files
input_file = r"Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\Pune_Landmarks_and_Localities.geojson"  # Replace with your file name
points_output_file = r"Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPoints.json"
polygons_output_file = r"Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPolygons.json"


def split_features_by_geometry(features):
    """Partition GeoJSON features into ``(points, polygons)``.

    Features with a ``Point`` geometry go to the first list; ``Polygon`` and
    ``MultiPolygon`` features go to the second. Features with any other
    geometry type, or with a null/missing geometry (which GeoJSON allows),
    are left out of both lists. Input order is preserved.
    """
    points = []
    polygons = []
    for feature in features:
        # 'geometry' may be null for unlocated features
        geometry = feature.get('geometry') or {}
        geometry_type = geometry.get('type')
        if geometry_type == "Point":
            points.append(feature)
        elif geometry_type in ("Polygon", "MultiPolygon"):
            polygons.append(feature)
    return points, polygons


if __name__ == "__main__":
    # Load the GeoJSON file
    with open(input_file, 'r', encoding='utf-8') as file:
        geojson_data = json.load(file)

    points, polygons = split_features_by_geometry(geojson_data['features'])

    # Create GeoJSON structures for points and polygons
    points_geojson = {
        "type": "FeatureCollection",
        "features": points
    }

    polygons_geojson = {
        "type": "FeatureCollection",
        "features": polygons
    }

    # Save the points and polygons GeoJSON to separate files
    with open(points_output_file, 'w', encoding='utf-8') as file:
        json.dump(points_geojson, file, indent=4)

    with open(polygons_output_file, 'w', encoding='utf-8') as file:
        json.dump(polygons_geojson, file, indent=4)

    print(f"Points saved to {points_output_file}")
    print(f"Polygons saved to {polygons_output_file}")


Points saved to Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPoints.json
Polygons saved to Data\MiscellaneousResources\OpenStreetMapsData\Pune_Landmarks_and_Localities_geojson\LandmarkPolygons.json
