In [14]:
import os
import requests
import json
import time
import pandas as pd
from time import sleep
from math import ceil
from random import randint
from requests import HTTPError

In [19]:
def _parse_listing(item):
    """Flatten one search result into the row dict returned by get_realtor_listings.

    Raises KeyError when the result has no 'Property' entry; the caller logs
    the error and skips that listing.
    """
    prop = item['Property']
    # `or {}` / `or ''` guard against keys that are present but null in the JSON.
    building = prop.get('Building') or {}
    address_text = (prop.get('Address') or {}).get('AddressText') or 'N/A'
    relative_time = item.get('RelativeTime') or ''

    return {
        # Keep only the part of the address before the first comma.
        'Address': address_text.split(',')[0].strip(),
        'Price': prop.get('Price', 'N/A'),
        # NOTE(review): the key is misspelled; kept as-is because renaming it
        # would change the column name seen by existing consumers.
        'Bedroooms': building.get('Bedrooms', 'N/A'),
        'Bathrooms': building.get('Bathrooms', 'N/A'),
        'SqFt': building.get('SizeInterior', 'N/A'),
        'Parking': building.get('ParkingType', 'N/A'),
        'Age': building.get('YearBuilt', 'N/A'),
        # NOTE(review): this counts occurrences of the substring "day" in
        # RelativeTime (floored at 1); it does not parse a number of days.
        # Confirm the intended meaning against the API's RelativeTime format.
        'TimeOnMarketDays': max(1, relative_time.lower().count('day')),
    }


def get_realtor_listings(page_number=1, max_pages=3, timeout=10):
    """Fetch listing pages from the realtor.ca search endpoint and flatten them.

    Pages ``page_number`` through ``max_pages`` (inclusive) are requested in
    order with a fixed search payload. Fetching stops at the first response
    whose status is not 200; the status code is printed and the rows collected
    so far are returned.

    Args:
        page_number: First page to request.
        max_pages: Last page to request (inclusive).
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of dicts with the keys 'Address', 'Price', 'Bedroooms',
        'Bathrooms', 'SqFt', 'Parking', 'Age' and 'TimeOnMarketDays'.
        Fields missing from a listing default to 'N/A'.
    """
    url = 'https://api2.realtor.ca/Listing.svc/PropertySearch_Post'

    headers = {
        'Content-Type': 'application/json',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    all_results = []

    for page in range(page_number, max_pages + 1):
        if page > page_number:
            # Pause between consecutive requests only; no delay is needed
            # before the first request or after the last one.
            time.sleep(1)

        payload = {
            'CultureId': 1,
            'ApplicationId': 1,
            'RecordsPerPage': 10,
            'MaximumResults': 500,
            'PropertySearchTypeId': 1,
            'TransactionTypeId': 2,
            'StoreyRange': '0-0',
            'BedRange': '0-0',
            'BathRange': '0-0',
            'PriceMin': 0,
            'PriceMax': 0,
            'SortOrder': 'A',
            'SortBy': '1',
            'LongitudeMin': -123.40,
            'LongitudeMax': -122.80,
            'LatitudeMin': 49.15,
            'LatitudeMax': 49.35,
            'CurrentPage': page,
            'PropertyTypeGroupID': 1,
            'PropertyTypeId': [300],
            'City': 'Burnaby'
        }
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.post(
            url, headers=headers, data=json.dumps(payload), timeout=timeout)

        if response.status_code != 200:
            print(f'Error: {response.status_code}')
            break

        results = response.json().get('Results') or []
        for item in results:
            try:
                all_results.append(_parse_listing(item))
            except Exception as e:
                # Best effort: one malformed listing must not abort the page.
                print(f"Error parsing property: {e}")

    return all_results

In [18]:
# Fetch result pages 1-2 and load the parsed rows into a DataFrame.
data = get_realtor_listings(page_number=1, max_pages=2)
df = pd.DataFrame(data)
# Preview the first rows. If the API rejects the request, get_realtor_listings
# prints the status code and returns whatever it collected, so the frame may
# be empty.
df.head()

Error: 403


In [13]:
# THE FOLLOWING CODE IS FROM A PUBLIC GITHUB REPO FOR SCRAPING
# https://github.com/harry-s-grewal/mls-real-estate-scraper-for-realtor.ca

""" Contains all queries to the Realtor.ca API and OpenStreetMap."""
def get_coordinates(city):
    """Gets the coordinate bounds of a city from OpenStreetMap (Nominatim).

    Args:
        city: Free-text place name to search for.

    Returns:
        The "boundingbox" of the first search result whose class is
        "boundary" and type is "administrative", in the order
        [latMin, latMax, lonMin, lonMax]. If no result matches, the raw list
        of search results is returned instead, so callers should check what
        they received before indexing into it.

    Raises:
        requests.HTTPError: if the search request returns an error status.
    """
    # Pass the query through `params` so requests URL-encodes it; a city name
    # containing '&', '#' or similar would otherwise corrupt the query string.
    # NOTE(review): Nominatim's usage policy asks clients to send an
    # identifying User-Agent; consider adding one if requests are rejected.
    response = requests.get(
        url="https://nominatim.openstreetmap.org/search",
        params={"q": city, "format": "json"},
        timeout=10)
    response.raise_for_status()
    places = response.json()
    for place in places:
        if (place.get("class") == "boundary" and
                place.get("type") == "administrative"):
            return place["boundingbox"]  # [latMin, latMax, lonMin, lonMax]
    return places

#pylint: disable=too-many-arguments
def get_property_list(
        lat_min, lat_max, long_min, long_max,
        price_min=0, price_max=10000000,
        records_per_page=200, culture_id=1,
        current_page=1, application_id=1):
    """Requests one page of property search results from the Realtor.ca API.

    The latitude/longitude bounds, price range and paging options are sent as
    form fields in a POST request. A non-200 status is reported on stdout
    before ``raise_for_status`` is called.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the response carries an error status.
    """
    endpoint = "https://api2.realtor.ca/Listing.svc/PropertySearch_Post"
    request_headers = dict(
        Referer="https://www.realtor.ca/",
        Origin="https://www.realtor.ca/",
        Host="api2.realtor.ca",
    )
    search_form = dict(
        LatitudeMin=lat_min,
        LatitudeMax=lat_max,
        LongitudeMin=long_min,
        LongitudeMax=long_max,
        PriceMin=price_min,
        PriceMax=price_max,
        RecordsPerPage=records_per_page,
        CultureId=culture_id,
        CurrentPage=current_page,
        ApplicationId=application_id,
    )

    reply = requests.post(
        url=endpoint, headers=request_headers, data=search_form, timeout=10)

    status = reply.status_code
    if status != 200:
        # 403 gets its own message; every other failure prints the bare code.
        message = "Error 403: Rate limited" if status == 403 else "Error " + str(status)
        print(message)

    reply.raise_for_status()
    return reply.json()


def get_property_details(property_id, mls_reference_number):
    """Queries the Realtor.ca API to get details of a property.

    Args:
        property_id: The listing's id; converted with ``str`` so both string
            and integer ids are accepted.
        mls_reference_number: The listing's MLS reference number; also
            converted with ``str``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the response carries an error status.
    """
    url = "https://api2.realtor.ca/Listing.svc/PropertyDetails"
    # Sending the query as `params` lets requests URL-encode the values and
    # avoids the TypeError that string concatenation raised for integer ids.
    params = {
        "ApplicationId": 1,
        "CultureId": 1,
        "PropertyID": str(property_id),
        "ReferenceNumber": str(mls_reference_number),
    }

    headers = {"Referer": "https://www.realtor.ca/",
               "Origin": "https://www.realtor.ca/",
               "Host": "api2.realtor.ca"}
    response = requests.get(url=url, params=params, headers=headers, timeout=10)
    if response.status_code == 403:
        print("Error 403: Rate limited")
    elif response.status_code != 200:
        print("Error " + str(response.status_code))
    response.raise_for_status()
    return response.json()

""" Wraps the queries module to get property data from realtor.ca. """

def get_property_list_by_city(city):
    """ Gets the list of properties for a given city and saves it to a CSV file.

    The city's bounding box is looked up with ``get_coordinates`` and the
    search results are fetched page by page with ``get_property_list``. After
    every page the accumulated rows are written to ``<city>.csv`` (spaces and
    commas removed from the name) in the current directory. If that file
    already exists, fetching resumes after the pages it already holds.

    Nothing is returned; the CSV file is the result. A page that fails with
    an HTTPError is retried after a long pause.
    """
    # Page size requested from the API; the resume arithmetic below must use
    # the same value to work out how many pages are already on disk.
    records_per_page = 200

    coords = get_coordinates(city)  # Creates bounding box for city
    max_pages = 1
    current_page = 1
    filename = city.replace(" ", "").replace(",", "") + ".csv"
    if os.path.exists(filename):
        results_df = pd.read_csv(filename)
        ## If the queries were interrupted, this will resume from the last page
        current_page = ceil(results_df.shape[0] / records_per_page) + 1
        # Placeholder so the loop runs once; the real page count replaces it
        # after the first successful response.
        max_pages = current_page + 1
    else:
        results_df = pd.DataFrame()

    while current_page <= max_pages:
        try:
            data = get_property_list(
                coords[0], coords[1],
                coords[2], coords[3],
                records_per_page=records_per_page,
                current_page=current_page)
            ## Rounds up the total records by the records per page to nearest int
            paging = data["Paging"]
            max_pages = ceil(paging["TotalRecords"] / paging["RecordsPerPage"])

            page_results = data["Results"]
            if page_results:
                # DataFrame.append was removed in pandas 2.0; flatten the whole
                # page at once and concatenate a single time per page.
                page_df = pd.json_normalize(page_results)
                if results_df.empty:
                    results_df = page_df
                else:
                    results_df = pd.concat(
                        [results_df, page_df], ignore_index=True)

            results_df.to_csv(filename, index=False)
            current_page += 1
            if current_page <= max_pages:
                # Only wait when another page will actually be requested.
                sleep(randint(600, 900))  # sleep 10-15 minutes to avoid rate-limit
        except HTTPError:
            print("Error occurred on city: " + city)
            sleep(randint(3000, 3600))  # sleep for 50-60 minutes if limited


def get_property_details_from_csv(filename):
    """ Gets the details of a list of properties from the CSV file created above.

    Every row whose "HasDetails" value is not 1 is looked up with
    ``get_property_details`` using its "Id" and "MlsNumber" columns. The
    flattened detail fields are stored on that same row (a detail field with
    the same name as an existing column replaces the value for that row),
    "HasDetails" is set to 1, and the CSV is rewritten after each property so
    an interrupted run can be resumed.

    Nothing is returned; the file at ``filename`` is updated in place. A row
    that fails with an HTTPError is skipped after a long pause and is picked
    up again on the next run.
    """
    listings = pd.read_csv(filename)
    if "HasDetails" not in listings.columns:
        listings["HasDetails"] = 0

    # Work on plain row dicts: joining a one-row details frame onto the full
    # frame aligns on index 0, which attached every property's details to the
    # first row and piled up suffixed duplicate columns on later iterations.
    records = listings.to_dict("records")

    for record in records:
        if record["HasDetails"] == 1: # Avoids re-querying properties that already have details
            continue
        property_id = str(record["Id"])
        mls_reference_number = str(record["MlsNumber"])
        try:
            data = get_property_details(property_id, mls_reference_number)
        except HTTPError:
            print("Error occurred on propertyID: " + property_id)
            sleep(randint(3000, 3600))  # sleep for 50-60 minutes if limited
            continue

        details = pd.json_normalize(data)  # one row of flattened, dotted keys
        if not details.empty:
            record.update(details.iloc[0].to_dict())
        record["HasDetails"] = 1

        # Checkpoint after every property; missing fields on other rows are
        # written as empty cells.
        pd.DataFrame(records).to_csv(filename, index=False)
        sleep(randint(600, 900))  # sleep 10-15 minutes to avoid rate-limit