In [1]:
# The following code defines a function that takes the path of a saved HTML page and returns a list with one dictionary per property.
# It is important to check the CSS classes of the cards and of the property details, because they sometimes change and then the script won't run.

In [2]:
from bs4 import BeautifulSoup

# Default values of the `class` attribute used to locate listing cards and
# their details area. These class names change from time to time; when they
# do, pass the new values through the keyword arguments of
# extract_property_data instead of editing the function body.
DEFAULT_CARD_CLASS = 'ListItem-c11n-8-107-0__sc-13rwu5a-0 StyledListCardWrapper-srp-8-107-0__sc-wtsrtn-0 dAZKuw xoFGK'
DEFAULT_DETAILS_CLASS = 'StyledPropertyCardDataArea-c11n-8-107-0__sc-10i1r6-0 eLqtVY'


def extract_property_data(file_path, card_class=DEFAULT_CARD_CLASS, details_class=DEFAULT_DETAILS_CLASS):
    """
    Extracts property data from an HTML file and returns a list of dictionaries with the data.

    Parameters:
    - file_path (str): Path to the HTML file to be processed.
    - card_class (str): Value of the `class` attribute of the <li> elements
      that wrap one property card. Defaults to the class string the notebook
      was written against.
    - details_class (str): Value of the `class` attribute of the <div> that
      holds the bedrooms/bathrooms/size/type line inside a card.

    Returns:
    - List[dict]: A list of dictionaries, each containing data for a property.
      A dictionary only has the keys whose tag was found in the card
      ('Price', 'Property Details', 'Address'); cards where nothing was found
      are skipped.

    Side effects:
    - Prints the number of cards found in the file.
    """
    # Initialize the list to store property information
    all_property_info = []

    # Open and parse the HTML file
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')

    # Find all property cards
    property_cards = soup.find_all('li', class_=card_class)
    print(f"Number of property cards found in {file_path}: {len(property_cards)}")

    # Iterate over each property card and extract information
    for card in property_cards:
        property_info = {}

        # Extract the price (if present)
        price_tag = card.find('span', {'data-test': 'property-card-price'})
        if price_tag:
            property_info['Price'] = price_tag.get_text(strip=True)

        # Extract the property details (bedrooms, bathrooms, size, type).
        # The text is stored as one raw line and is split up in a later step.
        property_details_tag = card.find('div', class_=details_class)
        if property_details_tag:
            property_info['Property Details'] = property_details_tag.get_text(strip=True)

        # Extract the address (if present)
        address_tag = card.find('address', {'data-test': 'property-card-addr'})
        if address_tag:
            property_info['Address'] = address_tag.get_text(strip=True)

        # Add the extracted information to the list if not empty
        if property_info:
            all_property_info.append(property_info)

    # Return the list of extracted property information
    return all_property_info

# Example usage: parse one saved results page and print the extracted records.
file_path = './DataHTML/3.html'
# The call prints the number of cards found and returns a list of dicts.
property_data = extract_property_data(file_path)
print(property_data)



Number of property cards found in ./DataHTML/3.html: 40
[{'Price': 'C$3,488,000', 'Property Details': '4bds3ba2,334sqft- House for sale', 'Address': '3688 W 49th Ave, Vancouver, BC V6N 3T8'}, {'Price': 'C$1,250,000', 'Property Details': '3bds3ba1,150sqft- Townhouse for sale', 'Address': '1516 W 61st Ave, Vancouver, BC V6P 2B4'}, {'Price': 'C$1,699,999', 'Property Details': '2bds2ba1,349sqft- Condo for sale', 'Address': '833 Seymour St #3702, Vancouver, BC V6B 0G4'}, {'Price': 'C$1,199,000', 'Property Details': '3bds4ba1,580sqft- Townhouse for sale', 'Address': '328 Semlin Dr, Vancouver, BC V5L 0A6'}, {'Price': 'C$15,700,000', 'Property Details': '7bds8ba6,912sqft- House for sale', 'Address': '6615 Balsam St, Vancouver, BC V6P 5W8'}, {'Price': 'C$1,449,900', 'Property Details': '6bds4ba2,050sqft- House for sale', 'Address': '3746 Inverness St, Vancouver, BC V5V 4V9'}, {'Price': 'C$999,999', 'Property Details': '3bds2ba1,199sqft- Townhouse for sale', 'Address': '2699 Duke St, Vancouver, 

In [3]:
# Here we define a function that iterates over a series of files and runs the function defined above, so it can scrape
# multiple pages at once and return a final dataset

In [4]:
import os

# Assuming extract_property_data is defined as above
def extract_all_properties(directory, num_files):
    """
    Run extract_property_data over a numbered series of HTML files and merge the results.

    The files are expected to be named 1.html, 2.html, ..., <num_files>.html
    inside `directory`, and are processed in that order.

    Parameters:
    - directory (str): Folder that contains the numbered HTML files.
    - num_files (int): Highest file number to process (num_files=10 reads 1.html to 10.html).

    Returns:
    - List[dict]: The property dictionaries of every file, concatenated in file order.
    """
    combined = []
    page_number = 1

    while page_number <= num_files:
        page_path = os.path.join(directory, f"{page_number}.html")

        # Parse this page and report how many records it contributed
        page_records = extract_property_data(page_path)
        print(f"Number of properties in {page_path}: {len(page_records)}")  # Check properties count per file

        combined.extend(page_records)
        page_number += 1

    return combined

# Example usage
directory = './DataHTML'  # Update with the path to your HTML files
num_files = 22  # Reads 1.html through 22.html inside `directory`
all_property_data = extract_all_properties(directory, num_files)

# Output all combined data
print(all_property_data)
# Total number of property records collected across all files
print(len(all_property_data))


Number of property cards found in ./DataHTML\1.html: 40
Number of properties in ./DataHTML\1.html: 40
Number of property cards found in ./DataHTML\2.html: 40
Number of properties in ./DataHTML\2.html: 40
Number of property cards found in ./DataHTML\3.html: 40
Number of properties in ./DataHTML\3.html: 40
Number of property cards found in ./DataHTML\4.html: 40
Number of properties in ./DataHTML\4.html: 40
Number of property cards found in ./DataHTML\5.html: 40
Number of properties in ./DataHTML\5.html: 40
Number of property cards found in ./DataHTML\6.html: 40
Number of properties in ./DataHTML\6.html: 40
Number of property cards found in ./DataHTML\7.html: 40
Number of properties in ./DataHTML\7.html: 40
Number of property cards found in ./DataHTML\8.html: 40
Number of properties in ./DataHTML\8.html: 40
Number of property cards found in ./DataHTML\9.html: 40
Number of properties in ./DataHTML\9.html: 40
Number of property cards found in ./DataHTML\10.html: 40
Number of properties in .

In [5]:
import re
import pandas as pd

# Turn the raw scraped records in all_property_data into one cleaned row per
# property and collect the rows in the DataFrame df_cleaned_properties.

# List to store cleaned property info
all_property_info_cleaned = []

# Regex patterns for extracting numbers
price_pattern = r'C\$(\d{1,3}(?:,\d{3})*(?:\.\d{2})?)'
bedroom_pattern = r'(\d+)bd'
bathroom_pattern = r'(\d+)ba'
size_pattern = r'(\d{1,3}(?:,\d{3})?)\s?sqft'
property_type_pattern = r'sqft-(.*)'
postal_code_pattern = r'([A-Za-z]\d[A-Za-z])\s?(\d[A-Za-z]\d)'  # Pattern for Canadian postal code

# Loop through the property info and clean the data
for property_info in all_property_data:
    # A scraped record only contains the keys whose tag was found on the card,
    # so any of 'Price', 'Property Details' or 'Address' may be missing.
    # Read them with .get() and search an empty string instead of raising
    # KeyError; the derived fields then come out as None for that record.
    details_raw = property_info.get('Property Details')
    price_text = property_info.get('Price') or ''
    details_text = details_raw or ''
    address_text = property_info.get('Address') or ''

    # Clean price by removing 'C$' and the thousands separators
    price = re.search(price_pattern, price_text)
    price_cleaned = price.group(1).replace(",", "") if price else None

    # Check if the property is a studio
    if "Studio" in details_text:
        bedrooms_cleaned = "0"  # Assign 0 to bedrooms for studios
    else:
        # Extract bedroom count
        bedrooms = re.search(bedroom_pattern, details_text)
        bedrooms_cleaned = bedrooms.group(1) if bedrooms else None

    # Extract bathroom count
    bathrooms = re.search(bathroom_pattern, details_text)
    bathrooms_cleaned = bathrooms.group(1) if bathrooms else None

    # Extract size in sqft
    # NOTE(review): unlike Price, the thousands separator is kept here
    # (e.g. '2,334') — confirm whether later steps expect it removed.
    size = re.search(size_pattern, details_text)
    size_cleaned = size.group(1) if size else None

    # Extract property type (everything after 'sqft-')
    property_type = re.search(property_type_pattern, details_text)
    property_type_cleaned = property_type.group(1).strip() if property_type else None

    # Extract postal code from the address (whole match, including the optional space)
    postal_code_match = re.search(postal_code_pattern, address_text)
    postal_code = postal_code_match.group(0) if postal_code_match else None

    # Append cleaned data to the list
    cleaned_property_info = {
        'Price': price_cleaned,
        'Property Details': details_raw,
        'Bedrooms': bedrooms_cleaned,
        'Bathrooms': bathrooms_cleaned,
        'Size': size_cleaned,
        'Property Type': property_type_cleaned,
        'Postal Code': postal_code
    }
    all_property_info_cleaned.append(cleaned_property_info)

# Create a DataFrame from the cleaned data
df_cleaned_properties = pd.DataFrame(all_property_info_cleaned)

# Display the cleaned data as a DataFrame
print(df_cleaned_properties)





       Price                              Property Details Bedrooms Bathrooms  \
0    1950000              4bds3ba2,567sqft- House for sale        4         3   
1     375000                 1bd1ba484sqft- House for sale        1         1   
2    4600000              6bds3ba2,874sqft- House for sale        6         3   
3    4600000              6bds3ba3,120sqft- House for sale        6         3   
4    1799000  3bds4ba1,607sqft- Multi-family home for sale        3         4   
..       ...                                           ...      ...       ...   
875  1999000          3bds4ba1,851sqft- Townhouse for sale        3         4   
876  3888000              3bds3ba1,825sqft- Condo for sale        3         3   
877  7999900              3bds4ba4,348sqft- Condo for sale        3         4   
878  1628000  3bds4ba1,495sqft- Multi-family home for sale        3         4   
879  4198000              5bds7ba3,558sqft- House for sale        5         7   

      Size               Pr

In [6]:
# Drop every row that has a missing value in ANY column: dropna() without
# `subset=` is not limited to the Bathrooms column.
# NOTE(review): if only rows without a bathroom count should be removed, this
# would need dropna(subset=['Bathrooms']) — confirm the intent.
df_cleaned_properties = df_cleaned_properties.dropna()

# Export the filtered DataFrame to a CSV file (without the index column)
df_cleaned_properties.to_csv("cleaned_properties2.csv", index=False)

# Display the filtered data
print(df_cleaned_properties)



       Price                              Property Details Bedrooms Bathrooms  \
0    1950000              4bds3ba2,567sqft- House for sale        4         3   
1     375000                 1bd1ba484sqft- House for sale        1         1   
2    4600000              6bds3ba2,874sqft- House for sale        6         3   
3    4600000              6bds3ba3,120sqft- House for sale        6         3   
4    1799000  3bds4ba1,607sqft- Multi-family home for sale        3         4   
..       ...                                           ...      ...       ...   
875  1999000          3bds4ba1,851sqft- Townhouse for sale        3         4   
876  3888000              3bds3ba1,825sqft- Condo for sale        3         3   
877  7999900              3bds4ba4,348sqft- Condo for sale        3         4   
878  1628000  3bds4ba1,495sqft- Multi-family home for sale        3         4   
879  4198000              5bds7ba3,558sqft- House for sale        5         7   

      Size               Pr