# Data Extraction from PropertyPro Website

In [158]:
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

In [159]:
# Request headers sent with every page fetch; the User-Agent string
# identifies the client as a desktop Chrome browser.
HEADER = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/141.0.0.0 Safari/537.36"
    )
}

def make_connection(url, timeout=30):
    """
    Download ``url`` over HTTP and parse the response body as HTML.

    Parameters:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Defaults to 30.

    Returns:
        A BeautifulSoup object built with the "html.parser" backend when the
        server answers with status code 200. ``None`` when the request fails
        or any other status code comes back; a message is printed in both
        failure cases, so callers must check for ``None`` before using the
        result.
    """
    try:
        # Without an explicit timeout requests waits indefinitely on a
        # stalled server, which would freeze the whole notebook.
        response = requests.get(url, headers=HEADER, timeout=timeout)
    except requests.RequestException as e:
        # Only network/HTTP-level failures are handled here; programming
        # errors (e.g. an undefined name) are allowed to surface.
        print(f"An Error occurs. Message: {e}")
        return None

    # check the response status code
    if response.status_code != 200:
        print(f"Resources Not Available! Status Code {response.status_code}")
        return None

    return BeautifulSoup(response.content, "html.parser")


def _find_child(parent, name, **attrs):
    """Run ``parent.find(name, **attrs)``; return None when ``parent`` is None."""
    if parent is None:
        return None
    return parent.find(name, **attrs)


def _text_or_na(element):
    """Return the element's text, or "N/A" when the element was not found."""
    return element.get_text() if element is not None else "N/A"


def scrape_single_page_property(property_content_divs):
    """
    Extract the listing attributes from every property card on one page.

    Parameters:
        property_content_divs: iterable of elements, one per listing card.
            Each element must offer ``find``; the elements found inside it
            must offer ``find``, ``get_text`` and ``get``. ``None`` or an
            empty iterable produces an empty list.

    Returns:
        A list with one dictionary per card, holding the keys "title",
        "price", "address", "amenities", "type", "agent", "link" and "date".
        Any attribute whose element (or one of its parent containers) is
        missing from the card is reported as "N/A" instead of raising.
    """
    single_page_property = []

    for property_content_div in property_content_divs or []:
        # Look the two shared containers up once; either may be absent.
        title_block = property_content_div.find("div", class_="pl-title")
        price_block = property_content_div.find("div", class_="pl-price")

        # The same anchor supplies both the property type (its text) and the
        # listing link (its href).
        type_anchor = _find_child(_find_child(title_block, "h6"), "a")
        if type_anchor is not None:
            property_link = type_anchor.get("href", "N/A")
        else:
            property_link = "N/A"

        agent_block = property_content_div.find("div", class_="flex-grow-1 ms-2")
        date_block = property_content_div.find("p", class_="date-added")

        single_page_property.append({
            "title": _text_or_na(_find_child(title_block, "h3")),
            "price": _text_or_na(_find_child(price_block, "h3")),
            "address": _text_or_na(_find_child(title_block, "p")),
            "amenities": _text_or_na(_find_child(price_block, "h6")),
            "type": _text_or_na(type_anchor),
            "agent": _text_or_na(agent_block),
            "link": property_link,
            "date": _text_or_na(date_block),
        })

    return single_page_property

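**Attributes** — the BeautifulSoup lookups applied to each listing card (`property_content_div`, one `property-listing-content` div per property):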
- **Property Title**: `property_content_div.find('div', class_="pl-title").find("h3").get_text()`
- **Property Price**: `property_content_div.find('div', class_="pl-price").find("h3").get_text()`
- **Property Address**: `property_content_div.find('div', class_="pl-title").find("p").get_text()`
- **Property Amenities**: `property_content_div.find('div', class_="pl-price").find("h6").get_text()`
- **Property Type**: `property_content_div.find('div', class_="pl-title").find("h6").find('a').get_text()`
- **Property Agent**: `property_content_div.find("div", class_="flex-grow-1 ms-2").get_text()`
- **Property Link**: `property_content_div.find('div', class_="pl-title").find("h6").find('a').get("href")`
- **Property Date**: `property_content_div.find("p", class_="date-added").get_text()`

In [160]:
# Original note: the full listing spans 230 pages; only pages 1-2 are scraped here.
FIRST_PAGE = 1
LAST_PAGE = 2
REQUEST_DELAY_SECONDS = 5  # pause between requests so the site is not hammered

# maps "page <n>" -> list of property dictionaries scraped from that page
all_property = {}

for page_num in range(FIRST_PAGE, LAST_PAGE + 1):

    # website URL
    URL = f"https://propertypro.ng/property-for-sale?page={page_num}"

    print(f"Scraping property data from page {page_num}")

    # make a connection
    soup_content = make_connection(URL)

    if soup_content is None:
        # make_connection returns None (after printing the reason) when the
        # page could not be fetched; skip it instead of crashing on find_all.
        print(f"Skipping page {page_num}: no content retrieved")
    else:
        property_content_divs = soup_content.find_all("div", class_="property-listing-content")

        single_page_property = scrape_single_page_property(property_content_divs)
        print(f"Property Scraped successfully: {len(single_page_property)} Properties Retrieved")

        all_property[f"page {page_num}"] = single_page_property

    # sleep between pages only; there is nothing to wait for after the last one
    if page_num < LAST_PAGE:
        time.sleep(REQUEST_DELAY_SECONDS)


Scraping property data from page 1
Property Scraped successfully: 22 Properties Retrieved
Scraping property data from page 2
Property Scraped successfully: 22 Properties Retrieved


In [193]:
# Start with a frame that has every expected column and zero rows
house_df = pd.DataFrame(
    {
        column: []
        for column in (
            "title", "price", "address", "amenities",
            "type", "agent", "link", "date",
        )
    }
)


In [194]:
# Combine every scraped page into one dataframe in a single step.
# The frame is rebuilt from `all_property` rather than appended to the
# existing `house_df`, so re-running this cell does not duplicate rows, and
# no dataframe is grown with repeated `pd.concat` calls inside a loop.
house_columns = ["title", "price", "address", "amenities",
                 "type", "agent", "link", "date"]

house_records = [
    record
    for page_records in all_property.values()
    for record in page_records
]

# `columns` fixes the column order and keeps the columns present even when
# nothing was scraped.
house_df = pd.DataFrame(house_records, columns=house_columns)
    

In [196]:
# Write the raw scraped listings to disk as CSV, leaving out the row index
house_df.to_csv(
    path_or_buf="raw_housing_property.csv",
    index=False,
)