In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
import json

### House Scraping

In [2]:
@dataclass
class PropertyListing:
    """One house listing scraped from a Lamudi search-result page.

    Only ``title`` is required. Every other field defaults to ``None``; the
    scraper in this cell stores the placeholder string "N/A" when a value is
    missing from the page. ``save_to_csv`` serialises instances with
    ``asdict()``, which emits the fields in the order declared here.
    """
    # Text of the listing's title heading.
    title: str
    # Raw value of the ``data-price`` attribute (kept as a string).
    price: Optional[str] = None
    # Text of the address element of the listing cell.
    location: Optional[str] = None
    # Text of the short-description element of the listing cell.
    description: Optional[str] = None
    # Raw ``data-bedrooms`` / ``data-bathrooms`` attribute values.
    bedrooms: Optional[str] = None
    bathrooms: Optional[str] = None
    # Raw ``data-building_size`` attribute value.
    floor_area: Optional[str] = None
    # Raw ``data-land_size`` attribute value.
    land_size: Optional[str] = None
    # NOTE(review): never set by the scraper in this cell, so it stays None.
    property_type: Optional[str] = None
    # ``data-subcategories`` JSON array joined with ", ".
    subcategories: Optional[str] = None
    # BASE_URL concatenated with the listing link's ``href``.
    listing_url: Optional[str] = None
    # Raw ``data-sku`` attribute value.
    listing_id: Optional[str] = None
    # Raw ``data-geo-point`` attribute value (unparsed JSON text).
    geo_location: Optional[str] = None
    # Raw ``data-car_spaces`` attribute value.
    car_spaces: Optional[str] = None
    # Raw ``data-classification`` attribute value.
    classification: Optional[str] = None
    # Raw ``data-subdivisionname`` attribute value.
    subdivision_name: Optional[str] = None

class LamudiScraper:
    """Scrape house listings from Lamudi Philippines search-result pages.

    A single ``requests.Session`` carrying browser-like headers is reused for
    every page fetch.
    """

    BASE_URL = "https://www.lamudi.com.ph"
    # Seconds to wait for a page. Without a timeout a stalled connection
    # would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.lamudi.com.ph'
        }
        self.session.headers.update(self.headers)

    def _extract_info(self, element: "BeautifulSoup", class_name: str) -> str:
        """Return the stripped text of the first descendant with ``class_name``.

        Returns "N/A" when no such descendant is found or when the lookup
        raises ``AttributeError``.
        """
        try:
            info = element.find(class_=class_name)
            return info.get_text(strip=True) if info else "N/A"
        except AttributeError:
            return "N/A"

    @staticmethod
    def _join_json_list(raw) -> str:
        """Decode a JSON array string and join its items with ", ".

        Returns "N/A" when ``raw`` is not valid JSON or does not decode to a
        list, so one malformed attribute cannot discard the whole listing.
        """
        try:
            items = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return 'N/A'
        if not isinstance(items, list):
            return 'N/A'
        return ', '.join(str(item) for item in items)

    def _parse_data_attributes(self, listing_div) -> Dict:
        """Collect the ``data-*`` attributes of a listing element.

        Args:
            listing_div: Object exposing ``get(name, default)`` for the
                element's attributes.

        Returns:
            A dict of attribute values with "N/A" for missing ones;
            ``subcategories`` is converted from a JSON array to a
            comma-separated string. Returns ``{}`` if an unexpected error
            occurs.
        """
        try:
            data = {
                'price': listing_div.get('data-price', 'N/A'),
                'subcategories': listing_div.get('data-subcategories', '[]'),
                'bedrooms': listing_div.get('data-bedrooms', 'N/A'),
                'bathrooms': listing_div.get('data-bathrooms', 'N/A'),
                'building_size': listing_div.get('data-building_size', 'N/A'),
                'land_size': listing_div.get('data-land_size', 'N/A'),
                'sku': listing_div.get('data-sku', 'N/A'),
                'geo_point': listing_div.get('data-geo-point', '[]'),
                'car_spaces': listing_div.get('data-car_spaces', 'N/A'),
                'classification': listing_div.get('data-classification', 'N/A'),
                'subdivision_name': listing_div.get('data-subdivisionname', 'N/A')
            }
            data['subcategories'] = self._join_json_list(data['subcategories'])
            return data
        except Exception as e:
            print(f"Error parsing data attributes: {e}")
            return {}

    def scrape_listings(self, url: str) -> List["PropertyListing"]:
        """Fetch one search-result page and parse its listing cells.

        Args:
            url: Address of the search-result page.

        Returns:
            The listings parsed from the page. Cells that fail to parse are
            skipped with a printed message. An empty list is returned when
            the request fails (including a timeout) or another unexpected
            error occurs.
        """
        try:
            print(f"Scraping URL: {url}")
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            listings = []
            listing_cells = soup.find_all('div', {'class': 'ListingCell-wrapper'})

            for cell in listing_cells:
                try:
                    listing_info_div = cell.find('div', {'class': 'ListingCell-AllInfo'})
                    if not listing_info_div:
                        continue

                    data_attrs = self._parse_data_attributes(listing_info_div)

                    title_elem = cell.find('h3', {'class': 'ListingCell-KeyInfo-title'})
                    title = title_elem.get_text(strip=True) if title_elem else "N/A"

                    link_elem = cell.find('a', {'class': 'ListingCell-ListingLink'})
                    listing_url = f"{self.BASE_URL}{link_elem['href']}" if link_elem else None

                    location = self._extract_info(cell, 'ListingCell-KeyInfo-address-text')
                    description = self._extract_info(cell, 'ListingCell-shortDescription')

                    listings.append(PropertyListing(
                        title=title,
                        price=data_attrs.get('price', 'N/A'),
                        location=location,
                        description=description,
                        bedrooms=data_attrs.get('bedrooms', 'N/A'),
                        bathrooms=data_attrs.get('bathrooms', 'N/A'),
                        floor_area=data_attrs.get('building_size', 'N/A'),
                        land_size=data_attrs.get('land_size', 'N/A'),
                        listing_url=listing_url,
                        subcategories=data_attrs.get('subcategories', 'N/A'),
                        listing_id=data_attrs.get('sku', 'N/A'),
                        geo_location=data_attrs.get('geo_point', 'N/A'),
                        car_spaces=data_attrs.get('car_spaces', 'N/A'),
                        classification=data_attrs.get('classification', 'N/A'),
                        subdivision_name=data_attrs.get('subdivision_name', 'N/A')
                    ))

                except Exception as e:
                    # Best effort: one malformed cell must not abort the page.
                    print(f"Error parsing listing: {e}")
                    continue

            return listings

        except requests.RequestException as e:
            print(f"Error fetching data: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def save_to_csv(self, listings: List["PropertyListing"], filename: Optional[str] = None):
        """Write ``listings`` to a CSV file.

        Args:
            listings: Listings to save; nothing is written when empty.
            filename: Target path. Defaults to
                ``lamudi_listings_<YYYYmmdd_HHMMSS>.csv`` using the current
                local time.
        """
        if not listings:
            print("No listings to save.")
            return

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"lamudi_listings_{timestamp}.csv"

        listings_data = [asdict(listing) for listing in listings]
        df = pd.DataFrame(listings_data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Saved {len(listings)} listings to {filename}")

def main():
    """Scrape up to 100 pages of Quezon City house listings and save them.

    Paging stops at the first page that yields no listings. Everything
    collected is written to a timestamped CSV through the scraper.
    """
    url = "https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/"
    scraper = LamudiScraper()
    collected = []

    for page in range(1, 101):
        # The first page is the bare URL; later pages carry ?page=N.
        page_url = url if page == 1 else f"{url}?page={page}"
        print(f"Scraping page {page}...")

        page_listings = scraper.scrape_listings(page_url)
        if not page_listings:
            print(f"No listings found on page {page}")
            break

        collected.extend(page_listings)
        print(f"Found {len(page_listings)} listings on page {page}")

        # Random pause between page requests.
        time.sleep(random.uniform(2, 4))

    if collected:
        scraper.save_to_csv(collected)

    print(f"\nTotal listings scraped: {len(collected)}")


if __name__ == "__main__":
    main()

Scraping page 1...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/
Found 30 listings on page 1
Scraping page 2...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=2
Found 30 listings on page 2
Scraping page 3...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=3
Found 30 listings on page 3
Scraping page 4...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=4
Found 30 listings on page 4
Scraping page 5...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=5
Found 30 listings on page 5
Scraping page 6...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=6
Found 30 listings on page 6
Scraping page 7...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=7
Found 30 listings on page 7
Scraping page 8...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?pa

Scraping page 63...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=63
Found 30 listings on page 63
Scraping page 64...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=64
Found 30 listings on page 64
Scraping page 65...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=65
Found 30 listings on page 65
Scraping page 66...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=66
Found 30 listings on page 66
Scraping page 67...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=67
Found 30 listings on page 67
Scraping page 68...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=68
Found 30 listings on page 68
Scraping page 69...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/?page=69
Found 30 listings on page 69
Scraping page 70...
Scraping URL: https://www.lamudi.com.ph/buy/metro

### Apartment Scraping

In [7]:
@dataclass
class PropertyListing:
    """One apartment listing scraped from a Lamudi search-result page.

    Only ``title`` is required. Every other field defaults to ``None``; the
    scraper in this cell stores the placeholder string "N/A" when a value is
    missing from the page. ``save_to_csv`` serialises instances with
    ``asdict()``, which emits the fields in the order declared here.
    """
    # Text of the listing's title heading.
    title: str
    # Raw value of the ``data-price`` attribute (kept as a string).
    price: Optional[str] = None
    # Text of the address element of the listing cell.
    location: Optional[str] = None
    # Text of the short-description element of the listing cell.
    description: Optional[str] = None
    # Raw ``data-bedrooms`` / ``data-bathrooms`` attribute values.
    bedrooms: Optional[str] = None
    bathrooms: Optional[str] = None
    # ``data-subcategories`` JSON array joined with ", ".
    subcategories: Optional[str] = None
    # Raw ``data-rooms_total`` attribute value.
    rooms_total: Optional[str] = None
    # Raw ``data-building_size`` attribute value.
    floor_area: Optional[str] = None
    # Raw ``data-land_size`` attribute value.
    land_size: Optional[str] = None
    # BASE_URL concatenated with the listing link's ``href``.
    listing_url: Optional[str] = None
    # Raw ``data-sku`` attribute value.
    listing_id: Optional[str] = None
    # Raw ``data-geo-point`` attribute value (unparsed JSON text).
    geo_location: Optional[str] = None
    # Raw ``data-car_spaces`` attribute value.
    car_spaces: Optional[str] = None
    # Raw ``data-classification`` attribute value.
    classification: Optional[str] = None
    # Raw ``data-subdivisionname`` attribute value.
    subdivision_name: Optional[str] = None

class LamudiScraper:
    """Scrape apartment listings from Lamudi Philippines search-result pages.

    A single ``requests.Session`` carrying browser-like headers is reused for
    every page fetch.
    """

    BASE_URL = "https://www.lamudi.com.ph"
    # Seconds to wait for a page. Without a timeout a stalled connection
    # would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.lamudi.com.ph'
        }
        self.session.headers.update(self.headers)

    def _extract_info(self, element: "BeautifulSoup", class_name: str) -> str:
        """Return the stripped text of the first descendant with ``class_name``.

        Returns "N/A" when no such descendant is found or when the lookup
        raises ``AttributeError``.
        """
        try:
            info = element.find(class_=class_name)
            return info.get_text(strip=True) if info else "N/A"
        except AttributeError:
            return "N/A"

    @staticmethod
    def _join_json_list(raw) -> str:
        """Decode a JSON array string and join its items with ", ".

        Returns "N/A" when ``raw`` is not valid JSON or does not decode to a
        list, so one malformed attribute cannot discard the whole listing.
        """
        try:
            items = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return 'N/A'
        if not isinstance(items, list):
            return 'N/A'
        return ', '.join(str(item) for item in items)

    def _parse_data_attributes(self, listing_div) -> Dict:
        """Collect the ``data-*`` attributes of a listing element.

        Args:
            listing_div: Object exposing ``get(name, default)`` for the
                element's attributes.

        Returns:
            A dict of attribute values with "N/A" for missing ones;
            ``subcategories`` is converted from a JSON array to a
            comma-separated string. Returns ``{}`` if an unexpected error
            occurs.
        """
        try:
            data = {
                'price': listing_div.get('data-price', 'N/A'),
                'bedrooms': listing_div.get('data-bedrooms', 'N/A'),
                'bathrooms': listing_div.get('data-bathrooms', 'N/A'),
                'rooms_total': listing_div.get('data-rooms_total', 'N/A'),
                'building_size': listing_div.get('data-building_size', 'N/A'),
                'land_size': listing_div.get('data-land_size', 'N/A'),
                'sku': listing_div.get('data-sku', 'N/A'),
                'geo_point': listing_div.get('data-geo-point', '[]'),
                'car_spaces': listing_div.get('data-car_spaces', 'N/A'),
                'subcategories': listing_div.get('data-subcategories', '[]'),
                'classification': listing_div.get('data-classification', 'N/A'),
                'subdivision_name': listing_div.get('data-subdivisionname', 'N/A')
            }
            data['subcategories'] = self._join_json_list(data['subcategories'])
            return data
        except Exception as e:
            print(f"Error parsing data attributes: {e}")
            return {}

    def scrape_listings(self, url: str) -> List["PropertyListing"]:
        """Fetch one search-result page and parse its listing cells.

        Args:
            url: Address of the search-result page.

        Returns:
            The listings parsed from the page. Cells that fail to parse are
            skipped with a printed message. An empty list is returned when
            the request fails (including a timeout) or another unexpected
            error occurs.
        """
        try:
            print(f"Scraping URL: {url}")
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            listings = []
            listing_cells = soup.find_all('div', {'class': 'ListingCell-wrapper'})

            for cell in listing_cells:
                try:
                    listing_info_div = cell.find('div', {'class': 'ListingCell-AllInfo'})
                    if not listing_info_div:
                        continue

                    data_attrs = self._parse_data_attributes(listing_info_div)

                    title_elem = cell.find('h3', {'class': 'ListingCell-KeyInfo-title'})
                    title = title_elem.get_text(strip=True) if title_elem else "N/A"

                    link_elem = cell.find('a', {'class': 'ListingCell-ListingLink'})
                    listing_url = f"{self.BASE_URL}{link_elem['href']}" if link_elem else None

                    location = self._extract_info(cell, 'ListingCell-KeyInfo-address-text')
                    description = self._extract_info(cell, 'ListingCell-shortDescription')

                    listings.append(PropertyListing(
                        title=title,
                        price=data_attrs.get('price', 'N/A'),
                        location=location,
                        description=description,
                        bedrooms=data_attrs.get('bedrooms', 'N/A'),
                        bathrooms=data_attrs.get('bathrooms', 'N/A'),
                        rooms_total=data_attrs.get('rooms_total', 'N/A'),
                        floor_area=data_attrs.get('building_size', 'N/A'),
                        land_size=data_attrs.get('land_size', 'N/A'),
                        listing_url=listing_url,
                        listing_id=data_attrs.get('sku', 'N/A'),
                        subcategories=data_attrs.get('subcategories', 'N/A'),
                        geo_location=data_attrs.get('geo_point', 'N/A'),
                        car_spaces=data_attrs.get('car_spaces', 'N/A'),
                        classification=data_attrs.get('classification', 'N/A'),
                        subdivision_name=data_attrs.get('subdivision_name', 'N/A')
                    ))

                except Exception as e:
                    # Best effort: one malformed cell must not abort the page.
                    print(f"Error parsing listing: {e}")
                    continue

            return listings

        except requests.RequestException as e:
            print(f"Error fetching data: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def save_to_csv(self, listings: List["PropertyListing"], filename: Optional[str] = None):
        """Write ``listings`` to a CSV file.

        Args:
            listings: Listings to save; nothing is written when empty.
            filename: Target path. Defaults to
                ``lamudi_listings_<YYYYmmdd_HHMMSS>.csv`` using the current
                local time.
        """
        if not listings:
            print("No listings to save.")
            return

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"lamudi_listings_{timestamp}.csv"

        listings_data = [asdict(listing) for listing in listings]
        df = pd.DataFrame(listings_data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Saved {len(listings)} listings to {filename}")

def main():
    """Scrape up to 3 pages of Quezon City apartment listings and save them.

    Paging stops at the first page that yields no listings. Everything
    collected is written to a timestamped CSV through the scraper.
    """
    url = "https://www.lamudi.com.ph/buy/metro-manila/quezon-city/apartment/"
    scraper = LamudiScraper()
    collected = []

    for page in range(1, 4):
        # The first page is the bare URL; later pages carry ?page=N.
        page_url = url if page == 1 else f"{url}?page={page}"
        print(f"Scraping page {page}...")

        page_listings = scraper.scrape_listings(page_url)
        if not page_listings:
            print(f"No listings found on page {page}")
            break

        collected.extend(page_listings)
        print(f"Found {len(page_listings)} listings on page {page}")

        # Random pause between page requests.
        time.sleep(random.uniform(2, 4))

    if collected:
        scraper.save_to_csv(collected)

    print(f"\nTotal listings scraped: {len(collected)}")


if __name__ == "__main__":
    main()

Scraping page 1...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/apartment/
Found 30 listings on page 1
Scraping page 2...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/apartment/?page=2
Found 30 listings on page 2
Scraping page 3...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/apartment/?page=3
Found 12 listings on page 3
Saved 72 listings to lamudi_listings_20241120_064959.csv

Total listings scraped: 72


### Condo Scraping

In [4]:
@dataclass
class PropertyListing:
    """One condominium listing scraped from a Lamudi search-result page.

    Only ``title`` is required. Every other field defaults to ``None``; the
    scraper in this cell stores the placeholder string "N/A" when a value is
    missing from the page. ``save_to_csv`` serialises instances with
    ``asdict()``, which emits the fields in the order declared here.
    """
    # ``title`` attribute of the listing link.
    title: str
    # Raw value of the ``data-price`` attribute (kept as a string).
    price: Optional[str] = None
    # Text of the address element, else the part of the title after " in ".
    location: Optional[str] = None
    # Text of the short-description element of the listing cell.
    description: Optional[str] = None
    # Raw ``data-bedrooms`` / ``data-bathrooms`` attribute values.
    bedrooms: Optional[str] = None
    bathrooms: Optional[str] = None
    # NOTE(review): never set by the scraper in this cell, so it stays None.
    rooms_total: Optional[str] = None
    # Raw ``data-building_size`` attribute value.
    floor_area: Optional[str] = None
    # NOTE(review): never set by the scraper in this cell, so it stays None.
    land_size: Optional[str] = None
    # BASE_URL concatenated with the listing link's ``href``.
    listing_url: Optional[str] = None
    # Raw ``data-sku`` attribute value.
    listing_id: Optional[str] = None
    # Raw ``data-geo-point`` attribute value (unparsed JSON text).
    geo_location: Optional[str] = None
    # NOTE(review): never set by the scraper in this cell, so it stays None.
    car_spaces: Optional[str] = None
    # Raw ``data-classification`` attribute value.
    classification: Optional[str] = None
    # Raw ``data-condominiumname`` attribute value.
    condominium_name: Optional[str] = None 

class LamudiScraper:
    """Scrape condominium listings from Lamudi Philippines search-result pages.

    A single ``requests.Session`` carrying browser-like headers is reused for
    every page fetch.
    """

    BASE_URL = "https://www.lamudi.com.ph"
    # Seconds to wait for a page. Without a timeout a stalled connection
    # would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.lamudi.com.ph'
        }
        self.session.headers.update(self.headers)

    def _extract_info(self, element: "BeautifulSoup", class_name: str) -> str:
        """Return the stripped text of the first descendant with ``class_name``.

        Returns "N/A" when no such descendant is found or when the lookup
        raises ``AttributeError``.
        """
        try:
            info = element.find(class_=class_name)
            return info.get_text(strip=True) if info else "N/A"
        except AttributeError:
            return "N/A"

    @staticmethod
    def _join_json_list(raw) -> str:
        """Decode a JSON array string and join its items with ", ".

        Returns "N/A" when ``raw`` is not valid JSON or does not decode to a
        list, so one malformed attribute cannot discard the whole listing.
        """
        try:
            items = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return 'N/A'
        if not isinstance(items, list):
            return 'N/A'
        return ', '.join(str(item) for item in items)

    def _parse_data_attributes(self, listing_div) -> Dict:
        """Collect the ``data-*`` attributes of a listing element.

        Args:
            listing_div: Object exposing ``get(name, default)`` for the
                element's attributes.

        Returns:
            A dict of attribute values with "N/A" for missing ones;
            ``subcategories`` is converted from a JSON array to a
            comma-separated string. Returns ``{}`` if an unexpected error
            occurs.
        """
        try:
            data = {
                'price': listing_div.get('data-price', 'N/A'),
                'category': listing_div.get('data-category', 'N/A'),
                'subcategories': listing_div.get('data-subcategories', '[]'),
                'bedrooms': listing_div.get('data-bedrooms', 'N/A'),
                'bathrooms': listing_div.get('data-bathrooms', 'N/A'),
                'building_size': listing_div.get('data-building_size', 'N/A'),
                'sku': listing_div.get('data-sku', 'N/A'),
                'geo_point': listing_div.get('data-geo-point', '[]'),
                'furnished': listing_div.get('data-furnished', 'N/A'),
                'classification': listing_div.get('data-classification', 'N/A'),
                'condominium_name': listing_div.get('data-condominiumname', 'N/A')
            }
            data['subcategories'] = self._join_json_list(data['subcategories'])
            return data
        except Exception as e:
            print(f"Error parsing data attributes: {e}")
            return {}

    def scrape_listings(self, url: str) -> List["PropertyListing"]:
        """Fetch one search-result page and parse its listing cells.

        Args:
            url: Address of the search-result page.

        Returns:
            The listings parsed from the page. Cells that fail to parse are
            skipped with a printed message. An empty list is returned when
            the request fails (including a timeout) or another unexpected
            error occurs.
        """
        try:
            print(f"Scraping URL: {url}")
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            listings = []
            listing_cells = soup.find_all('div', {'class': 'ListingCell-wrapper'})

            for cell in listing_cells:
                try:
                    listing_info_div = cell.find('div', {'class': 'ListingUnit'})
                    if not listing_info_div:
                        continue

                    data_attrs = self._parse_data_attributes(listing_info_div)

                    # The listing link carries both the title and the URL.
                    link_elem = cell.find('a', {'class': 'ListingCell-ListingLink'})
                    title = link_elem.get('title', 'N/A') if link_elem else 'N/A'
                    listing_url = f"{self.BASE_URL}{link_elem['href']}" if link_elem else None

                    # Prefer the dedicated address element; otherwise fall
                    # back to whatever follows the last " in " of the title.
                    location = self._extract_info(cell, 'ListingCell-KeyInfo-address-text')
                    if location == 'N/A' and title != 'N/A':
                        location = title.split(' in ')[-1] if ' in ' in title else 'N/A'

                    description = self._extract_info(cell, 'ListingCell-shortDescription')

                    listings.append(PropertyListing(
                        title=title,
                        price=data_attrs.get('price', 'N/A'),
                        location=location,
                        description=description,
                        bedrooms=data_attrs.get('bedrooms', 'N/A'),
                        bathrooms=data_attrs.get('bathrooms', 'N/A'),
                        floor_area=data_attrs.get('building_size', 'N/A'),
                        listing_url=listing_url,
                        listing_id=data_attrs.get('sku', 'N/A'),
                        geo_location=data_attrs.get('geo_point', 'N/A'),
                        classification=data_attrs.get('classification', 'N/A'),
                        condominium_name=data_attrs.get('condominium_name', 'N/A')
                    ))

                except Exception as e:
                    # Best effort: one malformed cell must not abort the page.
                    print(f"Error parsing listing: {e}")
                    continue

            return listings

        except requests.RequestException as e:
            print(f"Error fetching data: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def save_to_csv(self, listings: List["PropertyListing"], filename: Optional[str] = None):
        """Write ``listings`` to a CSV file.

        Args:
            listings: Listings to save; nothing is written when empty.
            filename: Target path. Defaults to
                ``lamudi_listings_<YYYYmmdd_HHMMSS>.csv`` using the current
                local time.
        """
        if not listings:
            print("No listings to save.")
            return

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"lamudi_listings_{timestamp}.csv"

        listings_data = [asdict(listing) for listing in listings]
        df = pd.DataFrame(listings_data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Saved {len(listings)} listings to {filename}")

def main():
    """Scrape up to 100 pages of Quezon City condo listings and save them.

    Paging stops at the first page that yields no listings. Everything
    collected is written to a timestamped CSV through the scraper.
    """
    url = "https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/"
    scraper = LamudiScraper()
    collected = []

    for page in range(1, 101):
        # The first page is the bare URL; later pages carry ?page=N.
        page_url = url if page == 1 else f"{url}?page={page}"
        print(f"Scraping page {page}...")

        page_listings = scraper.scrape_listings(page_url)
        if not page_listings:
            print(f"No listings found on page {page}")
            break

        collected.extend(page_listings)
        print(f"Found {len(page_listings)} listings on page {page}")

        # Random pause between page requests.
        time.sleep(random.uniform(2, 4))

    if collected:
        scraper.save_to_csv(collected)

    print(f"\nTotal listings scraped: {len(collected)}")


if __name__ == "__main__":
    main()

Scraping page 1...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/
Found 30 listings on page 1
Scraping page 2...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=2
Found 30 listings on page 2
Scraping page 3...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=3
Found 30 listings on page 3
Scraping page 4...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=4
Found 30 listings on page 4
Scraping page 5...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=5
Found 30 listings on page 5
Scraping page 6...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=6
Found 30 listings on page 6
Scraping page 7...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=7
Found 30 listings on page 7
Scraping page 8...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?pa

Scraping page 63...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=63
Found 30 listings on page 63
Scraping page 64...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=64
Found 30 listings on page 64
Scraping page 65...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=65
Found 30 listings on page 65
Scraping page 66...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=66
Found 30 listings on page 66
Scraping page 67...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=67
Found 30 listings on page 67
Scraping page 68...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=68
Found 30 listings on page 68
Scraping page 69...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/condo/?page=69
Found 30 listings on page 69
Scraping page 70...
Scraping URL: https://www.lamudi.com.ph/buy/metro

### Commercial Scraping

In [5]:
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional
import requests
from bs4 import BeautifulSoup
import json
import pandas as pd
from datetime import datetime
import time
import random

@dataclass
class PropertyListing:
    """One commercial listing scraped from a Lamudi search-result page.

    Only ``title`` is required. Every other field defaults to ``None``; the
    scraper in this cell stores the placeholder string "N/A" when a text
    value is missing from the page. ``save_to_csv`` serialises instances with
    ``asdict()``, which emits the fields in the order declared here.
    """
    # ``alt`` text of the first image found in the listing cell.
    title: str
    # Raw value of the ``data-price`` attribute (kept as a string).
    price: Optional[str] = None
    # Text of the address element of the listing cell.
    location: Optional[str] = None
    # Text of the short-description element of the listing cell.
    description: Optional[str] = None
    # Raw ``data-bedrooms`` / ``data-bathrooms`` attribute values.
    bedrooms: Optional[str] = None
    bathrooms: Optional[str] = None
    # Raw ``data-building_size`` attribute value.
    floor_area: Optional[str] = None
    # Raw ``data-category`` attribute value.
    property_type: Optional[str] = None
    # ``data-subcategories`` JSON array joined with ", ".
    subcategories: Optional[str] = None
    # BASE_URL concatenated with the listing link's ``href``.
    listing_url: Optional[str] = None
    # Raw ``data-sku`` attribute value.
    listing_id: Optional[str] = None
    # First two items of the ``data-geo-point`` JSON array, joined with ", ".
    geo_location: Optional[str] = None
    # Raw ``data-furnished`` attribute value.
    furnished_status: Optional[str] = None
    # Raw ``data-condominiumname`` attribute value.
    condominium_name: Optional[str] = None
    # True when ``data-listing-new-development`` equals "true" (any case).
    is_new_development: Optional[bool] = None

class LamudiScraper:
    """Scrape commercial listings from Lamudi Philippines search-result pages.

    A single ``requests.Session`` carrying browser-like headers is reused for
    every page fetch.
    """

    BASE_URL = "https://www.lamudi.com.ph"
    # Seconds to wait for a page. Without a timeout a stalled connection
    # would block the whole scrape indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session and install the default request headers."""
        self.session = requests.Session()
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Referer': 'https://www.lamudi.com.ph'
        }
        self.session.headers.update(self.headers)

    def _extract_info(self, element: "BeautifulSoup", class_name: str) -> str:
        """Return the stripped text of the first descendant with ``class_name``.

        Returns "N/A" when no such descendant is found or when the lookup
        raises ``AttributeError``.
        """
        try:
            info = element.find(class_=class_name)
            return info.get_text(strip=True) if info else "N/A"
        except AttributeError:
            return "N/A"

    @staticmethod
    def _join_json_list(raw) -> str:
        """Decode a JSON array string and join its items with ", ".

        Returns "N/A" when ``raw`` is not valid JSON or does not decode to a
        list, so one malformed attribute cannot discard the whole listing.
        """
        try:
            items = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return 'N/A'
        if not isinstance(items, list):
            return 'N/A'
        return ', '.join(str(item) for item in items)

    @staticmethod
    def _format_geo_point(raw) -> str:
        """Format a JSON two-item array as "first, second".

        Returns "N/A" when ``raw`` is not valid JSON or holds fewer than two
        indexable items. The attribute defaults to "[]", so a listing without
        coordinates must not raise here.
        """
        try:
            point = json.loads(raw)
            return f"{point[0]}, {point[1]}"
        except (json.JSONDecodeError, TypeError, IndexError, KeyError):
            return 'N/A'

    def _parse_data_attributes(self, listing_div) -> Dict:
        """Collect the ``data-*`` attributes of a listing element.

        Args:
            listing_div: Object exposing ``get(name, default)`` for the
                element's attributes.

        Returns:
            A dict of attribute values with "N/A" for missing text values.
            ``subcategories`` becomes a comma-separated string,
            ``geo_point`` becomes "first, second", and
            ``is_new_development`` becomes a bool. Returns ``{}`` if an
            unexpected error occurs.
        """
        try:
            data = {
                'price': listing_div.get('data-price', 'N/A'),
                'category': listing_div.get('data-category', 'N/A'),
                'subcategories': listing_div.get('data-subcategories', '[]'),
                'classification': listing_div.get('data-classification', 'N/A'),
                'furnished': listing_div.get('data-furnished', 'N/A'),
                'bedrooms': listing_div.get('data-bedrooms', 'N/A'),
                'bathrooms': listing_div.get('data-bathrooms', 'N/A'),
                'building_size': listing_div.get('data-building_size', 'N/A'),
                'condominiumname': listing_div.get('data-condominiumname', 'N/A'),
                'sku': listing_div.get('data-sku', 'N/A'),
                'geo_point': listing_div.get('data-geo-point', '[]'),
                'is_new_development': listing_div.get('data-listing-new-development', 'false')
            }

            data['subcategories'] = self._join_json_list(data['subcategories'])
            data['geo_point'] = self._format_geo_point(data['geo_point'])

            # The attribute is textual; only "true" (any case) counts as True.
            data['is_new_development'] = data['is_new_development'].lower() == 'true'

            return data
        except Exception as e:
            print(f"Error parsing data attributes: {e}")
            return {}

    def scrape_listings(self, url: str) -> List["PropertyListing"]:
        """Fetch one search-result page and parse its listing cells.

        Args:
            url: Address of the search-result page.

        Returns:
            The listings parsed from the page. Cells that fail to parse are
            skipped with a printed message. An empty list is returned when
            the request fails (including a timeout) or another unexpected
            error occurs.
        """
        try:
            print(f"Scraping URL: {url}")
            response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            listings = []
            listing_cells = soup.find_all('div', {'class': 'ListingCell-wrapper'})

            for cell in listing_cells:
                try:
                    listing_info_div = cell.find('div', {'class': 'ListingCell-AllInfo'})
                    if not listing_info_div:
                        continue

                    data_attrs = self._parse_data_attributes(listing_info_div)

                    # The title is taken from the alt text of the cell's first image.
                    img_elem = cell.find('img')
                    title = img_elem.get('alt', 'N/A') if img_elem else "N/A"

                    link_elem = cell.find('a', {'class': 'ListingCell-ListingLink'})
                    listing_url = f"{self.BASE_URL}{link_elem['href']}" if link_elem else None

                    location = self._extract_info(cell, 'ListingCell-KeyInfo-address-text')
                    description = self._extract_info(cell, 'ListingCell-shortDescription')

                    listings.append(PropertyListing(
                        title=title,
                        price=data_attrs.get('price', 'N/A'),
                        location=location,
                        description=description,
                        bedrooms=data_attrs.get('bedrooms', 'N/A'),
                        bathrooms=data_attrs.get('bathrooms', 'N/A'),
                        floor_area=data_attrs.get('building_size', 'N/A'),
                        property_type=data_attrs.get('category', 'N/A'),
                        subcategories=data_attrs.get('subcategories', 'N/A'),
                        listing_url=listing_url,
                        listing_id=data_attrs.get('sku', 'N/A'),
                        geo_location=data_attrs.get('geo_point', 'N/A'),
                        furnished_status=data_attrs.get('furnished', 'N/A'),
                        condominium_name=data_attrs.get('condominiumname', 'N/A'),
                        is_new_development=data_attrs.get('is_new_development', False)
                    ))

                except Exception as e:
                    # Best effort: one malformed cell must not abort the page.
                    print(f"Error parsing listing: {e}")
                    continue

            return listings

        except requests.RequestException as e:
            print(f"Error fetching data: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def save_to_csv(self, listings: List["PropertyListing"], filename: Optional[str] = None):
        """Write ``listings`` to a CSV file.

        Args:
            listings: Listings to save; nothing is written when empty.
            filename: Target path. Defaults to
                ``lamudi_listings_<YYYYmmdd_HHMMSS>.csv`` using the current
                local time.
        """
        if not listings:
            print("No listings to save.")
            return

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"lamudi_listings_{timestamp}.csv"

        listings_data = [asdict(listing) for listing in listings]
        df = pd.DataFrame(listings_data)
        df.to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Saved {len(listings)} listings to {filename}")

def main():
    """Scrape up to 49 pages of Quezon City commercial listings and save them.

    Paging stops at the first page that yields no listings. Everything
    collected is written to a timestamped CSV through the scraper.
    """
    url = "https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/"
    scraper = LamudiScraper()
    collected = []

    for page in range(1, 50):
        # The first page is the bare URL; later pages carry ?page=N.
        page_url = url if page == 1 else f"{url}?page={page}"
        print(f"Scraping page {page}...")

        page_listings = scraper.scrape_listings(page_url)
        if not page_listings:
            print(f"No listings found on page {page}")
            break

        collected.extend(page_listings)
        print(f"Found {len(page_listings)} listings on page {page}")

        # Random pause so consecutive requests are not sent back to back.
        time.sleep(random.uniform(2, 4))

    if collected:
        scraper.save_to_csv(collected)

    print(f"\nTotal listings scraped: {len(collected)}")


if __name__ == "__main__":
    main()

Scraping page 1...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/
Found 30 listings on page 1
Scraping page 2...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/?page=2
Found 30 listings on page 2
Scraping page 3...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/?page=3
Found 30 listings on page 3
Scraping page 4...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/?page=4
Found 30 listings on page 4
Scraping page 5...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/?page=5
Found 30 listings on page 5
Scraping page 6...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/?page=6
Found 30 listings on page 6
Scraping page 7...
Scraping URL: https://www.lamudi.com.ph/buy/metro-manila/quezon-city/commercial/?page=7
Found 30 listings on page 7
Scraping page 8...
Scraping URL: https://www.lamudi.com.ph/buy