In [8]:
import requests
from bs4 import BeautifulSoup
import json
import re
import pandas as pd
from typing import Dict, List
import time
from urllib.parse import urljoin, urlparse, parse_qs, urlunparse

class LamudiScraper:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.base_url = "https://www.lamudi.com.ph"
        self.features_set = set()
        
    def get_listing_page(self, url: str) -> BeautifulSoup:
        response = requests.get(url, headers=self.headers)
        return BeautifulSoup(response.text, 'html.parser')
    
    def generate_page_url(self, base_url: str, page: int) -> str:
        parsed_url = urlparse(base_url)
        path = parsed_url.path.rstrip('/')  # Remove trailing slash
        new_path = f"{path}/page-{page}/"
        return urlunparse((parsed_url.scheme, parsed_url.netloc, new_path, 
                         parsed_url.params, parsed_url.query, parsed_url.fragment))
    
    def extract_listings_data(self, soup: BeautifulSoup, page_num: int) -> List[Dict]:
        listings = []
        listing_cells = soup.find_all('div', class_='ListingCell-content')
        total_listings = len(listing_cells)
        
        for idx, cell in enumerate(listing_cells, 1):
            listing_data = {}
            
            # Extract basic information
            title_elem = cell.find('h3', class_='ListingCell-KeyInfo-title')
            if not title_elem:  # Skip if no title found (likely invalid listing)
                continue
                
            listing_data['title'] = title_elem.text.strip()
            print(f"\nPage {page_num} - Property {idx}/{total_listings}")
            print(f"Title: {listing_data['title'][:100]}...")
            
            listing_info = cell.find('div', class_='ListingCell-AllInfo')
            if not listing_info:
                continue
                
            listing_data['listing_id'] = listing_info.get('data-sku', '')
            
            # Extract property details from ListingCell-AllInfo
            listing_data['bedrooms'] = self.extract_number_value(listing_info.get('data-bedrooms', ''))
            listing_data['bathrooms'] = self.extract_number_value(listing_info.get('data-bathrooms', ''))
            listing_data['floor_area'] = self.extract_number_value(listing_info.get('data-building_size', ''))
            listing_data['land_area'] = self.extract_number_value(listing_info.get('data-land_size', ''))
            
            # Extract coordinates from data-geo-point
            geo_point = listing_info.get('data-geo-point', '')
            if geo_point:
                try:
                    # Convert string "[longitude,latitude]" to list
                    coords = json.loads(geo_point)
                    listing_data['longitude'] = coords[0]
                    listing_data['latitude'] = coords[1]
                except (json.JSONDecodeError, IndexError):
                    pass
            
            link_elem = cell.find('a', class_='js-listing-link')
            if link_elem:
                listing_data['url'] = urljoin(self.base_url, link_elem['href'])
            
            # Extract price
            price_elem = cell.find('span', class_='PriceSection-FirstPrice')
            listing_data['price'] = price_elem.text.strip() if price_elem else None
            
            # Extract subdivision
            listing_data['subdivision'] = listing_info.get('data-subdivisionname', '')
            
            # Extract description
            desc_elem = cell.find('div', class_='ListingCell-shortDescription')
            listing_data['description'] = desc_elem.text.strip() if desc_elem else None
            
            # Get detailed data from property page
            print("Fetching detailed property information...")
            detailed_data = self.get_property_details(listing_data['url'])
            listing_data.update(detailed_data)
            print("Features found:", len(detailed_data.get('features', {})))
            
            listings.append(listing_data)
            time.sleep(1)  # Polite delay between requests
            
        return listings
    
    def get_property_details(self, url: str) -> Dict:
        soup = self.get_listing_page(url)
        details = {}
        
        # Find all script content
        page_content = str(soup)
        
        # Try to find property details using the exact patterns
        bathrooms_match = re.search(r'\"bathrooms\":\"(\d+)\"', page_content)
        bedrooms_match = re.search(r'\"bedrooms\":\"(\d+)\"', page_content)
        building_size_match = re.search(r'\"building_size\":(\d+)', page_content)
        land_size_match = re.search(r'\"land_size\":(\d+)', page_content)
        
        # Try to find coordinates using the exact pattern
        coords_match = re.search(r'GeoCoordinates\\\",\\\"latitude\\\":(\d+\.\d+),\\\"longitude\\\":(\d+\.\d+)', page_content)
        
        if bathrooms_match:
            details['bathrooms'] = int(bathrooms_match.group(1))
        if bedrooms_match:
            details['bedrooms'] = int(bedrooms_match.group(1))
        if building_size_match:
            details['floor_area'] = int(building_size_match.group(1))
        if land_size_match:
            details['land_area'] = int(land_size_match.group(1))
            
        if coords_match:
            details['latitude'] = float(coords_match.group(1))
            details['longitude'] = float(coords_match.group(2))
        
        # Extract features (simplified version without :1)
        features = []
        feature_items = soup.find_all('div', class_='FeatureGrid_featureItem__iwtDt')
        for item in feature_items:
            feature_text = item.find('p').text.strip()
            features.append(feature_text)
            self.features_set.add(feature_text)
            
        details['features'] = features
        
        print("Details found:", {k: v for k, v in details.items() if k != 'features'})
        
        return details
    
    def extract_number_value(self, value: str) -> int:
        if isinstance(value, (int, float)):
            return value
        match = re.search(r'\d+', str(value))
        return int(match.group()) if match else None
    
    def scrape_properties(self, url: str, num_pages: int = 1) -> pd.DataFrame:
        all_listings = []
        
        for page in range(1, num_pages + 1):
            print(f"\n{'='*50}")
            print(f"Scraping page {page} of {num_pages}")
            print(f"{'='*50}")
            page_url = self.generate_page_url(url, page) if page > 1 else url
            soup = self.get_listing_page(page_url)
            listings_data = self.extract_listings_data(soup, page)
            all_listings.extend(listings_data)
            
            # Add delay between pages
            if page < num_pages:
                time.sleep(2)
        
        # Convert to DataFrame
        df = pd.DataFrame(all_listings)
        
        # Create binary columns for features while keeping the original features dictionary
        for feature in self.features_set:
            df[f'has_{feature.lower().replace(" ", "_")}'] = df['features'].apply(
                lambda x: 1 if feature in x else 0 if isinstance(x, list) else 0
            )
        
        return df

# Usage example
if __name__ == "__main__":
    scraper = LamudiScraper()
    base_url = "https://www.lamudi.com.ph/buy/metro-manila/quezon-city/house/"
    
    # Scrape 5 pages (you can change this number)
    num_pages = 50
    df = scraper.scrape_properties(base_url, num_pages)
    
    # Save to CSV
    df.to_csv('lamudi_properties.csv', index=False)


Scraping page 1 of 50

Page 1 - Property 1/30
Title: Brand New House and Lot For Sale near New Manila QC and San Juan Greenhills...
Fetching detailed property information...
Details found: {}
Features found: 22

Page 1 - Property 2/30
Title: 7.4 m Brand New SINGLE House and LOT in NEW HAVEN Subdivision Qc - Jojo Obra...
Fetching detailed property information...
Details found: {}
Features found: 13

Page 1 - Property 3/30
Title: 5.1m Brand NEW 3 Bedrooms House and Lot Pre - Selling in North Olmpus Subd -JOJO...
Fetching detailed property information...
Details found: {}
Features found: 18

Page 1 - Property 4/30
Title: North Olympus Brand NEW House and LOT For Sale Pre - Selling - Jojo Obra...
Fetching detailed property information...
Details found: {}
Features found: 22

Page 1 - Property 5/30
Title: CORNER LOT 3 Bedrooms Brand NEW House and LOT For Sale in North Olympus Subd Qc...
Fetching detailed property information...
Details found: {}
Features found: 19

Page 1 - Property 6/30
T

In [23]:
import pandas as pd

df = pd.read_csv("lamudi_properties.csv")
df.head()

print(df.columns.tolist())

['title', 'listing_id', 'bedrooms', 'bathrooms', 'floor_area', 'land_area', 'longitude', 'latitude', 'url', 'price', 'subdivision', 'description', 'features', 'has_cctv', 'has_alarm_system', 'has_parks', 'has_courtyard', 'has_multi-purpose_lawn', 'has_secure_parking', 'has_fully_fenced', 'has_fire_alarm', 'has_tennis_court', 'has_remote_garage', 'has_function_area', 'has_wi-fi', 'has_attic', 'has_lanai', 'has_ensuite', 'has_volleyball_court', 'has_terrace', 'has_storage_room', 'has_sports_facilities', 'has_air_conditioning', 'has_basketball_court', 'has_study_room', "has_driver's_room", 'has_garden', 'has_playground', "has_maid's_room", 'has_helipad', 'has_jogging_path', 'has_smoke_detector', 'has_carport', 'has_open_space', 'has_badminton_court', 'has_built-in_wardrobes', 'has_bar', 'has_open_car_spaces', 'has_landscaped_garden', 'has_entertainment_room', 'has_parking_lot', 'has_powder_room', 'has_24-hour_security', 'has_clubhouse', 'has_maids_room', 'has_garage', 'has_shower_rooms', 

In [25]:
# Create a copy of the original dataframe
df_standardized = df.copy()

# Dictionary mapping for features that need to be combined or renamed
feature_mapping = {
    # Security/Safety features
    'Security': ['has_cctv', 'has_24-hour_security'],
    'Alarm': ['has_alarm_system'],
    
    # Outdoor/Recreational features
    'Tennis court': ['has_tennis_court'],
    'Swimming pool': ['has_swimming_pool'],
    'Balcony': ['has_balcony', 'has_lanai'],  # Added has_lanai
    'Garden': ['has_garden'],
    'Terrace': ['has_terrace'],
    "Children's area": ['has_playground'],
    
    # Storage/Utility features
    'Built-in wardrobe': ['has_built-in_wardrobes'],
    'Utility room': ['has_storage_room'],
    
    # Parking features
    'Car park': ['has_carport', 'has_garage', 'has_parking_lot', 'has_open_car_spaces', 'has_remote_garage'],  # Added has_remote_garage
    
    # Climate Control
    'Air conditioning': ['has_air_conditioning'],
    
    # Internet/Technology
    'Internet': ['has_wi-fi'],

    # Feature list
    'facilities' : ['features']
}

# Create new columns based on the mapping
for new_feature, old_features in feature_mapping.items():
    # Create the new column by combining the old ones using OR operation
    df_standardized[new_feature] = df_standardized[old_features].any(axis=1).astype(int)

# Get list of all original columns that were used in the mapping
columns_to_drop = []
for old_features in feature_mapping.values():
    columns_to_drop.extend(old_features)

# Drop the original columns
df_standardized = df_standardized.drop(columns=columns_to_drop)

# Print the remaining columns to verify
print("Remaining columns:", sorted(df_standardized.columns.tolist()))


Remaining columns: ['Air conditioning', 'Alarm', 'Balcony', 'Built-in wardrobe', 'Car park', "Children's area", 'Garden', 'Internet', 'Security', 'Swimming pool', 'Tennis court', 'Terrace', 'Utility room', 'bathrooms', 'bedrooms', 'description', 'facilities', 'floor_area', 'has_attic', 'has_badminton_court', 'has_bar', 'has_basketball_court', 'has_clubhouse', 'has_courtyard', "has_driver's_room", 'has_ensuite', 'has_entertainment_room', 'has_fire_alarm', 'has_fully_fenced', 'has_function_area', 'has_helipad', 'has_jogging_path', 'has_landscaped_garden', "has_maid's_room", 'has_maids_room', 'has_multi-purpose_lawn', 'has_open_space', 'has_parks', 'has_powder_room', 'has_secure_parking', 'has_shower_rooms', 'has_smoke_detector', 'has_sports_facilities', 'has_study_room', 'has_volleyball_court', 'land_area', 'latitude', 'listing_id', 'longitude', 'price', 'subdivision', 'title', 'url']


In [32]:
# Get list of columns that start with 'has_'
columns_to_drop = [col for col in df_standardized.columns if col.startswith('has_')]

# Drop these columns
df_standardized = df_standardized.drop(columns=columns_to_drop)

# Print remaining columns to verify
print("Remaining columns in df:", sorted(df_standardized.columns.tolist()))

Remaining columns in df: ['Air conditioning', 'Alarm', 'Balcony', 'Built-in wardrobe', 'Car park', "Children's area", 'Garden', 'Internet', 'Security', 'Swimming pool', 'Tennis court', 'Terrace', 'Utility room', 'bathrooms', 'bedrooms', 'description', 'facilities', 'floor_area', 'land_area', 'latitude', 'listing_id', 'longitude', 'price', 'subdivision', 'title', 'url']


In [47]:
dot = pd.read_csv("[dec 1] dot_house.csv")
print(dot.columns.tolist())

['title', 'location', 'price', 'price_per_sqm', 'url', 'listing_id', 'bedrooms', 'bathrooms', 'floor_area', 'land_area', 'floor', 'name', 'latitude', 'longitude', 'address', 'facilities', 'Access for the disabled', 'Air conditioning', 'Alarm', 'Balcony', 'Built-in kitchen', 'Built-in wardrobe', 'Car park', "Children's area", 'Cistern', 'Concierge', 'Electricity', 'Elevator', 'Equipped kitchen', 'Garden', 'Grill', 'Guardhouse', 'Gym', 'Heating', 'Hot Tub', 'Internet', 'Library', 'Natural gas', 'Office', 'Panoramic view', 'Patio', 'Roof garden', 'Sauna', 'Security', 'Swimming pool', 'Tennis court', 'Terrace', 'Utility room', 'Video cable', 'Water', 'Fireplace', 'Cellar']


In [48]:
# List of columns to remove
columns_to_remove = [
    'Access for the disabled',
    'Built-in kitchen',
    'Cistern',
    'Concierge',
    'Electricity',
    'Elevator',
    'Equipped kitchen',
    'Grill',
    'Guardhouse',
    'Gym',
    'Heating',
    'Hot Tub',
    'Library',
    'Natural gas',
    'Office',
    'Panoramic view',
    'Patio',
    'Roof garden',
    'Sauna',
    'Video cable',
    'Water',
    'Fireplace',
    'Cellar',
    'price_per_sqm'
]

# Drop these columns from dot
dot = dot.drop(columns=columns_to_remove)

# Print remaining columns to verify
print("Remaining columns in dot:", sorted(dot.columns.tolist()))

Remaining columns in dot: ['Air conditioning', 'Alarm', 'Balcony', 'Built-in wardrobe', 'Car park', "Children's area", 'Garden', 'Internet', 'Security', 'Swimming pool', 'Tennis court', 'Terrace', 'Utility room', 'address', 'bathrooms', 'bedrooms', 'facilities', 'floor', 'floor_area', 'land_area', 'latitude', 'listing_id', 'location', 'longitude', 'name', 'price', 'title', 'url']


In [51]:
dot.shape

(1250, 28)

In [36]:
# List of source DataFrames
dataframes = [df_standardized]
df_names = ["house"]

# Collect NA counts in each DataFrame
na_counts_data = []
for df, name in zip(dataframes, df_names):
    # Replace "na" text with NaN if needed
    df.replace("na", pd.NA, inplace=True)
    
    # Count NA values in each column
    na_counts = df.isna().sum()
    
    # Append results with the source DataFrame name
    for column, count in na_counts.items():
        na_counts_data.append([name, column, count])

# Create the summary DataFrame
na_counts_df = pd.DataFrame(na_counts_data, columns=["Listing Type", "Feature", "NA Count"])

na_counts_df

Unnamed: 0,Listing Type,Feature,NA Count
0,house,title,0
1,house,listing_id,0
2,house,bedrooms,0
3,house,bathrooms,0
4,house,floor_area,0
5,house,land_area,0
6,house,longitude,0
7,house,latitude,0
8,house,url,0
9,house,price,0


In [57]:
# List of source DataFrames
dataframes = [dot]
df_names = ["house"]

# Collect NA counts in each DataFrame
na_counts_data = []
for df, name in zip(dataframes, df_names):
    # Replace "na" text with NaN if needed
    df.replace("na", pd.NA, inplace=True)
    
    # Count NA values in each column
    na_counts = df.isna().sum()
    
    # Append results with the source DataFrame name
    for column, count in na_counts.items():
        na_counts_data.append([name, column, count])

# Create the summary DataFrame
na_counts_df = pd.DataFrame(na_counts_data, columns=["Listing Type", "Feature", "NA Count"])

na_counts_df

Unnamed: 0,Listing Type,Feature,NA Count
0,house,title,1058
1,house,location,0
2,house,price,0
3,house,url,0
4,house,listing_id,0
5,house,bedrooms,64
6,house,bathrooms,105
7,house,floor_area,0
8,house,land_area,64
9,house,floor,649


In [41]:
# First, let's check the data types of these columns
print("Data types of the columns:")
print(df_standardized[['price', 'floor_area']].dtypes)

# Convert to numeric without string operations since they might already be numeric
df_standardized['price_per_sqm'] = pd.to_numeric(df_standardized['price']) / pd.to_numeric(df_standardized['floor_area'])

# Round to 2 decimal places
df_standardized['price_per_sqm'] = df_standardized['price_per_sqm'].round(2)

df_standardized.head()

Data types of the columns:
price         float64
floor_area      int64
dtype: object


Unnamed: 0,title,listing_id,bedrooms,bathrooms,floor_area,land_area,longitude,latitude,url,price,...,Garden,Terrace,Children's area,Built-in wardrobe,Utility room,Car park,Air conditioning,Internet,facilities,price_per_sqm
0,Brand New House and Lot For Sale near New Mani...,HO66EE166652EAAPH,4,4,490,150,121.043543,14.613141,https://www.lamudi.com.ph/buy/metro-manila/que...,65000000.0,...,0,0,0,1,1,1,1,0,1,132653.06
1,7.4 m Brand New SINGLE House and LOT in NEW HA...,HO677CA06D0F95FPH,3,2,80,100,121.04835,14.733322,https://www.lamudi.com.ph/buy/metro-manila/que...,7400000.0,...,0,0,1,0,0,1,0,0,1,92500.0
2,5.1m Brand NEW 3 Bedrooms House and Lot Pre - ...,HO677CA7CB26D42PH,3,2,65,58,121.04835,14.733322,https://www.lamudi.com.ph/buy/metro-manila/que...,5100000.0,...,0,0,1,0,0,1,0,0,1,78461.54
3,North Olympus Brand NEW House and LOT For Sale...,HO677CA41AA5555PH,3,2,65,158,121.04835,14.733322,https://www.lamudi.com.ph/buy/metro-manila/que...,5100000.0,...,1,0,1,0,0,1,0,0,1,78461.54
4,CORNER LOT 3 Bedrooms Brand NEW House and LOT ...,HO677CA6A7554E4PH,3,2,70,64,121.04835,14.733322,https://www.lamudi.com.ph/buy/metro-manila/que...,5500000.0,...,0,0,1,0,0,1,0,0,1,78571.43


In [56]:
# Drop rows where floor_area is null
dot = dot.dropna(subset=['floor_area'])

In [61]:
# Clean price column by removing ₱ symbol and commas, then convert to float
dot['price'] = dot['price'].str.replace('₱', '').str.replace(',', '').astype(float)

# Now create the price_per_sqm column
dot['price_per_sqm'] = dot['price'] / dot['floor_area']

# Round to 2 decimal places if desired
dot['price_per_sqm'] = dot['price_per_sqm'].round(2)

In [63]:
dot.shape

(1058, 29)

In [69]:
df_standardized.shape

(1500, 27)

In [70]:
# Vertical concatenation
combined_df = pd.concat([dot, df], axis=0, ignore_index=True)

In [72]:
combined_df.head()

Unnamed: 0,title,location,price,url,listing_id,bedrooms,bathrooms,floor_area,land_area,floor,...,Car park,Children's area,Garden,Internet,Security,Swimming pool,Tennis court,Terrace,Utility room,price_per_sqm
0,,"Quezon City, Metro Manila",105000000.0,https://www.dotproperty.com.ph/ads/5-bedroom-h...,NFM01,5.0,5.0,427.62,500 m2,2.0,...,1,0,1,0,0,0,0,0,1,245545.11
1,,"Quezon City, Metro Manila",86000000.0,https://www.dotproperty.com.ph/ads/5-bedroom-h...,0d1129d7-e3d9-4b66-9244-f77184ed6b36,5.0,5.0,500.0,441 m2,,...,1,1,0,1,0,0,0,0,0,172000.0
2,,"Quezon City, Metro Manila",7600000.0,https://www.dotproperty.com.ph/ads/4-bedroom-h...,01936167-0bde-7cc2-9513-321ba07d2da8,4.0,3.0,105.0,35 m2,,...,1,0,1,0,0,1,0,0,0,72380.95
3,,"Quezon City, Metro Manila",38000000.0,https://www.dotproperty.com.ph/ads/4-bedroom-h...,bce0e44d-c353-44be-a11a-d50e8be35eed,4.0,4.0,400.0,314 m2,,...,0,0,0,0,1,0,0,0,0,95000.0
4,,"Quezon City, Metro Manila",92000000.0,https://www.dotproperty.com.ph/ads/5-bedroom-h...,3ba5f401-31bc-4c19-b00c-e0a7b060b907,5.0,5.0,421.0,400 m2,,...,1,0,0,0,1,1,0,0,0,218527.32


In [73]:
# List of source DataFrames
dataframes = [combined_df]
df_names = ["house"]

# Collect NA counts in each DataFrame
na_counts_data = []
for df, name in zip(dataframes, df_names):
    # Replace "na" text with NaN if needed
    df.replace("na", pd.NA, inplace=True)
    
    # Count NA values in each column
    na_counts = df.isna().sum()
    
    # Append results with the source DataFrame name
    for column, count in na_counts.items():
        na_counts_data.append([name, column, count])

# Create the summary DataFrame
na_counts_df = pd.DataFrame(na_counts_data, columns=["Listing Type", "Feature", "NA Count"])

na_counts_df

Unnamed: 0,Listing Type,Feature,NA Count
0,house,title,2116
1,house,location,0
2,house,price,0
3,house,url,0
4,house,listing_id,0
5,house,bedrooms,128
6,house,bathrooms,210
7,house,floor_area,0
8,house,land_area,128
9,house,floor,1298


In [74]:
combined_df.to_csv("[jan 10] house listings.csv")