In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Scrape Bangalore rental listings from the MagicBricks search-result pages,
# one page at a time, until `max_records` rows are collected, a page cannot be
# fetched, or a page contains no listing cards. The rows are then written to a
# CSV file. Every field is stored as the raw page text (or None when the
# expected element is missing), except `Bathrooms`, which is parsed to an int.

# Parameters
# Search-result URL template; the placeholder receives the page number.
base_url = "https://www.magicbricks.com/property-for-rent/residential-real-estate?cityName=Bangalore&page={}"
headers = {
    'User-Agent': 'Mozilla/5.0'
}
# Seconds to wait on the server before giving up on a request. Without a
# timeout, requests.get() can block indefinitely on a stalled connection.
request_timeout = 30

# Bathroom-count patterns, compiled once rather than for every <div> visited.
bath_count_before = re.compile(r'(\d+)\s*Bath', re.IGNORECASE)  # e.g. "2 Bath"
bath_count_after = re.compile(r'Bath.*?(\d+)', re.IGNORECASE)   # e.g. "Bathroom 2"

# Storage
data = []
page = 1
max_records = 3500

while len(data) < max_records:
    print(f"Scraping page {page}...")
    url = base_url.format(page)
    try:
        response = requests.get(url, headers=headers, timeout=request_timeout)
    except requests.RequestException as e:
        # Stop paging but still fall through to the save step below, so the
        # listings gathered so far are not lost to a network failure.
        print(f"Failed to fetch page {page}. Error: {e}")
        break

    if response.status_code != 200:
        print(f"Failed to fetch page {page}. Status code: {response.status_code}")
        break

    soup = BeautifulSoup(response.content, 'lxml')
    property_cards = soup.find_all('div', class_='mb-srp__list')

    if not property_cards:
        print("No more listings found.")
        break

    for card in property_cards:
        try:
            title_tag = card.find('h2')
            title = title_tag.get_text(strip=True) if title_tag else None

            location_tag = card.find('div', class_='mb-srp__card--address')
            location = location_tag.get_text(strip=True) if location_tag else None

            price_tag = card.find('div', class_='mb-srp__card__price--amount')
            price = price_tag.get_text(strip=True) if price_tag else None

            # Area: text of the first <div> inside the summary list that
            # mentions "sqft" (case-insensitive).
            area = None
            area_container = card.find('div', class_='mb-srp__card__summary__list')
            if area_container:
                for div in area_container.find_all('div'):
                    if 'sqft' in div.text.lower():
                        area = div.text.strip()
                        break

            # Furnishing: taken from the second summary value on the card.
            # NOTE(review): this is purely positional; confirm that the second
            # value is the furnishing status on every card layout.
            furnish = None
            summary_vals = card.find_all('div', class_='mb-srp__card__summary--value')
            if len(summary_vals) > 1:
                furnish = summary_vals[1].text.strip()

            # Bathrooms: the first <div> (in document order, containers
            # included) whose text contains "Bath" and matches either pattern.
            # NOTE(review): get_text(strip=True) joins child strings without a
            # separator, so a digit from a neighbouring field may end up next
            # to "Bath"; verify the parsed counts against real cards.
            bathrooms = None
            for div in card.find_all("div"):
                text = div.get_text(strip=True)
                if "Bath" in text:
                    match = bath_count_before.search(text) or bath_count_after.search(text)
                    if match:
                        bathrooms = int(match.group(1))
                        break

            data.append({
                'Title': title,
                'Location': location,
                'Rent': price,
                'Area': area,
                'Furnishing': furnish,
                'Bathrooms': bathrooms
            })

            if len(data) >= max_records:
                break

        except Exception as e:
            # Best effort: one malformed card must not abort the whole scrape.
            print(f"Error while parsing listing: {e}")
            continue

    page += 1
    time.sleep(2)  # Be nice to the server

# Convert to DataFrame
df = pd.DataFrame(data)
print(f"Total listings collected: {len(df)}")

# Save to CSV
df.to_csv('../data/bangalore_rent_data_advanced.csv', index=False)
print("Saved to bangalore_rent_data_advanced.csv")


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 