In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np
import re

In [None]:

# --- Scraper configuration ---------------------------------------------------
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
session = requests.Session()
session.headers.update(headers)

CITIES = ('pune', 'mumbai')   # fetched in this order for every page number
LISTING_URL = "https://www.squareyards.com/rent/apartments-for-rent-in-{city}?page={page}"
MAX_PROPERTIES = 700          # stop once at least this many cards were collected
MAX_PAGES = 50                # highest page number requested per city
REQUEST_TIMEOUT = 30          # seconds; requests waits indefinitely without a timeout
PAGE_DELAY = 2                # seconds to pause between page numbers

# One entry per listing card; the six lists stay index-aligned.
property_names = []
addresses = []
listing_prices = []
areas = []
furnish_status = []
descriptions = []


def _text_or_none(element):
    """Return the stripped text of a parsed element, or None when the element is missing."""
    return element.text.strip() if element else None


def _fetch_listing_cards(city, page_number):
    """Download one result page for `city` and return its listing-card elements.

    A failed request (timeout, connection error, HTTP error status) is reported
    and yields an empty list, so one bad page does not abort the whole run.
    """
    url = LISTING_URL.format(city=city, page=page_number)
    try:
        response = session.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Skipping {city} page {page_number}: {exc}")
        return []
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup.find_all('article', class_='listing-card')


def _furnish_text(card):
    """Return the text of the first <span> in the first <li> of the card's
    'listing-information' list, or 'N/A' when any of those elements is missing."""
    info_list = card.find('ul', class_='listing-information')
    first_item = info_list.find('li') if info_list else None
    first_span = first_item.find('span') if first_item else None
    return first_span.text.strip() if first_span else 'N/A'


page = 1
total_properties = 0

while total_properties < MAX_PROPERTIES and page <= MAX_PAGES:
    property_cards = []
    for city in CITIES:
        property_cards.extend(_fetch_listing_cards(city, page))

    # No cards from any city for this page number: nothing more to collect.
    if not property_cards:
        break

    for card in property_cards:
        property_names.append(_text_or_none(card.find('span', attrs={'class': 'project-name'})))

        # Address text comes from the card heading.
        addresses.append(_text_or_none(card.find('h2', class_='heading')))

        # Price is the <strong> inside the listing-price paragraph.
        price_elem = card.find('p', class_='listing-price')
        listing_prices.append(_text_or_none(price_elem.find('strong') if price_elem else None))

        # Area from the "unit-value avail-area" span.
        areas.append(_text_or_none(card.find('span', class_='unit-value avail-area')))

        furnish_status.append(_furnish_text(card))

        # Description is the first paragraph inside the description div.
        desc_elem = card.find('div', class_='description')
        descriptions.append(_text_or_none(desc_elem.find('p') if desc_elem else None))

    total_properties += len(property_cards)
    page += 1
    time.sleep(PAGE_DELAY)



In [None]:
# Build the listings table from the six scraped, index-aligned lists.
data = dict(
    PropertyName=property_names,
    Address=addresses,
    ListingPrice=listing_prices,
    Area=areas,
    FurnishStatus=furnish_status,
    Description=descriptions,
)

# Rows that are exact duplicates across all columns are removed before counting and saving.
df = pd.DataFrame(data).drop_duplicates()

output_csv = 'squareyards_properties.csv'
print(f"Total properties scraped: {len(df)}")
df.to_csv(output_csv, index=False)
print("Data saved to squareyards_properties.csv")


In [None]:
# Load the listings from the CSV file into `df`; the bare `df` on the last line
# renders the table as the cell output.
scraped_csv = "squareyards_properties.csv"
df = pd.read_csv(scraped_csv)
df

In [None]:
# BHK: the first run of digits found in the address text, kept as a string.
# NOTE(review): assumes the address leads with the BHK count (e.g. "2 BHK ...") - confirm against the data.
# Series.str.extract leaves missing addresses as NaN, whereas calling re.findall
# on a NaN (float) address raises TypeError.
regex = r'\d+'
df["BHK"] = df["Address"].str.extract(f'({regex})', expand=False)
df.head(10)

In [None]:
# Extract the locality: the letters/spaces that follow the standalone word 'in',
# up to the first comma or the end of the text.
# The leading \b stops the 'in' at the end of a longer word (e.g. "Main Road in Baner")
# from being taken as the preposition, which would yield "Road in Baner".
# NOTE(review): localities containing digits or punctuation do not match and become NaN - confirm this is intended.
df['Location'] = (
    df['Address']
    .str.extract(r'\bin\s+([A-Za-z\s]+?)(?:,|$)', expand=False)
    .str.strip()
)

In [None]:
# City: the first 'Pune' or 'Mumbai' (any letter case) found in the address, title-cased.
# Addresses without either name end up as NaN.
city_match = df['Address'].str.extract(r'(Pune|Mumbai)', flags=re.IGNORECASE, expand=False)
df['City'] = city_match.str.title()

In [None]:
# Clean ListingPrice: remove the rupee sign and commas, then expand an optional
# unit suffix (thousand / lakh / crore) so the column holds plain numbers.
# Unit spellings are matched case-insensitively; an unrecognised unit yields NaN.
PRICE_UNIT_MULTIPLIERS = {
    '': 1,
    'k': 1_000,
    'l': 100_000, 'lac': 100_000, 'lacs': 100_000, 'lakh': 100_000, 'lakhs': 100_000,
    'c': 10_000_000, 'cr': 10_000_000, 'crore': 10_000_000, 'crores': 10_000_000,
}

price_text = (
    df['ListingPrice']
    .astype(str)  # tolerate a column that is already numeric (e.g. when this cell is re-run)
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)

# Whole-string match: a number, an optional alphabetic unit, an optional trailing dot ("Cr.").
# Matching the full unit word avoids leaving stray letters behind (e.g. "1.5 Lac" -> "150000.0ac"),
# which previously turned such prices into NaN.
price_parts = price_text.str.extract(r'^(\d+(?:\.\d+)?|\.\d+)\s*([A-Za-z]*)\.?$')
price_amount = pd.to_numeric(price_parts[0], errors='coerce')
price_multiplier = pd.to_numeric(
    price_parts[1].fillna('').str.lower().map(PRICE_UNIT_MULTIPLIERS),
    errors='coerce',
)

# Anything that did not match (ranges, free text, missing values) stays NaN.
df['ListingPrice'] = price_amount * price_multiplier

In [None]:
# Extract only the numeric value for Area (the first number in the text).
# Commas are removed first so "1,250 Sq.Ft." reads as 1250 rather than 1, and a
# decimal part is kept. astype(str) lets the cell run on an already-numeric column
# (e.g. when it is re-run); missing values still end up as NaN.
area_text = df['Area'].astype(str).str.replace(',', '', regex=False)
df['Area'] = pd.to_numeric(
    area_text.str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)
df.head(10)

In [None]:
# Report the number of missing values per column before any rows are removed.
# It is printed explicitly because a notebook cell only renders its final expression.
print(df.isna().sum())

# Keep only rows that have a value in every column, then discard the raw Address text.
# errors='ignore' lets the cell be re-run after Address is already gone.
df = df.dropna()
df = df.drop(columns=['Address'], errors='ignore')

df.info()
# Preview goes last so it is rendered as the cell output.
df.head(10)

In [None]:

# Arrange columns in the specified order and write the cleaned table to data.csv.
column_order = [
    'PropertyName', 'BHK', 'Location', 'Area',
    'ListingPrice', 'FurnishStatus', 'City', 'Description',
]
df = df[column_order]
df.to_csv("data.csv", index=False)

# Preview goes last: a notebook cell only renders its final expression.
df.head(10)
