### Web Scraping and building pandas df

In [1]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

from home_classes import Home, HomeCollection


In [2]:
# Site root; relative ad links are resolved against it.
BASE_URL = "https://www.olx.uz"
# First page of the filtered listing search; further pages are addressed by
# appending "&page=N" to this URL.
FILTERED_PAGES_BASE_URL = (
    f"{BASE_URL}/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE"
)

In [3]:
# get HTML content from a URL
def get_html(url, timeout=10):
    """Download ``url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url : str
        Address to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        Without a timeout a single stalled connection would block the whole
        crawl indefinitely.

    Returns
    -------
    str or None
        The response text, or ``None`` when the request fails, times out or
        the server answers with an error status. The error is printed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx answers into exceptions
        return response.text
    except requests.RequestException as e:
        # Timeouts and connection errors are RequestException subclasses too.
        print(f"Error fetching {url}: {e}")
        return None
    


In [4]:
# create soup from html content
def create_soup(html_content):
    """Parse ``html_content`` with the built-in ``html.parser`` backend.

    Returns a ``BeautifulSoup`` object, or ``None`` (after printing a notice)
    when ``html_content`` is empty or ``None``.
    """
    if not html_content:
        print("No HTML content to parse.")
        return None
    return BeautifulSoup(html_content, 'html.parser')



In [5]:
# find number of pages and page URLs
def get_pages_urls_as_list(filtered_page_html_content):
    """Build the list of result-page URLs for the filtered search.

    The highest page number is read from the numeric links inside the
    ``<ul class="pagination-list">`` element of the first result page.

    Parameters
    ----------
    filtered_page_html_content : str or None
        HTML of the first result page (``None`` if fetching it failed).

    Returns
    -------
    list of str
        ``FILTERED_PAGES_BASE_URL`` for page 1, followed by
        ``FILTERED_PAGES_BASE_URL + "&page=N"`` for N = 2..max page. When the
        HTML is missing, has no pagination block, or the block has no numeric
        links, only the first page URL is returned instead of raising.
    """
    soup = create_soup(filtered_page_html_content)

    # Pagination block; absent when the fetch failed or there is a single page.
    pagination = None
    if soup is not None:
        pagination = soup.find("ul", class_="pagination-list")

    max_page = 1
    if pagination is not None:
        # Keep only links whose visible text is a page number (skips arrows etc.).
        link_texts = (link.text.strip() for link in pagination.find_all("a", href=True))
        page_numbers = [int(text) for text in link_texts if text.isdigit()]
        max_page = max(page_numbers, default=1)

    # Page 1 is the bare filtered URL; later pages carry an explicit page parameter.
    page_urls = [FILTERED_PAGES_BASE_URL]
    page_urls.extend(f"{FILTERED_PAGES_BASE_URL}&page={i}" for i in range(2, max_page + 1))
    return page_urls



In [6]:
# list the links of the home ads found on a results page
def get_urls_of_home_ads_in_page(page_url):
    """Collect the absolute URLs of the home ads listed on one result page.

    Parameters
    ----------
    page_url : str
        URL of a search-result page.

    Returns
    -------
    list of str
        One absolute URL per listing card that contains a link; empty when
        the page could not be fetched or holds no matching cards.
    """
    soup = create_soup(get_html(page_url))
    if soup is None:
        return []

    ad_links = []
    # Each listing card is a <div class="css-l9drzq"> wrapping the ad anchor.
    for card in soup.find_all("div", class_="css-l9drzq"):
        # href=True skips anchors without a target instead of raising KeyError.
        link_tag = card.find("a", class_="css-1tqlkj0", href=True)
        if link_tag:
            # urljoin keeps absolute URLs as they are and resolves relative
            # ones (with or without a leading slash) against the site root.
            ad_links.append(urljoin(BASE_URL, link_tag["href"]))
    return ad_links

In [7]:
# extract home information from HTML content and build a Home object (the crawl loop adds it to the HomeCollection)

import re

def parse_home_details(soup, home_ad_link=None):
    """Parse a single ad page into a ``Home`` object.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one ad page.
    home_ad_link : str, optional
        URL of the ad; forwarded to ``Home`` unchanged.

    Returns
    -------
    Home or None
        ``None`` when the page has no ``ad-parameters-container`` block or
        when constructing ``Home`` raises (the error is printed).

    Notes
    -----
    * Missing living/kitchen areas are estimated as 80% / 20% of the total area.
    * If the parsed floor exceeds the parsed number of floors, the two values
      are assumed to be swapped and are exchanged.
    * A parameter whose value contains no digits is left as ``None`` instead
      of aborting the parse.
    """
    def extract_number(text):
        # Drop every whitespace character (regular and non-breaking spaces are
        # both used as thousands separators) and read "," as a decimal point.
        clean_text = re.sub(r"\s+", "", text).replace(",", ".")
        match = re.search(r"\d+(\.\d+)?", clean_text)
        return float(match.group()) if match else None

    def extract_int(text):
        # First run of digits as an int, or None when the value is not numeric.
        match = re.search(r"\d+", text)
        return int(match.group()) if match else None

    # Without the structured parameters block there is nothing to build from.
    param_block = soup.find("div", {"data-testid": "ad-parameters-container"})
    if not param_block:
        return None

    # 1. Price
    price = None
    price_block = soup.find("div", {"data-testid": "ad-price-container"})
    if price_block:
        price = extract_number(price_block.get_text(strip=True))

    # 2. Address
    address = None
    address_block = soup.find("p", class_="css-7wnksb")
    if address_block:
        raw_address = address_block.get_text(strip=True)
        # Remove "Toshkent" and "tumani", then trim leftover commas and spaces
        address = raw_address.replace("Toshkent", "").replace("tumani", "").strip(", ").strip()

    # 3. Parameters, starting from defaults for everything the page may omit
    data = {
        "number_of_rooms": None,
        "area": None,
        "living_area": None,
        "kitchen_area": None,
        "floor": None,
        "total_floors": None,
        "built_year": None,
        "bathroom": None,
        "furnishing_status": None,
        "status": "new",
        "price": price,
        "address": address,
        "with_makler": False  # default unless detected
    }

    # Each parameter is a <p> whose text starts with an Uzbek label.
    for p in param_block.find_all("p", class_="css-1los5bp"):
        text = p.get_text(strip=True)

        if "Xonalar soni" in text:
            data["number_of_rooms"] = extract_int(text)
        elif "Umumiy maydon" in text:
            data["area"] = extract_number(text)
        elif "Yashash maydoni" in text:
            data["living_area"] = extract_number(text)
        elif "Oshxona maydoni" in text:
            data["kitchen_area"] = extract_number(text)
        elif "Qavati" in text and "Uy qavatliligi" not in text:
            data["floor"] = extract_int(text)
        elif "Uy qavatliligi" in text:
            data["total_floors"] = extract_int(text)
        elif "Uy qurilgan" in text:
            # Only accept a four-digit year
            match = re.search(r"\d{4}", text)
            if match:
                data["built_year"] = int(match.group())
        elif "Sanuzel" in text:
            data["bathroom"] = "private" if "Alohida" in text else "shared"
        elif "Mebelli" in text:
            data["furnishing_status"] = "furnished" if "Ha" in text else "unfurnished"
        elif "Turarjoy turi" in text:
            data["status"] = "new" if "Yangi" in text else "old"
        elif "Vositachilik haqqi: Bor" in text:
            data["with_makler"] = True
        elif "Vositachilik haqqi: Yoʻq" in text:
            data["with_makler"] = False

    # Fallback estimates: 80% / 20% of the total area when not listed
    if data["living_area"] is None and data["area"] is not None:
        data["living_area"] = round(data["area"] * 0.8, 2)
    if data["kitchen_area"] is None and data["area"] is not None:
        data["kitchen_area"] = round(data["area"] * 0.2, 2)

    # A floor above the building height means the two values were swapped
    if (
        data["floor"] is not None
        and data["total_floors"] is not None
        and data["floor"] > data["total_floors"]
    ):
        data["floor"], data["total_floors"] = data["total_floors"], data["floor"]

    try:
        home = Home(
            price=data["price"],
            area=data["area"],
            living_area=data["living_area"],
            kitchen_area=data["kitchen_area"],
            number_of_rooms=data["number_of_rooms"],
            status=data["status"],
            furnishing_status=data["furnishing_status"],
            bathroom=data["bathroom"],
            floor=data["floor"],
            total_floors=data["total_floors"],
            built_year=data["built_year"],
            address=data["address"],
            with_makler=data["with_makler"],
            home_ad_link=home_ad_link
        )
        return home
    except Exception as e:
        # Best effort: one ad that Home rejects must not stop the crawl.
        print("❌ Failed to create Home:", e)
        return None

In [8]:
# Crawl every result page, parse each ad into a Home and export the collection.
home_collection = HomeCollection()

first_page_html = get_html(FILTERED_PAGES_BASE_URL)
ads_page_urls = get_pages_urls_as_list(first_page_html)
total_pages = len(ads_page_urls)

for page_number, ad_page_url in enumerate(ads_page_urls, start=1):
    print(f'processing page {page_number}/{total_pages}: {ad_page_url}')

    for ad_link in get_urls_of_home_ads_in_page(ad_page_url):
        ad_soup = create_soup(get_html(ad_link))
        if ad_soup:
            # Ads that cannot be fetched or parsed are skipped.
            home = parse_home_details(ad_soup, home_ad_link=ad_link)
            if home:
                home_collection.add_home(home)


# Write everything collected by the crawl to a CSV file.
home_df = home_collection.get_all_homes()
home_df.to_csv('uzb_housing.csv', index=False, encoding='utf-8-sig')

processing page 1/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE
processing page 2/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=2
processing page 3/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=3
processing page 4/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=4
processing page 5/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=5
processing page 6/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=6
processing page 7/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=7
processing page 8/25: https://www.olx.uz/oz/nedvizhimost/kvartiry/prodazha/q-toshkent-uylar-narxi/?currency=UYE&page=8
processing page 9/25: https://www.olx.uz/oz/nedvizhimos