# In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from urllib.parse import urljoin, urlparse
import json
from typing import List, Dict, Optional
import logging

class TunisiaRealEstateScraper:
    """
    Scraper for property listings on Tunisian real estate websites.

    Offers a generic, CSS-selector driven listing scraper
    (``scrape_generic_listings``), presets for Mubawab.tn, Tayara.tn and
    Tecnocasa.tn, and helpers to save the results as CSV or JSON.
    """

    # Attributes checked, in order, to find the URL of an image element.
    # NOTE(review): 'data-src' is a common lazy-loading convention; confirm
    # which attribute each target site actually uses.
    _IMAGE_URL_ATTRS = ('src', 'data-src')

    # A number with an optional '.' or ',' decimal part followed by a
    # square-metre unit, e.g. "120 m²", "85.5m2", "120,5 sq m".
    # Applied to lower-cased text.
    _AREA_PATTERN = re.compile(r'(\d+(?:[.,]\d+)?)\s*(?:m²|m2|sq\s*m)')

    def __init__(self, base_url: str, delay: float = 1.0):
        """
        Initialize the scraper

        Args:
            base_url: Base URL of the real estate website; relative links
                found in listings are resolved against it
            delay: Delay between requests in seconds (be respectful)
        """
        self.base_url = base_url
        self.delay = delay
        self.session = requests.Session()

        # Set headers to mimic a real browser
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
        })

        # Setup logging (basicConfig is a no-op once the root logger
        # already has handlers, so repeated instantiation is harmless)
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def get_page(self, url: str) -> Optional["BeautifulSoup"]:
        """
        Fetch and parse a webpage

        The configured delay is applied after every attempt, including
        failed ones, so an erroring site is not hit with back-to-back
        requests.

        Args:
            url: URL to fetch

        Returns:
            BeautifulSoup object or None if the request failed
        """
        self.logger.info(f"Fetching: {url}")
        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            soup = None
        else:
            soup = BeautifulSoup(response.content, 'html.parser')

        # Add delay to be respectful (on success and on failure alike)
        time.sleep(self.delay)
        return soup

    def extract_price(self, price_text: str) -> Optional[float]:
        """
        Extract numeric price from text

        Currency symbols, letters and whitespace are discarded. Separators
        are interpreted as follows:

        * only commas ("250,000"): grouping separators, removed
        * a single dot ("99.5"): decimal point
        * several dots ("1.250.000"): grouping separators, removed
        * both kinds ("1,250,000.50" / "1.250.000,50"): the right-most
          separator is the decimal mark, the other one is grouping

        Args:
            price_text: Text containing price

        Returns:
            Numeric price or None if no number could be parsed
        """
        if not price_text:
            return None

        # Keep only digits and the two possible separator characters
        cleaned = re.sub(r'[^\d.,]', '', price_text)

        if ',' in cleaned and '.' in cleaned:
            if cleaned.rfind(',') > cleaned.rfind('.'):
                # European style: dots group thousands, comma is decimal
                cleaned = cleaned.replace('.', '').replace(',', '.')
            else:
                # English style: commas group thousands, dot is decimal
                cleaned = cleaned.replace(',', '')
        else:
            cleaned = cleaned.replace(',', '')
            if cleaned.count('.') > 1:
                # More than one dot cannot be a decimal point
                cleaned = cleaned.replace('.', '')

        try:
            return float(cleaned)
        except ValueError:
            return None

    def extract_area(self, area_text: str) -> Optional[float]:
        """
        Extract area in square meters

        Recognizes patterns like "120 m²", "120m2", "120 sq m"
        (case-insensitive). Both '.' and ',' are accepted as the decimal
        separator, so "120,5 m²" yields 120.5.

        Args:
            area_text: Text containing area

        Returns:
            Area in square meters or None if no area pattern is found
        """
        if not area_text:
            return None

        match = self._AREA_PATTERN.search(area_text.lower())
        if match:
            return float(match.group(1).replace(',', '.'))

        return None

    def scrape_generic_listings(self,
                              listing_page_url: str,
                              listing_selector: str,
                              selectors: Dict[str, str],
                              max_pages: int = 5) -> List[Dict]:
        """
        Generic method to scrape real estate listings

        Pages are requested as ``<listing_page_url>?page=N`` (or ``&page=N``
        when the URL already has a query string) for N = 1..max_pages.
        A page that fails to download is skipped; the first page that
        downloads but contains no listings ends the crawl.

        Args:
            listing_page_url: URL of the listings page
            listing_selector: CSS selector for individual listings
            selectors: Dict mapping data fields to CSS selectors
            max_pages: Maximum pages to scrape

        Returns:
            List of property dictionaries
        """
        properties = []

        # Construct page URL (common patterns): append to an existing
        # query string with '&', otherwise start one with '?'
        separator = '&' if '?' in listing_page_url else '?'

        for page in range(1, max_pages + 1):
            page_url = f"{listing_page_url}{separator}page={page}"

            soup = self.get_page(page_url)
            if soup is None:
                # Download failed (already logged); try the next page
                continue

            listings = soup.select(listing_selector)
            if not listings:
                self.logger.info(f"No listings found on page {page}")
                break

            self.logger.info(f"Found {len(listings)} listings on page {page}")

            for listing in listings:
                property_data = self.extract_listing_data(listing, selectors)
                # Listings where no selector matched yield {} and are dropped
                if property_data:
                    properties.append(property_data)

        return properties

    def extract_listing_data(self, listing, selectors: Dict[str, str]) -> Dict:
        """
        Extract data from a single listing

        Field handling:

        * ``price``: parsed number under ``price``, raw text under
          ``price_text``
        * ``area``: parsed number under ``area``, raw text under
          ``area_text``
        * ``link``: the element's ``href`` resolved against ``base_url``
          (omitted when there is no ``href``)
        * ``image``: the element's image URL attribute resolved against
          ``base_url``; falls back to the element text when no URL
          attribute is present
        * anything else: the element's stripped text

        Fields whose selector matches nothing are left out of the result.

        Args:
            listing: BeautifulSoup element for the listing
            selectors: Dict mapping data fields to CSS selectors

        Returns:
            Dictionary with extracted data
        """
        data = {}

        for field, selector in selectors.items():
            try:
                element = listing.select_one(selector)
                if element is None:
                    continue

                text = element.get_text(strip=True)

                # Process specific fields
                if field == 'price':
                    data[field] = self.extract_price(text)
                    data['price_text'] = text
                elif field == 'area':
                    data[field] = self.extract_area(text)
                    data['area_text'] = text
                elif field == 'link':
                    href = element.get('href')
                    if href:
                        data[field] = urljoin(self.base_url, href)
                elif field == 'image':
                    # An <img> has no text, so the URL must come from an
                    # attribute; take the first non-empty candidate
                    image_src = next(
                        (element.get(attr) for attr in self._IMAGE_URL_ATTRS
                         if element.get(attr)),
                        None,
                    )
                    data[field] = urljoin(self.base_url, image_src) if image_src else text
                else:
                    data[field] = text
            except Exception as e:
                # Best effort: one broken field must not discard the listing
                self.logger.warning(f"Error extracting {field}: {e}")
                data[field] = None

        return data

    def scrape_mubawab_tn(self, max_pages: int = 5) -> List[Dict]:
        """
        Scraper for Mubawab.tn - Leading Tunisian real estate site

        NOTE(review): the CSS selectors below are candidate class names and
        have not been verified here against the live site.

        Args:
            max_pages: Maximum pages to scrape

        Returns:
            List of property dictionaries
        """
        selectors = {
            'title': '.listingBox_title, .listingTitleLink, h3 a',
            'price': '.listingBox_price, .priceTag, .price',
            'location': '.listingBox_location, .address, .location',
            'area': '.listingBox_surface, .surface, .area',
            'rooms': '.listingBox_rooms, .rooms, .bed',
            'type': '.listingBox_type, .propertyType',
            'link': '.listingTitleLink, .listingBox_title a, h3 a',
            'image': '.listingBox_photo img, .propertyPhoto img'
        }

        return self.scrape_generic_listings(
            listing_page_url="https://www.mubawab.tn/fr/cc/immobilier-vente",
            listing_selector=".listingBox, .propertyCard, .listing-item",
            selectors=selectors,
            max_pages=max_pages
        )

    def scrape_tayara_tn(self, max_pages: int = 5) -> List[Dict]:
        """
        Scraper for Tayara.tn - Popular Tunisian classified ads site

        NOTE(review): the CSS selectors below are candidate class names and
        have not been verified here against the live site.

        Args:
            max_pages: Maximum pages to scrape

        Returns:
            List of property dictionaries
        """
        selectors = {
            'title': '.card-title a, .ad-title, .listing-title',
            'price': '.card-price, .price, .ad-price',
            'location': '.card-location, .location, .ad-location',
            'area': '.card-details, .details, .surface',
            'link': '.card-title a, .ad-title a',
            'image': '.card-image img, .ad-image img'
        }

        return self.scrape_generic_listings(
            listing_page_url="https://www.tayara.tn/c/Immobilier",
            listing_selector=".card, .ad-item, .listing-card",
            selectors=selectors,
            max_pages=max_pages
        )

    def scrape_tecnocasa_tn(self, max_pages: int = 5) -> List[Dict]:
        """
        Scraper for Tecnocasa.tn - Italian real estate agency in Tunisia

        NOTE(review): the CSS selectors below are candidate class names and
        have not been verified here against the live site.

        Args:
            max_pages: Maximum pages to scrape

        Returns:
            List of property dictionaries
        """
        selectors = {
            'title': '.property-title, .title, h3',
            'price': '.property-price, .price, .cost',
            'location': '.property-location, .location, .address',
            'area': '.property-surface, .surface, .area',
            'rooms': '.property-rooms, .rooms, .bed',
            'type': '.property-type, .type',
            'link': '.property-link, .title a, h3 a'
        }

        return self.scrape_generic_listings(
            listing_page_url="https://www.tecnocasa.tn/immobilier-a-vendre",
            listing_selector=".property-item, .property-card, .listing",
            selectors=selectors,
            max_pages=max_pages
        )

    def save_to_csv(self, properties: List[Dict], filename: str):
        """
        Save scraped data to CSV

        Nothing is written (only a warning is logged) when ``properties``
        is empty.

        Args:
            properties: List of property dictionaries
            filename: Output CSV filename
        """
        if not properties:
            self.logger.warning("No properties to save")
            return

        df = pd.DataFrame(properties)
        df.to_csv(filename, index=False, encoding='utf-8')
        self.logger.info(f"Saved {len(properties)} properties to {filename}")

    def save_to_json(self, properties: List[Dict], filename: str):
        """
        Save scraped data to JSON

        Unlike ``save_to_csv``, the file is written even when
        ``properties`` is empty (it then contains an empty list).

        Args:
            properties: List of property dictionaries
            filename: Output JSON filename
        """
        with open(filename, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps accented text readable
            json.dump(properties, f, ensure_ascii=False, indent=2)
        self.logger.info(f"Saved {len(properties)} properties to {filename}")

# Example usage and custom scraper functions
def scrape_custom_site(base_url: str, custom_selectors: Dict[str, str]) -> List[Dict]:
    """
    Function to scrape a custom real estate site

    Scrapes up to 5 pages of ``<base_url>/listings``, treating every
    ``.property-card`` element as one listing. Both the listings path and
    the listing selector are placeholders that must be adapted to the
    structure of the target site.

    Args:
        base_url: Base URL of the site (a trailing slash is tolerated)
        custom_selectors: Dictionary mapping fields to CSS selectors

    Returns:
        List of scraped properties
    """
    scraper = TunisiaRealEstateScraper(base_url)

    # Strip a trailing slash so "https://site.tn/" does not become
    # "https://site.tn//listings"
    listings_url = base_url.rstrip('/') + "/listings"  # Adjust this

    # You need to analyze the website structure and update these
    return scraper.scrape_generic_listings(
        listing_page_url=listings_url,
        listing_selector=".property-card",        # Adjust this
        selectors=custom_selectors,
        max_pages=5
    )

def main():
    """
    Main function to demonstrate scraping real Tunisian real estate websites

    Scrapes two pages from each supported site, writes one CSV per site
    that returned data, then writes a combined CSV and JSON file in which
    every property is tagged with its ``source`` site. Finally prints a
    sample property and usage notes.
    """
    print("Tunisia Real Estate Scraper")
    print("=" * 50)

    # (display name, base URL, scraper method name, per-site CSV filename)
    sites = [
        ("Mubawab.tn", "https://www.mubawab.tn",
         "scrape_mubawab_tn", "mubawab_properties.csv"),
        ("Tayara.tn", "https://www.tayara.tn",
         "scrape_tayara_tn", "tayara_properties.csv"),
        ("Tecnocasa.tn", "https://www.tecnocasa.tn",
         "scrape_tecnocasa_tn", "tecnocasa_properties.csv"),
    ]

    all_properties = []
    saved_files = []  # per-site CSVs that were actually written

    for number, (site_name, base_url, method_name, csv_name) in enumerate(sites, start=1):
        print(f"\n{number}. Scraping {site_name}...")
        scraper = TunisiaRealEstateScraper(base_url)
        properties = getattr(scraper, method_name)(max_pages=2)

        if not properties:
            continue

        # The per-site CSV is written before tagging, so it carries no
        # 'source' column; only the combined files do.
        scraper.save_to_csv(properties, csv_name)
        saved_files.append(csv_name)
        print(f"✓ Scraped {len(properties)} properties from {site_name}")

        for prop in properties:
            prop['source'] = site_name
        all_properties.extend(properties)

    # Save combined results
    if all_properties:
        # The save helpers do not use base_url; this URL is a placeholder
        combined_scraper = TunisiaRealEstateScraper("https://combined.tn")
        combined_scraper.save_to_csv(all_properties, "tunisia_all_properties.csv")
        combined_scraper.save_to_json(all_properties, "tunisia_all_properties.json")

        print(f"\n🎉 Total scraped: {len(all_properties)} properties from all sites")
        print("📁 Files saved:")
        # List only the per-site files that exist, not all three regardless
        for csv_name in saved_files:
            print(f"   - {csv_name}")
        print("   - tunisia_all_properties.csv (combined)")
        print("   - tunisia_all_properties.json (combined)")

        # Display sample results
        print("\n📋 Sample property from results:")
        sample_prop = all_properties[0]
        for key, value in sample_prop.items():
            if value and key not in ['link', 'image']:  # Skip long URLs
                print(f"   {key}: {value}")

    print("\n" + "=" * 50)
    print("🚀 USAGE INSTRUCTIONS:")
    print("1. Install: pip install requests beautifulsoup4 pandas")
    print("2. Run this script to scrape real Tunisian real estate sites")
    print("3. Check output CSV/JSON files for scraped data")
    print("4. Modify selectors if website structure changes")
    print("\n⚠️  IMPORTANT NOTES:")
    print("• Always check robots.txt before scraping")
    print("• Respect rate limits (current: 1 second delay)")
    print("• Some sites may block automated requests")
    print("• CSS selectors may change - update as needed")
    print("• Consider using proxies for large-scale scraping")

# Additional utility functions for specific Tunisian real estate sites
def scrape_multiple_tunisia_sites(max_pages_per_site: int = 3) -> pd.DataFrame:
    """
    Run every site-specific scraper and merge the results into one DataFrame

    Each property is tagged with the site it came from (``source``) and the
    time it was tagged (``scraped_at``). A site that raises is reported on
    stdout and skipped, so the other sites are still scraped.

    Args:
        max_pages_per_site: Maximum pages to scrape per site

    Returns:
        Combined DataFrame with all properties (empty if nothing was scraped)
    """
    # One entry per supported site: display name, name of the scraper
    # method to call, and the base URL handed to the scraper
    site_configs = [
        dict(name='Mubawab.tn',
             scraper_method='scrape_mubawab_tn',
             base_url='https://www.mubawab.tn'),
        dict(name='Tayara.tn',
             scraper_method='scrape_tayara_tn',
             base_url='https://www.tayara.tn'),
        dict(name='Tecnocasa.tn',
             scraper_method='scrape_tecnocasa_tn',
             base_url='https://www.tecnocasa.tn'),
    ]

    collected = []

    for config in site_configs:
        try:
            print(f"Scraping {config['name']}...")
            site_scraper = TunisiaRealEstateScraper(config['base_url'])
            run_scrape = getattr(site_scraper, config['scraper_method'])
            found = run_scrape(max_pages=max_pages_per_site)

            # Record where and when each property was collected
            for entry in found:
                entry['source'] = config['name']
                entry['scraped_at'] = pd.Timestamp.now()

            collected.extend(found)
            print(f"✓ Got {len(found)} properties from {config['name']}")

        except Exception as e:
            # Best effort: report the failure and move on to the next site
            print(f"✗ Error scraping {config['name']}: {e}")

    if collected:
        return pd.DataFrame(collected)
    return pd.DataFrame()

def analyze_tunisia_property_prices(df: pd.DataFrame) -> Dict:
    """
    Summarize scraped Tunisian property data

    Each section is produced only when its column exists in ``df``:

    * ``price_stats``: mean, median, min, max and count of non-null prices
      (omitted when every price is missing)
    * ``top_locations``: the 10 most frequent locations with their counts
    * ``properties_per_source``: number of properties per source site

    Args:
        df: DataFrame with property data

    Returns:
        Dictionary with analysis results, or ``{"error": "No data provided"}``
        for an empty DataFrame
    """
    if df.empty:
        return {"error": "No data provided"}

    report = {}
    available = df.columns

    # Price statistics over the rows that actually have a price
    if 'price' in available:
        known_prices = df['price'].dropna()
        if len(known_prices) > 0:
            report['price_stats'] = dict(
                avg_price=known_prices.mean(),
                median_price=known_prices.median(),
                min_price=known_prices.min(),
                max_price=known_prices.max(),
                total_properties=len(known_prices),
            )

    # Ten most common locations
    if 'location' in available:
        report['top_locations'] = df['location'].value_counts().head(10).to_dict()

    # Property count per originating site
    if 'source' in available:
        report['properties_per_source'] = df['source'].value_counts().to_dict()

    return report

# Run the demo only when the file is executed as a script, not when imported
if __name__ == "__main__":
    main()