In [18]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time

# Empty lists for 8 columns
bike_name = []
price = []
kms_driven = []
ownership = []
location = []
year = []
brand = []
variant = []

# Set to track unique bikes (prevent duplicates)
unique_bikes = set()

def extract_year_brand_variant(name):
    """Extract year, brand, and variant from bike name"""
    parts = name.strip().split()
    
    # Extract year (4-digit number at start)
    yr = "N/A"
    if parts and re.match(r"^\d{4}$", parts[0]):
        yr = parts[0]
        parts = parts[1:]  # Remove year from parts
    
    # Extract brand (usually first word after year)
    br = parts[0] if parts else "N/A"
    
    # Extract variant (rest of the name)
    var = " ".join(parts[1:]) if len(parts) > 1 else "N/A"
    
    return yr, br, var

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
}

# Scrape multiple pages
for page_num in range(1,100):  # 59 pages total, will stop at 1000
    
    url = f"https://bikekharido.in/used-bikes-in-india/?sf_paged={page_num}"
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Find all bike cards - they are within divs with specific structure
    # Each card has heading with bike name, and details in labeled sections
    bike_cards = soup.find_all('div', class_='fl-post-column')
    
    if not bike_cards:
        # Try alternative selector
        bike_cards = soup.find_all('article')
    
    for card in bike_cards:
        try:
            # Extract bike name from heading or link
            name_elem = card.find('h3') or card.find('h2')
            if not name_elem:
                name_elem = card.find('a', href=lambda x: x and '/buy-used-bike/' in x)
            
            if not name_elem:
                continue
                
            bike_name_text = name_elem.get_text(strip=True)
            
            # Extract price (look for "Seller Demand" label) - FIXED: using 'string' instead of 'text'
            price_text = "N/A"
            price_section = card.find(string=re.compile("Seller Demand", re.IGNORECASE))
            if price_section:
                price_elem = price_section.find_next()
                if price_elem:
                    price_text = price_elem.get_text(strip=True)
            
            # Extract KMs driven - FIXED: using 'string' instead of 'text'
            kms_text = "N/A"
            kms_section = card.find(string=re.compile("KM Driven", re.IGNORECASE))
            if kms_section:
                kms_elem = kms_section.find_next()
                if kms_elem:
                    kms_text = kms_elem.get_text(strip=True)
            
            # Extract ownership - FIXED: using 'string' instead of 'text'
            ownership_text = "N/A"
            ownership_section = card.find(string=re.compile("Ownership", re.IGNORECASE))
            if ownership_section:
                ownership_elem = ownership_section.find_next()
                if ownership_elem:
                    ownership_text = ownership_elem.get_text(strip=True)
            
            # Extract location - FIXED: using 'string' instead of 'text'
            location_text = "N/A"
            location_section = card.find(string=re.compile("Location", re.IGNORECASE))
            if location_section:
                location_elem = location_section.find_next()
                if location_elem:
                    location_text = location_elem.get_text(strip=True)
            
            # Create unique identifier to prevent duplicates
            unique_id = (bike_name_text, price_text, kms_text, location_text)
            
            # Only add if all required fields are present and not duplicate
            if (bike_name_text != "N/A" and price_text != "N/A" and 
                kms_text != "N/A" and ownership_text != "N/A" and 
                location_text != "N/A" and unique_id not in unique_bikes):
                
                # Extract year, brand, variant
                yr, br, var = extract_year_brand_variant(bike_name_text)
                
                # Add to lists
                bike_name.append(bike_name_text)
                price.append(price_text)
                kms_driven.append(kms_text)
                ownership.append(ownership_text)
                location.append(location_text)
                year.append(yr)
                brand.append(br)
                variant.append(var)
                
                # Add to unique set
                unique_bikes.add(unique_id)
                
        
        except Exception as e:
            print(f"  Error processing card: {e}")
            continue
    
    # Stop if we have 1000 unique bikes
    if len(bike_name) >= 1000:
        print(f"Reached 1000 bikes! Stopping...")
        break
    
    time.sleep(1)  # Be polite to the server

# Create DataFrame with exactly 1000 rows (or less if not enough data)
final_count = min(len(bike_name), 1000)
df = pd.DataFrame({
    "bike_name": bike_name[:final_count],
    "price": price[:final_count],
    "kms_driven": kms_driven[:final_count],
    "ownership": ownership[:final_count],
    "location": location[:final_count],
    "year": year[:final_count],
    "brand": brand[:final_count],
    "variant": variant[:final_count]
})

# Save to CSV
df.to_csv("bikekharido_bikes123.csv", index=False)
df

Unnamed: 0,bike_name,price,kms_driven,ownership,location,year,brand,variant
0,Jupiter Grande Edition bs4,"₹ 35,000",35000 km,First,Delhi,,Jupiter,Grande Edition bs4
1,2021 Bajaj Pulsar NS200 ABS,"₹ 90,000",5000 km,First,Kolkata,2021,Bajaj,Pulsar NS200 ABS
2,2024 Yamaha R15 V4 Racing Blue,"₹ 1,70,000",8000 km,First,Rajkot,2024,Yamaha,R15 V4 Racing Blue
3,2023 Suzuki Access 125 Disc Brake CBS Special ...,"₹ 65,000",35000 km,First,Jaipur,2023,Suzuki,Access 125 Disc Brake CBS Special Edition
4,2025 Yamaha RayZR 125 Fi Hybrid,"₹ 10,500",3700 km,First,Ranchi,2025,Yamaha,RayZR 125 Fi Hybrid
...,...,...,...,...,...,...,...,...
691,2019 Royal Enfield Interceptor 650 Baker Express,"₹ 2,15,000",6500 km,First,Noida,2019,Royal,Enfield Interceptor 650 Baker Express
692,2017 Bajaj Avenger Street 160,"₹ 45,000",30000 km,First,Delhi,2017,Bajaj,Avenger Street 160
693,2018 Hero Super Splendor Canvas Black Edition ...,"₹ 42,000",20000 km,Second,Delhi,2018,Hero,Super Splendor Canvas Black Edition Drum
694,2022 Royal Enfield Classic 350 Chrome Series W...,"₹ 2,00,000",15000 km,First,Allahabad,2022,Royal,Enfield Classic 350 Chrome Series With Dual-Ch...
