In [None]:
import time
import csv
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager

# Start a Chrome browser session that the rest of the script drives
driver = webdriver.Chrome()

# Landing page that lists the operator's routes. To scrape a different
# government bus service, paste that service's redBus link here instead.
START_URL = 'https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile'

# Load the page, enlarge the window, then pause for a fixed 10 seconds
# before any scraping starts
driver.get(START_URL)
driver.maximize_window()
time.sleep(10)

# Function to extract routes from the current page
def extract_routes():
    """Collect the route anchors on the page currently loaded in ``driver``.

    Returns:
        A list with one dict per ``<a class="route">`` element found. Each
        dict holds the anchor's visible text under ``'text'`` and the value
        of its ``href`` attribute under ``'link'``.
    """
    routes = []
    for anchor in driver.find_elements(By.XPATH, "//a[@class='route']"):
        routes.append({'text': anchor.text, 'link': anchor.get_attribute('href')})
    return routes

# Accumulator for every route found across the paginated listing
all_routes = []

# XPaths of the pagination controls that are clicked in turn (pages 2-5)
page_xpaths = [
    '//*[@id="root"]/div/div[4]/div[12]/div[2]',  # Page 2
    '//*[@id="root"]/div/div[4]/div[12]/div[3]',  # Page 3
    '//*[@id="root"]/div/div[4]/div[12]/div[4]',  # Page 4
    '//*[@id="root"]/div/div[4]/div[12]/div[5]'   # Page 5
]

# The first page is already loaded, so harvest it before paginating
all_routes += extract_routes()

# Visit each remaining page and harvest its routes
for xpath in page_xpaths:
    try:
        # Block (up to 20 s) until the pagination control can be clicked
        wait = WebDriverWait(driver, 20)
        page_button = wait.until(EC.element_to_be_clickable((By.XPATH, xpath)))

        # Bring the control into the viewport, then give the page a moment
        driver.execute_script("arguments[0].scrollIntoView(true);", page_button)
        time.sleep(2)

        # Click through JavaScript rather than a native WebElement click
        driver.execute_script("arguments[0].click();", page_button)
        time.sleep(10)

        # Harvest the routes shown on the page that is now displayed
        all_routes += extract_routes()

    except Exception as exc:
        # Best effort: report the failure and move on to the next page
        print(f"Could not navigate to the next page or extract routes: {exc}")

# Prepare the CSV file
csv_file_path = 'apsrtc_bus_data.csv'

# Seconds to pause after each scroll before re-measuring the page height
SCROLL_PAUSE_TIME = 5

# XPath of the "View Buses" element that is clicked when present
view_buses_xpath = '//*[@id="result-section"]/div[1]/div/div[2]/div/div[4]/div[2]'

# (CSV column name, XPath) pairs for the per-bus fields, in output order
FIELD_XPATHS = (
    ('Bus Name', "//div[@class='travels lh-24 f-bold d-color']"),
    ('Bus Type', "//*[@class='bus-type f-12 m-top-16 l-color evBus']"),
    ('Departing Time', "//*[@class='dp-time f-19 d-color f-bold']"),
    ('Duration', "//*[@class='dur l-color lh-24']"),
    ('Reaching Time', "//*[@class='bp-time f-19 d-color disp-Inline']"),
    ('Star Rating', "//*[@class='column-six p-right-10 w-10 fl']"),
    ('Price', "//*[contains(@class, 'fare d-block')]"),
    ('Seats Available', "//*[@class='column-eight w-15 fl']"),
)


def get_elements(xpath):
    """Return the text of every element matching *xpath* on the current page."""
    return [elem.text for elem in driver.find_elements(By.XPATH, xpath)]


def clean_price(price_text):
    """Return *price_text* with every character except digits and '.' removed."""
    return re.sub(r'[^0-9.]', '', price_text.strip())


def extend_list(lst, length):
    """Return a copy of *lst* padded on the right with 'N/A' up to *length* items."""
    return lst + ['N/A'] * (length - len(lst))


def _scroll_to_bottom():
    """Scroll to the page bottom repeatedly until the document height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_PAUSE_TIME)

        # An unchanged height after the pause means no further content was added
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _scrape_bus_rows(route):
    """Build the CSV rows for the route page currently loaded in the browser.

    Args:
        route: Dict with the route's 'text' and 'link', as produced by
            ``extract_routes``.

    Returns:
        A list of rows; each row is the route text, the route link and one
        value per entry of ``FIELD_XPATHS``.
    """
    columns = []
    for column_name, xpath in FIELD_XPATHS:
        values = get_elements(xpath)
        if column_name == 'Price':
            # Clean before padding so the 'N/A' fillers are left untouched
            values = [clean_price(value) for value in values]
        columns.append(values)

    # NOTE(review): each column is collected independently and padded at the
    # end, so a bus that lacks one field shifts the later values of that
    # column up by one row. Scraping per bus container would avoid this, but
    # the container selector is not known here.
    length = max(len(values) for values in columns)
    columns = [extend_list(values, length) for values in columns]

    return [[route['text'], route['link'], *fields] for fields in zip(*columns)]


try:
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Route Name', 'Route Link'] + [name for name, _ in FIELD_XPATHS])

        for route in all_routes:
            # Navigate to the route page
            driver.get(route['link'])
            time.sleep(10)

            try:
                # Attempt to click on the "View Buses" element if it exists
                view_buses = WebDriverWait(driver, 20).until(
                    EC.element_to_be_clickable((By.XPATH, view_buses_xpath))
                )
                view_buses.click()
                time.sleep(5)
            except TimeoutException:
                print(f"'View Buses' button not found for route {route['text']}. Extracting available data.")

            # Scroll down to the bottom of the page to load all content
            _scroll_to_bottom()

            # Write data to CSV
            writer.writerows(_scrape_bus_rows(route))

            # No driver.back() here: the next iteration loads its page with
            # driver.get() on an absolute URL, so navigating back (and waiting
            # for it) only slowed the run down.
finally:
    # Close the browser even if scraping fails part-way, so no Chrome
    # process is left running
    driver.quit()
