In [None]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Function to scrape routes and links
def main(driver, wait, path):
    """Collect route names and links from every page of a paginated listing.

    The browser must already be showing the listing page. Each iteration
    reads the elements matched by ``path``, then clicks the pagination tab
    whose text is the next page number. The loop stops when that tab does
    not exist or when an explicit wait times out.

    Parameters
    ----------
    driver : selenium WebDriver
        Browser session used to perform the click on the next-page tab.
    wait : WebDriverWait
        Explicit wait used to locate the route elements and the pagination
        container.
    path : str
        XPath matching the route anchor elements.

    Returns
    -------
    tuple of (list, list)
        ``(ROUTE_LINKS, BUS_ROUTES)``: the ``href`` attribute and the text
        of every matched element, index-aligned across all visited pages.
    """
    ROUTE_LINKS = []
    BUS_ROUTES = []
    current_page = 1

    while True:
        try:
            paths = wait.until(EC.presence_of_all_elements_located((By.XPATH, path)))
            print(f"Found {len(paths)} elements on page {current_page}")

            # Read link and name in the same pass so both lists stay aligned.
            for route in paths:
                ROUTE_LINKS.append(route.get_attribute("href"))
                BUS_ROUTES.append(route.text)

            # Try to click the next page button
            next_page_text = str(current_page + 1)
            pagination = wait.until(EC.presence_of_element_located((By.XPATH, "//*[@class='DC_117_paginationTable']")))
            # The leading "." keeps the search inside the pagination element;
            # a bare "//" would search the whole document again.
            next_button = pagination.find_element(By.XPATH, f'.//div[@class="DC_117_pageTabs " and text()="{next_page_text}"]')

            # Move to the next button and click it
            ActionChains(driver).move_to_element(next_button).click().perform()

            # Update the current page number
            current_page += 1
            time.sleep(5)  # Wait for the page to load

        except (NoSuchElementException, TimeoutException):
            # NoSuchElementException: there is no tab for the next page.
            # TimeoutException: wait.until() found no routes or no pagination
            # container (e.g. a single-page listing). Either way, keep what
            # has been collected so far instead of crashing the whole run.
            print(f"No more pages to paginate for {path}.")
            break

    return ROUTE_LINKS, BUS_ROUTES

# Define URLs and paths for different states
# Every operator page exposes its routes through the same anchor XPath, so
# only the landing URL differs from state to state.
state_info = {
    state_name: {'url': landing_url, 'path': "//a[@class='route']"}
    for state_name, landing_url in (
        ('Andhra Pradesh', 'https://www.redbus.in/online-booking/apsrtc/?utm_source=rtchometile'),
        ('Telangana', 'https://www.redbus.in/online-booking/tsrtc/?utm_source=rtchometile'),
        ('Kerala', 'https://www.redbus.in/online-booking/ksrtc-kerala/?utm_source=rtchometile'),
        ('Goa', 'https://www.redbus.in/online-booking/ktcl/?utm_source=rtchometile'),
        ('Rajasthan', 'https://www.redbus.in/online-booking/rsrtc/?utm_source=rtchometile'),
        ('Himachal', 'https://www.redbus.in/online-booking/hrtc/?utm_source=rtchometile'),
        ('Assam', 'https://www.redbus.in/online-booking/astc/?utm_source=rtchometile'),
        ('Chandigarh', 'https://www.redbus.in/online-booking/chandigarh-transport-undertaking-ctu'),
        ('Punjab', 'https://www.redbus.in/online-booking/pepsu/?utm_source=rtchometile'),
        ('Uttar Pradesh', 'https://www.redbus.in/online-booking/uttar-pradesh-state-road-transport-corporation-upsrtc/?utm_source=rtchometile'),
    )
}

# Column layout shared by the empty placeholders and the scraped frames.
# scrape_bus_data() reads 'State', 'Bus_Route' and 'Route_Link', so the
# placeholders use the same names.
ROUTE_COLUMNS = ['State', 'Bus_Route', 'Route_Link']

# Empty placeholders; the matching state_dataframes entry is rebound to the
# scraped frame in the loop below (these named variables stay empty).
andhra_pradesh_df = pd.DataFrame(columns=ROUTE_COLUMNS)
telangana_df = pd.DataFrame(columns=ROUTE_COLUMNS)
kerala_df = pd.DataFrame(columns=ROUTE_COLUMNS)
goa_df = pd.DataFrame(columns=ROUTE_COLUMNS)
rajasthan_df = pd.DataFrame(columns=ROUTE_COLUMNS)
himachal_df = pd.DataFrame(columns=ROUTE_COLUMNS)
assam_df = pd.DataFrame(columns=ROUTE_COLUMNS)
chandigarh_df = pd.DataFrame(columns=ROUTE_COLUMNS)
punjab_df = pd.DataFrame(columns=ROUTE_COLUMNS)
uttar_pradesh_df = pd.DataFrame(columns=ROUTE_COLUMNS)

state_dataframes = {
    'Andhra Pradesh': andhra_pradesh_df,
    'Telangana': telangana_df,
    'Kerala': kerala_df,
    'Goa': goa_df,
    'Rajasthan': rajasthan_df,
    'Himachal': himachal_df,
    'Assam': assam_df,
    'Chandigarh': chandigarh_df,
    'Punjab': punjab_df,
    'Uttar Pradesh': uttar_pradesh_df
}

# Initialize the Chrome driver
driver = webdriver.Chrome()
# One explicit wait is enough: it is bound to the driver, not to a page.
wait = WebDriverWait(driver, 30)

try:
    for state, info in state_info.items():
        print(f"Scraping routes for {state}...")

        driver.get(info['url'])

        # Scrape data for the current state
        ROUTE_LINKS, BUS_ROUTES = main(driver, wait, info['path'])

        # DataFrame for the current state
        df = pd.DataFrame({
            'State': [state] * len(BUS_ROUTES),
            'Bus_Route': BUS_ROUTES,
            'Route_Link': ROUTE_LINKS
        })

        # Store the DataFrame in the appropriate variable
        state_dataframes[state] = df

        df.to_csv(f'{state}_routes.csv', index=False)

        # Print the DataFrame
        print(f"\nDataFrame for {state}:")
        print(df)
finally:
    # Close the driver even when a state fails, so no browser is left running.
    driver.quit()


In [None]:
# Short per-state aliases for the route tables held in state_dataframes.
(
    df_k, df_ap, df_t, df_g, df_r,
    df_h, df_a, df_c, df_p, df_up,
) = (
    state_dataframes[state_name]
    for state_name in (
        'Kerala', 'Andhra Pradesh', 'Telangana', 'Goa', 'Rajasthan',
        'Himachal', 'Assam', 'Chandigarh', 'Punjab', 'Uttar Pradesh',
    )
)

In [None]:
def _element_texts(elements):
    """Return the ``.text`` of each element, in order."""
    return [element.text for element in elements]


def _fit_to_length(values, length):
    """Return ``values`` truncated or right-padded with ``None`` to ``length`` items."""
    fitted = list(values[:length])
    fitted.extend([None] * (length - len(fitted)))
    return fitted


# Main scraping function
def scrape_bus_data(df):
    """Scrape the bus listings for every route in ``df``.

    A Chrome session is opened for the duration of the call and is always
    closed, including when scraping raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Route table with the columns ``State``, ``Route_Link`` and
        ``Bus_Route``; each row is visited in order.

    Returns
    -------
    pandas.DataFrame
        One row per bus name found, with the columns ``Bus_name``,
        ``Bus_type``, ``Start_time``, ``End_time``, ``Total_duration``,
        ``Price``, ``Seats_Available``, ``Ratings``, ``State``,
        ``Route_link`` and ``Route_name``.

    Notes
    -----
    Each field is located with its own page-wide query, so a bus that lacks
    one element (for example a rating) yields fewer values than there are
    bus names. Per route, every field is therefore truncated or padded with
    ``None`` to the number of bus names. This keeps routes aligned with each
    other and lets the final DataFrame be built; within one route a missing
    element can still shift the values that follow it.
    """
    # Initialize WebDriver
    driver = webdriver.Chrome()
    wait = WebDriverWait(driver, 10)  # WebDriverWait for explicit waits

    # Output columns, in the order they appear in the returned frame
    records = {
        'Bus_name': [],
        'Bus_type': [],
        'Start_time': [],
        'End_time': [],
        'Total_duration': [],
        'Price': [],
        'Seats_Available': [],
        'Ratings': [],
        'State': [],
        'Route_link': [],
        'Route_name': [],
    }

    try:
        for _, row in df.iterrows():
            state = row['State']
            link = row["Route_Link"]
            route_name = row["Bus_Route"]

            driver.get(link)
            time.sleep(2)

            # Click on elements to reveal bus details
            for element in driver.find_elements(By.XPATH, f'//a[contains(@href, "{link}")]'):
                try:
                    element.click()
                    time.sleep(3)
                except Exception as e:
                    print(f"Error clicking element: {e}")

            # Click elements to view bus
            try:
                driver.find_element(By.XPATH, "//div[@class='button']").click()
                time.sleep(2)
            except Exception as e:
                print(f"Error clicking view bus button: {e}")

            # Scroll until the page height stops growing
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Scraping bus details; text is read immediately after each lookup
            bus_names = _element_texts(driver.find_elements(By.XPATH, "//div[@class='travels lh-24 f-bold d-color']"))
            per_bus = {
                'Bus_type': _element_texts(driver.find_elements(By.XPATH, "//div[@class='bus-type f-12 m-top-16 l-color evBus']")),
                'Start_time': _element_texts(driver.find_elements(By.XPATH, "//div[@class='dp-time f-19 d-color f-bold']")),
                'End_time': _element_texts(driver.find_elements(By.XPATH, "//div[@class='bp-time f-19 d-color disp-Inline']")),
                'Total_duration': _element_texts(driver.find_elements(By.XPATH, "//div[@class='dur l-color lh-24']")),
            }

            try:
                per_bus['Ratings'] = _element_texts(wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div.clearfix.bus-item-details div.clearfix.row-one div.rating-sec span"))))
            except Exception as e:
                # Keep the route: missing ratings become None rather than
                # discarding every bus found on this page.
                print(f"Error finding ratings: {e}")
                per_bus['Ratings'] = []

            per_bus['Price'] = _element_texts(driver.find_elements(By.XPATH, "//div[@class='fare d-block']"))
            per_bus['Seats_Available'] = _element_texts(driver.find_elements(By.XPATH, "//div[contains(@class, 'seat-left')]"))

            # Append details, forcing every column to grow by the same amount
            bus_count = len(bus_names)
            records['Bus_name'].extend(bus_names)
            records['State'].extend([state] * bus_count)
            records['Route_link'].extend([link] * bus_count)
            records['Route_name'].extend([route_name] * bus_count)
            for column, values in per_bus.items():
                if len(values) != bus_count:
                    print(f"Warning: found {len(values)} {column} values for {bus_count} buses on {link}")
                records[column].extend(_fit_to_length(values, bus_count))
    finally:
        # Close the WebDriver even if a route fails part-way through
        driver.quit()

    return pd.DataFrame(records)


In [None]:
# Kerala bus details: scrape every route in the Kerala route table and save the result.
df_k = state_dataframes['Kerala']
k_bus_details = scrape_bus_data(df_k)
# to_csv is called without index=False, so the row index is written as the first column.
k_bus_details.to_csv('k_bus_details.csv')

In [None]:
# Andhra Pradesh bus details: scrape every route in the Andhra Pradesh route table and save the result.
df_ap=state_dataframes['Andhra Pradesh']
ap_bus_details = scrape_bus_data(df_ap)
# to_csv is called without index=False, so the row index is written as the first column.
ap_bus_details.to_csv('ap_bus_details.csv')

In [None]:
# Telangana bus details: scrape every route in the Telangana route table and save the result.
df_t=state_dataframes['Telangana']
t_bus_details = scrape_bus_data(df_t)
# to_csv is called without index=False, so the row index is written as the first column.
t_bus_details.to_csv('t_bus_details.csv')

In [None]:
# Goa bus details: scrape every route in the Goa route table and save the result.
df_g=state_dataframes['Goa']
g_bus_details = scrape_bus_data(df_g)
# to_csv is called without index=False, so the row index is written as the first column.
g_bus_details.to_csv('g_bus_details.csv')

In [None]:
# Rajasthan bus details: scrape every route in the Rajasthan route table and save the result.
df_r=state_dataframes['Rajasthan']
r_bus_details = scrape_bus_data(df_r)
# to_csv is called without index=False, so the row index is written as the first column.
r_bus_details.to_csv('r_bus_details.csv')

In [None]:
# Himachal bus details: scrape every route in the Himachal route table and save the result.
df_h=state_dataframes['Himachal']
h_bus_details = scrape_bus_data(df_h)
# to_csv is called without index=False, so the row index is written as the first column.
h_bus_details.to_csv('h_bus_details.csv')

In [None]:
# Assam bus details: scrape every route in the Assam route table and save the result.
df_a=state_dataframes['Assam']
a_bus_details = scrape_bus_data(df_a)
# to_csv is called without index=False, so the row index is written as the first column.
a_bus_details.to_csv('a_bus_details.csv')

In [None]:
# Chandigarh bus details: scrape every route in the Chandigarh route table and save the result.
df_c=state_dataframes['Chandigarh']
c_bus_details = scrape_bus_data(df_c)
# to_csv is called without index=False, so the row index is written as the first column.
c_bus_details.to_csv('c_bus_details.csv')

In [None]:
# Punjab bus details: scrape every route in the Punjab route table and save the result.
df_p=state_dataframes['Punjab']
p_bus_details = scrape_bus_data(df_p)
# to_csv is called without index=False, so the row index is written as the first column.
p_bus_details.to_csv('p_bus_details.csv')

In [None]:
# Uttar Pradesh bus details: scrape every route in the Uttar Pradesh route table and save the result.
df_up=state_dataframes['Uttar Pradesh']
up_bus_details = scrape_bus_data(df_up)
# to_csv is called without index=False, so the row index is written as the first column.
up_bus_details.to_csv('up_bus_details.csv')