In [6]:
#importing libraries
import selenium
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import WebDriverException


In [10]:
#column order shared by every per-route DataFrame and by the final CSV
RESULT_COLUMNS = ["Bus_name","Bus Type","Departure Time","Arrival Time","Duration","Fare","Rating","Seat_Availability","Bus Link","Bus Route"]

#(By, selector) for each scraped field, listed in the same order as the
#first eight entries of RESULT_COLUMNS
FIELD_LOCATORS = [
    (By.XPATH, './/div[@class="travels lh-24 f-bold d-color"]'),                                    #bus name
    (By.XPATH, './/div[@class="bus-type f-12 m-top-16 l-color evBus"]'),                            #bus type
    (By.XPATH, './/div[@class="dp-time f-19 d-color f-bold"]'),                                     #departure time
    (By.XPATH, './/div[@class="bp-time f-19 d-color disp-Inline"]'),                                #arrival time
    (By.XPATH, './/div[@class="dur l-color lh-24"]'),                                               #duration
    (By.CSS_SELECTOR, 'span.f-19.f-bold'),                                                          #fare
    (By.XPATH, './/div[contains(@class, "rating-sec") and contains(@class, "lh-24")]'),             #rating
    (By.XPATH, './/div[contains(@class, "seat-left") and contains(@class, "m-top-30")]'),           #seat availability
]


def _field_text(card, by, selector):
    """Return the stripped text of the first element matching `selector` inside `card`.

    Returns None when the lookup fails, so one missing field never discards
    the rest of the row. Only Selenium errors are treated as "field missing";
    KeyboardInterrupt and programming errors still propagate.
    """
    try:
        return card.find_element(by, selector).text.strip()
    except WebDriverException:
        return None


#initialize chrome webdriver
driver = webdriver.Chrome()

#everything that uses the browser runs inside try/finally so the Chrome
#process is always shut down, even on an unexpected error or an interrupt
try:
    driver.maximize_window()

    #read URLS from csv to dataframe including bus link and bus route
    urls_df = pd.read_csv(r"C:\Users\jagadesh\Documents\Python Scripts\red bus data scraping project\df_redbus_all_links_routes.csv", usecols= ["bus route", "bus link"])

    #initialize a list to store all dataframes
    all_dfs = []

    #loop through each url in dataframe
    for i, row in urls_df.iterrows():
        url = str(row['bus link']).strip()  # Convert to string and strip whitespace
        bus_route = row['bus route']
        try:
            # Check if URL is valid (not NaN or malformed)
            if not url or url.lower() == 'nan':
                print(f"Skipping invalid URL at index {i}: {url}")
                continue

            #navigate to the URL
            driver.get(url)
            print(f"Scraping data from: {url}")

            #wait for the page to load
            time.sleep(10)

            #scroll down the page so lazily-loaded bus cards get rendered
            #NOTE(review): the loop stops at the first PAGE_DOWN that does not
            #grow document.body.scrollHeight, which may be before the real
            #bottom of the page -- confirm this loads every card
            actions = ActionChains(driver)
            last_height = driver.execute_script("return document.body.scrollHeight")

            while True:
                actions.send_keys(Keys.PAGE_DOWN).perform()
                time.sleep(5)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            #extract the web elements (one per bus card)
            results = driver.find_elements(By.XPATH, '//div[@class="clearfix bus-item"]')

            #one tuple per bus card: the eight scraped fields, then bus link and bus route
            data_list = []
            for result in results:
                fields = [_field_text(result, by, selector) for by, selector in FIELD_LOCATORS]
                data_list.append((*fields, url, bus_route))

            #create DataFrame from the list of tuples for the current URL
            df = pd.DataFrame(data_list, columns=RESULT_COLUMNS)

            #Append the DataFrame to the list of all DataFrames
            all_dfs.append(df)

        except WebDriverException as e:
            print(f"Error navigating to URL:{url}")
            print(str(e))#print the exception details for debugging purposes
finally:
    #close the selenium driver
    driver.quit()

#concatenate all DataFrames in the list into a single DataFrame;
#pd.concat raises ValueError on an empty list, so fall back to an empty
#frame with the expected columns when no route could be scraped
if all_dfs:
    total_results = pd.concat(all_dfs, ignore_index=True)
else:
    total_results = pd.DataFrame(columns=RESULT_COLUMNS)

#save the concatenated DataFrame to a CSV file
path=r"C:/Users/jagadesh/Documents/Python Scripts/red bus data scraping project/redbus_all_data.csv"
total_results.to_csv(path,index=False)

#print the final concatenated DataFrame
print(total_results)

Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-vijayawada
Scraping data from: https://www.redbus.in/bus-tickets/khammam-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-khammam
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-srisailam
Scraping data from: https://www.redbus.in/bus-tickets/karimnagar-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-karimnagar
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-mancherial
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-nirmal
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-adilabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-ongole
Scraping data from: https://www.redbus.in/bus-tickets/kothagudem-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/guntur-to-hyderabad
Scraping data from: https://www.redbus.in/bus-tickets/hyderabad-to-guntur
Scraping da

In [11]:
#bare last expression: the notebook renders total_results (built by the scraping cell) as a table
total_results

Unnamed: 0,Bus_name,Bus Type,Departure Time,Arrival Time,Duration,Fare,Rating,Seat_Availability,Bus Link,Bus Route
0,FRESHBUS,Electric A/C Seater (2+2),23:10,05:35,06h 25m,829,4.5,17 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
1,IntrCity SmartBus,Bharat Benz A/C Seater /Sleeper (2+1),23:50,05:35,05h 45m,579,4.4,14 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
2,Zingbus Plus,A/C Seater / Sleeper (2+1),23:38,06:05,06h 27m,834,4.5,30 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
3,AdIntrCity SmartBus,A/C Seater / Sleeper (2+1),23:05,06:00,06h 55m,839,4.4,20 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
4,IntrCity SmartBus,Scania AC Multi Axle Sleeper (2+1),23:59,06:30,06h 31m,1479,4.5,11 Seats available,https://www.redbus.in/bus-tickets/hyderabad-to...,Hyderabad to Vijayawada
...,...,...,...,...,...,...,...,...,...,...
1139,Samay Shatabdi Travels Pvt Ltd,AC Sleeper (2+1),22:01,06:16,08h 15m,1499,4.9,1 Seat available,https://www.redbus.in/bus-tickets/gopalganj-to...,Gopalganj (Bihar) to Lucknow
1140,Panwar Travels,A/C Sleeper (2+1),17:50,01:40,07h 50m,1850,4.8,26 Seats available,https://www.redbus.in/bus-tickets/gopalganj-to...,Gopalganj (Bihar) to Lucknow
1141,Baba Khatushyam Travels Pvt Ltd,A/C Seater / Sleeper (3+1),19:00,03:00,08h 00m,1050,1.5,56 Seats available,https://www.redbus.in/bus-tickets/gopalganj-to...,Gopalganj (Bihar) to Lucknow
1142,Travel Point World LLP,A/C Seater / Sleeper (2+2),20:20,03:20,07h 00m,1999,1.6,38 Seats available,https://www.redbus.in/bus-tickets/gopalganj-to...,Gopalganj (Bihar) to Lucknow
