## Importing required modules

In [1]:
import datetime as dt
import os
import time

import matplotlib
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.support.ui import WebDriverWait

### Import routes for searching the data
`Routes/routes.csv` lists the routes to search, one per row, with `From`, `To` and `Distance` columns.


In [2]:
# Load the route table (From, To, Distance). The separator is passed by
# keyword: pandas 2.x no longer accepts it as a positional argument.
routes = pd.read_csv("Routes/routes.csv", sep=',')
print(routes)

             From                  To  Distance
0       Bangalore          Ananthapur       214
1       Bangalore             Chennai       347
2       Bangalore          Coimbatore       365
3       Bangalore               Hubli       414
4       Bangalore           Hyderabad       569
5       Bangalore               Kochi       551
6       Bangalore            Kolhapur       616
7       Bangalore             Kurnool       360
8       Bangalore             Madurai       436
9       Bangalore           Mangalore       352
10      Bangalore              Mumbai       985
11      Bangalore             Nellore       388
12      Bangalore                Pune       841
13      Bangalore               Salem       205
14      Bangalore  Thiruvananthapuram       686
15      Bangalore      Tiruchirapalli       344
16      Bangalore               Udupi       405
17      Bangalore             Vellore       213
18      Bangalore          Vijayawada       664
19      Bangalore             Belgaum   

### Extract Data Function 

In [3]:
# Segmentation based upon Bus Type: segment -> bus-type labels as they appear
# in the scraped listing. Labels that are not listed here map to None.
# NOTE(review): 'A/C Seater / Sleeper (2+1)' maps to itself and
# 'Mercedes Multi-Axle Semi Sleeper (2+2)' sits under 'Foreign A/C Sleeper';
# both are kept exactly as in the original mapping - confirm they are intended.
_BUS_TYPE_SEGMENTS = {
    'Indian A/C Sleeper': [
        'A/C Sleeper (2+1)',
        'Bharat Benz A/C Sleeper (2+1)',
        'Capella A/C Sleeper (2+1)',
        'Corona A/C Sleeper (2+1)',
        'Luxura A/C Sleeper (2+1)',
    ],
    'Indian Non A/C Seater': [
        'NON A/C Seater (1+1+1)',
        'NON A/C Seater (2+2)',
        'NON A/C Airbus (2+2)',
        'NON A/C Semi Sleeper (2+2)',
    ],
    'Indian Non A/C Seater/Sleeper': ['NON A/C Seater / Sleeper (2+1)'],
    'A/C Seater / Sleeper (2+1)': ['A/C Seater / Sleeper (2+1)'],
    'Indian Non A/C Sleeper': ['NON A/C Sleeper (2+1)'],
    'Foreign A/C Seater/Sleeper': ['Volvo Multi-Axle A/C seater/sleeper (2+1)'],
    'Foreign A/C Seater': [
        'Scania Multi-Axle A/C Semi Sleeper (2+2)',
        'Volvo A/C B11R Multi Axle Semi Sleeper (2+2)',
        'Volvo Multi-Axle A/C Semi Sleeper (2+2)',
        'Volvo Multi-Axle I-Shift A/C Semi Sleeper (2+2)',
        'Volvo Multi-Axle I-Shift B11R Semi Sleeper (2+2)',
    ],
    'Foreign A/C Sleeper': [
        'Volvo Multi-Axle Sleeper A/C (2+1)',
        'Volvo Multi-Axle A/C Sleeper (2+1)',
        'Mercedes Multi-Axle Semi Sleeper (2+2)',
        'Volvo Multi-Axle I-Shift A/C Sleeper (2+1)',
        'Scania AC Multi Axle Sleeper (2+1)',
        'Volvo Multi-Axle I-Shift B11R Sleeper (2+1)',
    ],
}

# Flattened label -> segment lookup built once from the table above.
_SEGMENT_BY_BUS_TYPE = {
    label: segment
    for segment, labels in _BUS_TYPE_SEGMENTS.items()
    for label in labels
}

# Column order of the returned frame before the index is set.
_LISTING_COLUMNS = ['Travels', 'Bus Type', 'Bus Id', 'Dep Loc', 'Dep Time', 'Duration',
                    'Arrival Loc', 'Arrival Time', 'Seat Fare', 'Seats Left',
                    'Utilization', 'Rating', 'Time Extracted']


def _max_fare(fare_text):
    """Return the largest numeric token in ``fare_text`` as a float.

    Tokens are split on whitespace; a token counts as numeric when it consists
    of digits once its dots are removed. Returns NaN when no token parses,
    instead of raising from ``max()`` on an empty list.
    """
    prices = []
    for token in fare_text.split():
        if token.replace('.', '').isdigit():
            try:
                prices.append(float(token))
            except ValueError:
                # Tokens such as "1.2.3" pass the digit check but are not numbers.
                continue
    return max(prices) if prices else float('nan')


def _segment_bus_type(bus_type):
    """Map a scraped bus-type label to its segment, or None if it is unknown."""
    return _SEGMENT_BY_BUS_TYPE.get(bus_type)


def extract_data(driver):
    """Scrape the bus listing currently shown in ``driver`` into a DataFrame.

    Waits up to 10 seconds for an element with class ``bus-items``.

    Parameters
    ----------
    driver : selenium WebDriver positioned on a search-results page.

    Returns
    -------
    pandas.DataFrame or None
        One row per listed bus, indexed by ``('Bus Id', 'Time Extracted')``,
        with 'Seat Fare' reduced to the highest listed price and 'Bus Type'
        replaced by its segment. ``None`` if no listing appeared in time.

    Raises
    ------
    NoSuchElementException
        If a listed bus lacks one of the mandatory fields (everything except
        the rating, which falls back to "-").
    """
    wait = WebDriverWait(driver, 10)
    try:
        bus_list = wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'bus-items')))
    except TimeoutException:
        # No listing rendered within the timeout: report "no data" to the caller.
        return None

    # The XPath starts with '//' so it matches rows anywhere in the document,
    # not only below bus_list; this mirrors the document-wide seat-bar query below.
    bus_items = bus_list.find_elements(By.XPATH, "//li[contains(@class, 'row-sec')]")
    print("No. of busses found for the route : " + str(len(bus_items)))
    print("Fetching data...")

    # Output column -> CSS class of the element holding its text inside each row.
    text_fields = {
        'Travels': 'travels',
        'Bus Type': 'bus-type',
        'Dep Time': 'dp-time',
        'Dep Loc': 'dp-loc',
        'Duration': 'dur',
        'Arrival Time': 'bp-time',
        'Arrival Loc': 'bp-loc',
        'Seat Fare': 'seat-fare',
        'Seats Left': 'seat-left',
    }
    listing = {'Bus Id': [item.get_attribute("id") for item in bus_items]}
    for column, class_name in text_fields.items():
        listing[column] = [item.find_element(By.CLASS_NAME, class_name).text
                           for item in bus_items]

    # Utilization is derived from the rendered width of each seat bar.
    # NOTE(review): the bars are collected document-wide; if their count differs
    # from the number of rows, building the DataFrame below raises ValueError.
    seat_bars = driver.find_elements(By.XPATH, "//div[contains(@class, 'light-g-bar')]/div")
    listing['Utilization'] = [100 - under_utilization(bar.size['width'])
                              for bar in seat_bars]

    # Rating is optional: rows without a rating element get "-".
    ratings = []
    for item in bus_items:
        try:
            ratings.append(item.find_element(By.CLASS_NAME, "rating-sec").text)
        except NoSuchElementException:
            ratings.append("-")
    listing['Rating'] = ratings

    df_red_bus = pd.DataFrame(listing)
    df_red_bus['Time Extracted'] = dt.datetime.now().strftime("%d %b %y %H:%M")
    df_red_bus = df_red_bus[_LISTING_COLUMNS].set_index(['Bus Id', 'Time Extracted'])

    df_red_bus['Seat Fare'] = df_red_bus['Seat Fare'].apply(_max_fare)
    df_red_bus['Bus Type'] = df_red_bus['Bus Type'].apply(_segment_bus_type)

    return df_red_bus
    
    

In [4]:
# Under utilization function
def under_utilization(width, full_width=72):
    """Return the unfilled share of a seat bar as a percentage.

    Parameters
    ----------
    width : number
        Measured width of the bar.
    full_width : number, default 72
        Width that corresponds to 0% under-utilization.
        NOTE(review): 72 is presumably the pixel width of a full bar on the
        listing page - confirm against the site's markup.

    Returns
    -------
    float
        ``(full_width - width) / full_width * 100``.
    """
    ratio = (full_width - width) / full_width
    return ratio * 100


# Aggregating function
def _append_to_csv(file_path, new_data):
    """Append ``new_data`` to the CSV at ``file_path``, creating it if needed.

    The parent folder is created when missing. Only a missing or empty file is
    treated as "no previous data"; any other read failure propagates so that an
    existing file is never overwritten with just the new rows.
    """
    folder = os.path.dirname(file_path)
    if folder:
        os.makedirs(folder, exist_ok=True)
    try:
        previous_data = pd.read_csv(file_path, index_col=['Bus Id', 'Time Extracted'])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        combined_data = new_data
    else:
        combined_data = pd.concat([previous_data, new_data])
    combined_data.to_csv(file_path, index=True)


def aggregate(source_city, destination_city, new_data, extract_date, time_group):
    """Append ``new_data`` to the three running CSV files under ``Data/``.

    Files updated:
      * ``Data/<src>-<dst>/<src>_to_<dst>.csv``                  (per route)
      * ``Data/<src>-<dst>/<date>/<src>_to_<dst>_<date>.csv``    (per route and day)
      * ``Data/Complete_Data.csv``                               (everything)

    Parameters
    ----------
    source_city, destination_city : str
        Route endpoints, used to build the folder and file names.
    new_data : pandas.DataFrame
        Rows to append, indexed by ``('Bus Id', 'Time Extracted')``.
    extract_date : datetime.date
        Travel date; formatted as ``%d-%b-%Y`` for the per-day file.
    time_group : str
        Accepted for interface compatibility; not used here.

    Raises
    ------
    Exception
        Whatever pandas raises when an existing file cannot be read with the
        expected index columns (the file is left untouched in that case).
    """
    route_path = 'Data/' + source_city + '-' + destination_city
    route_name = source_city + '_to_' + destination_city
    date = extract_date.strftime("%d-%b-%Y")

    targets = [
        route_path + '/' + route_name + '.csv',
        route_path + '/' + date + '/' + route_name + "_" + date + '.csv',
        'Data/Complete_Data.csv',
    ]
    for file_path in targets:
        _append_to_csv(file_path, new_data)
        
        
    

In [7]:
## Chrome Driver options
options = webdriver.ChromeOptions()
# Notification content setting 2 (presumably "block" - confirm against Chrome's
# preference values) so the permission prompt does not interrupt the run.
prefs = {"profile.default_content_setting_values.notifications" : 2}
options.add_experimental_option("prefs", prefs)
# Run without a visible window. The command-line flag works on Selenium 3 and 4,
# whereas the `options.headless` setter was removed from recent Selenium 4 releases.
options.add_argument("--headless")
# The driver path is left to Selenium: "chromedriver" on PATH was already the
# default, and recent Selenium 4 releases no longer accept it positionally.
driver = webdriver.Chrome(options = options)

base_url = "https://www.redbus.in/"

def _annotate_route_frame(frame, origin, target, distance, extract_date,
                          day_label, time_group, weekend):
    """Add the route and date context columns to a scraped listing frame (in place)."""
    frame['From'] = origin
    frame['To'] = target
    frame['Distance'] = distance
    frame['Data for Date'] = extract_date.strftime("%d %b %Y")
    frame['Data for Day'] = day_label
    frame['Time Group'] = time_group
    frame['Weekday or Weekend'] = weekend
    frame['Seat Fare per km'] = frame['Seat Fare'] / distance
    print(frame.head())


def _save_route_frame(frame, origin, target, extract_date, time_group):
    """Write one snapshot CSV under Data/<route>/<date>/ and update the aggregates."""
    save_path = 'Data/' + origin + '-' + target + '/' + extract_date.strftime("%d-%b-%Y")
    os.makedirs(save_path, exist_ok=True)
    # NOTE(review): time_group is embedded verbatim in the file name; values such
    # as "11:00 pm" contain ':' which some file systems reject.
    frame.to_csv(save_path + '/' + origin + '_to_' + target + "_at_time_"
                 + time_group + '.csv', index=True)
    aggregate(origin, target, frame, extract_date, time_group)


def search_data(source_city, destination_city, distance, extract_date,
                time_group="default", serial=0, save=False):
    """Search one route on the site and scrape both travel directions.

    Uses the module-level ``driver`` and ``base_url``.

    Parameters
    ----------
    source_city, destination_city : str
        Typed into the search form; the first autocomplete suggestion is chosen.
    distance : number
        Route distance, stored on every row and used for 'Seat Fare per km'.
    extract_date : datetime.date
        Travel date to search for.
    time_group : str, default "default"
        Label stored in the 'Time Group' column.
    serial : int, default 0
        The onward date is picked in the calendar only when this is 0; later
        calls presumably rely on the site remembering the date - confirm.
    save : bool, default False
        When True, each scraped frame is written under ``Data/`` and passed to
        ``aggregate``. The default keeps persistence switched off.

    Returns
    -------
    bool
        Always True; failures surface as Selenium exceptions.
    """
    driver.get(base_url)
    wait = WebDriverWait(driver, 10)

    # Enter the source city and take the first suggestion.
    source = driver.find_element(By.ID, 'src')
    source.send_keys(source_city + " ")
    wait.until(ec.element_to_be_clickable(
        (By.XPATH, "//*[@id='search']/div/div[1]/div/ul/li[1]"))).click()

    # Enter the destination city and take the first suggestion.
    destination = driver.find_element(By.ID, 'dest')
    destination.send_keys(destination_city + " ")
    wait.until(ec.element_to_be_clickable(
        (By.XPATH, "//*[@id='search']/div/div[2]/div/ul/li[1]"))).click()

    # Pick the onward date in the calendar widget (first search only).
    if serial == 0:
        # NOTE(review): the row is derived from the day of month alone and ignores
        # which weekday the month starts on, so it may select the wrong week row -
        # verify against the calendar markup.
        week_row = int(extract_date.strftime("%d")) // 7 + 4
        # Column 1..7 = Monday..Sunday ("%w" reports Sunday as 0).
        day_col = int(extract_date.strftime("%w")) or 7
        date_path = ("//*[@id='rb-calendar_onward_cal']/table/tbody/tr[" + str(week_row)
                     + "]/td[" + str(day_col) + "]")
        wait.until(ec.element_to_be_clickable((By.XPATH, date_path))).click()

    # Search for data
    wait.until(ec.element_to_be_clickable((By.ID, 'search_btn'))).click()

    # Friday (5) is counted as weekend together with Saturday (6) and Sunday (0).
    weekday = int(extract_date.strftime("%w"))
    day_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    weekend = "Weekend" if weekday in [0, 5, 6] else "Weekday"

    # Extract data for route
    df_red_bus = extract_data(driver)
    if df_red_bus is not None:
        _annotate_route_frame(df_red_bus, source_city, destination_city, distance,
                              extract_date, day_name[weekday], time_group, weekend)
        if save:
            _save_route_frame(df_red_bus, source_city, destination_city,
                              extract_date, time_group)

    print("Fetching data for reverse route")

    # Swap source and destination through the "modify search" panel and re-run.
    wait.until(ec.element_to_be_clickable((By.CLASS_NAME, "onward-modify-btn"))).click()
    wait.until(ec.element_to_be_clickable((By.CLASS_NAME, "icon-doublearrow"))).click()
    wait.until(ec.element_to_be_clickable((By.XPATH, "//button[contains(@class, 'ms-btn')]"))).click()

    df_red_bus_reverse = extract_data(driver)
    if df_red_bus_reverse is not None:
        _annotate_route_frame(df_red_bus_reverse, destination_city, source_city, distance,
                              extract_date, day_name[weekday], time_group, weekend)
        if save:
            _save_route_frame(df_red_bus_reverse, destination_city, source_city,
                              extract_date, time_group)

    return True
    

### Main scraper function 

In [None]:
serial = 0


def _extraction_window(now):
    """Return ``(extract_date, time_group)`` for a run that starts at ``now``.

    * 22:30 to midnight  -> tomorrow's date, "11:00 pm"
    * 17:00 to 19:00     -> today, "6:00 pm"
    * 20:00 to 22:00     -> today, "9:00 pm"
    * any other time     -> today, "11:00 pm" (the original fallback label)

    Comparison is on the time of day. Comparing ``now`` with
    ``now.replace(hour=..., minute=...)`` keeps the current seconds, which
    shifted every boundary by up to a minute and sent 23:59 back to "today".
    """
    clock = now.time()
    if clock >= dt.time(22, 30):
        return now.date() + dt.timedelta(days=1), "11:00 pm"
    if dt.time(17, 0) <= clock < dt.time(19, 0):
        return now.date(), "6:00 pm"
    if dt.time(20, 0) <= clock < dt.time(22, 0):
        return now.date(), "9:00 pm"
    return now.date(), "11:00 pm"


def scraper(serial):
    """Search every route in the module-level ``routes`` table.

    Parameters
    ----------
    serial : int
        Counter forwarded to ``search_data`` and incremented after each route;
        ``search_data`` picks the calendar date only while it is 0.
    """
    # AT run time
    extract_date, time_group = _extraction_window(dt.datetime.now())
    print("Extracting data for date : " + extract_date.strftime("%d %b %Y"))

    ## Start extracting data
    ## Slice the zipped columns below to search for particular routes only
    legs = zip(routes['From'], routes['To'], routes['Distance'])
    for number, (origin, target, distance) in enumerate(legs, start=1):
        print("=========" + str(number) + " Searching data for route: " + origin + " to " + target
              + "=========")
        print("Distance : " + str(distance))
        search_data(origin, target, distance, extract_date, time_group, serial)
        print(str(number) + " Completed route: " + origin + " to " + target)
        serial += 1
    