In [92]:
import os

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select

os.environ['WDM_LOG_LEVEL'] = '0'


class browser:
    """Small helper that configures and launches a Chrome WebDriver.

    Attributes:
        url: URL supplied by the caller (stored only; nothing here loads it).
        headless: when true, Chrome is started with ``--headless``.
        no_image: when true, image loading is disabled through Chrome prefs.
        select_elements: empty dict reserved for callers.
        driver: the WebDriver created by ``setup()``; ``None`` until then.
    """

    def __init__(self, URL=None, headless=True, no_image=True) -> None:
        self.url = URL
        self.headless = headless
        self.no_image = no_image
        self.select_elements = {}
        # Defined up front so close() is safe even when setup() never ran.
        self.driver = None

    def setup(self):
        """Create the Chrome driver according to ``headless`` / ``no_image``."""
        options = Options()
        if self.headless:
            options.add_argument('--headless')
        if self.no_image:
            # Value 2 blocks images in Chrome's content settings.
            prefs = {"profile.managed_default_content_settings.images": 2}
            options.add_experimental_option("prefs", prefs)
        # Always build the driver: previously it was only created inside the
        # ``no_image`` branch, so ``no_image=False`` left the object driverless.
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        print('starting the browser')

    def close(self):
        """Quit the browser session, if one was started."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None


In [1]:
import os
import re
import time
import urllib.request

import pandas as pd
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from webdriver_manager.chrome import ChromeDriverManager

In [521]:
import unicodedata
import re
def slugify(value, allow_unicode=False):
    """
    Turn ``value`` into a slug (adapted from Django's ``django/utils/text.py``,
    https://github.com/django/django/blob/master/django/utils/text.py).

    The value is converted to ``str`` and normalised: NFKC when
    ``allow_unicode`` is true, otherwise NFKD followed by dropping everything
    that is not ASCII. The text is then lowercased, characters other than word
    characters, whitespace and hyphens are removed, every run of whitespace
    and/or hyphens becomes one hyphen, and leading/trailing hyphens and
    underscores are stripped.
    """
    text = str(value)
    if not allow_unicode:
        decomposed = unicodedata.normalize('NFKD', text)
        text = decomposed.encode('ascii', 'ignore').decode('ascii')
    else:
        text = unicodedata.normalize('NFKC', text)

    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s-]', '', lowered)
    hyphenated = re.sub(r'[-\s]+', '-', without_punctuation)
    return hyphenated.strip('-_')

def remove_none(L):
    """Return a new list with the items of ``L`` that are not ``None``, in order."""
    kept = []
    for item in L:
        if item is None:
            continue
        kept.append(item)
    return kept

class delhi_hc_search:
    """Selenium-driven bulk scraper for the Delhi High Court case-status search.

    Typical use: build the object with a case-type index and a year, call
    ``start_scraping()`` and finally ``close()``. While scraping, every result
    page is parsed into dictionaries, each case's order page is opened in a
    new window, and the order PDFs are saved in the current working directory.
    """

    # Prefix put in front of the relative order-page links found in onclick handlers.
    BASE_URL = "http://delhihighcourt.nic.in/"

    def __init__(self, case_type, case_year, case_no=None, headless=True, no_image=True, delay=1, ret=False):
        """
        The Delhi High court website provides four ways to look for case data:
        1. Using Case type
        2. Using petitioner and respondents information
        3. Using Advocate information
        4. Using diary no. information

        This object provides a way to look for the bulk data using case type.

        case_type: int, position of the wanted entry in the case-type dropdown
                   (it is passed to ``Select.select_by_index``)
        case_year: int, year of registering the case in yyyy
        case_no: int, optional case no. registered in the high court
        headless: bool, start Chrome with ``--headless``
        no_image: bool, disable image loading in Chrome
        delay: number of seconds slept between form-filling steps
        ret: bool, when true ``start_scraping`` returns the parsed page data

        Raises AssertionError if ``case_no`` is given but is not an int, or if
        ``case_year`` is not a four-digit int.
        """
        # Validate first so that a bad call does not leave a half-built object.
        if case_no:
            assert type(case_no) is int, "Case no. must be an integer"
        assert type(case_year) is int and len(str(case_year)) == 4, \
            "Case year must be an integer in YYYY format"

        self.url = "http://delhihighcourt.nic.in/case.asp"
        self.case_type = case_type
        self.case_no = case_no
        self.case_year = str(case_year)

        # Browser configuration (headless / no_image reduce Chrome's overhead).
        self.headless = headless
        self.no_image = no_image
        self.driver = None  # created by setup()

        # XPaths telling the scraper where each piece of the search form lives.
        self.elements = {}
        self.elements['case_type'] = "//*[@id='InnerPageContent']/form[1]/select[1]"
        self.elements['case_type_elem'] = "//*[@id='InnerPageContent']/form[1]/select[1]"
        self.elements['case_no'] = "//*[@id='InnerPageContent']/form[1]/input[1]"
        self.elements['case_year_elem'] = "//*[@id='c_year']"
        self.elements['captcha_text_input'] = "//*[@id='inputdigit']"
        self.elements['captcha_text_value'] = "//*[@id='InnerPageContent']/form[1]/label[4]"
        self.elements['search_but'] = "//*[@id='InnerPageContent']/form[1]/button"

        # Keeping a global name for case_type_select element
        self.case_type_select = None
        self.case_type_options = None
        self.delay = delay
        self.ret = ret
        self.order_subpage_data = []

    # ------------------------------------------------------------------ helpers

    def _delay(self):
        """Sleep for ``self.delay`` seconds, usually to avoid overloading the server."""
        time.sleep(self.delay)

    @staticmethod
    def _onclick_target(onclick):
        """Return the target of a ``location.href='...'`` onclick handler, unquoted."""
        target = str(onclick).replace('location.href=', '').strip().rstrip(';').strip()
        if len(target) >= 2 and target[0] == target[-1] and target[0] in ('\'', '"'):
            target = target[1:-1]
        return target

    def _other_window(self, *known_handles):
        """Return the first open window handle that is not one of ``known_handles``."""
        return [handle for handle in self.driver.window_handles if handle not in known_handles][0]

    def _queue_nav_links(self):
        """Queue unvisited pagination links of the current page and refresh ``valid_links``."""
        for element in self.driver.find_elements_by_class_name('archivelink'):
            link = element.get_attribute('href')
            if link not in self.page_visited:
                self.page_not_visited.append(link)
        self.valid_links = remove_none(list(set(self.page_not_visited) - set(self.page_visited)))

    # ------------------------------------------------------------- search form

    def get_case_type_options(self):
        """Store the dropdown entries as ``(index, text)`` pairs in ``case_type_options``."""
        case_type_select = Select(self.driver.find_element_by_xpath(self.elements['case_type']))
        self.case_type_options = [(ind, opt.text) for ind, opt in enumerate(case_type_select.options)]

    def get_search_page(self):
        """Start a browser, load the search page and read the case-type options."""
        self.setup()
        self.driver.get(self.url)
        self.get_case_type_options()

    def get_captcha_text(self):
        """Read the captcha value: the third space-separated token of the captcha label."""
        label = self.driver.find_element_by_xpath(self.elements['captcha_text_value'])
        self.captcha_text_val = label.text.split(" ")[2]

    def set_case_type(self):
        """Select the dropdown entry at index ``self.case_type``."""
        case_type_select = Select(self.driver.find_element_by_xpath(self.elements['case_type']))
        case_type_select.select_by_index(self.case_type)
        print(f'Selecting case type {self.case_type_options[self.case_type][1]}')

    def set_case_no(self):
        """Type the case number into the form; does nothing when no case number was given."""
        if self.case_no:
            case_no_input = self.driver.find_element_by_xpath(self.elements['case_no'])
            case_no_input.send_keys(self.case_no)
            print(f'Selecting case no {self.case_no}')

    def set_case_year(self):
        """Select the registration year in the year dropdown."""
        case_year_select = Select(self.driver.find_element_by_xpath(self.elements['case_year_elem']))
        case_year_select.select_by_value(self.case_year)

    def set_captcha_text(self):
        """Type the value read by ``get_captcha_text`` into the captcha box."""
        captcha_text = self.driver.find_element_by_xpath(self.elements['captcha_text_input'])
        captcha_text.send_keys(self.captcha_text_val)

    def set_param(self):
        """Open the search page and fill in every form field, pausing between steps."""
        self.get_search_page()
        self._delay()
        self.set_case_type()
        self._delay()
        self.set_case_no()
        self._delay()
        self.set_case_year()
        self._delay()
        self.get_captcha_text()
        self.set_captcha_text()

    def get_search_results(self):
        """Fill in the form, submit it and store the number of results in ``search_count``."""
        self.set_param()
        search_but = self.driver.find_element_by_xpath(self.elements['search_but'])
        search_but.click()
        self.get_search_count()

    def get_search_count(self):
        """Parse the integer after the last ``:`` of the result-count text."""
        count_text = self.driver.find_element_by_xpath("//*[@id='InnerPageContent']/span").text
        self.search_count = int(count_text.split(":")[-1].strip())

    # ---------------------------------------------------------------- scraping

    def start_scraping(self, order_links=None, pdf_links=None):
        """Crawl every result page, parse it and download the orders of each case.

        order_links / pdf_links: optional lists that are extended with the order
            links found. They default to fresh lists; the previous mutable
            defaults were shared between calls.

        Sets ``self.scraped_data`` to a DataFrame with one row per parsed case
        and returns the list of per-page dictionaries when ``self.ret`` is
        true, otherwise ``None``.
        """
        if order_links is None:
            order_links = []
        if pdf_links is None:
            pdf_links = []

        self.get_search_results()
        print(f'Total search results: {self.search_count}')
        self.page_visited = []
        self.page_not_visited = []
        # Seed the queue with the pagination links of the first result page.
        self._queue_nav_links()

        page_counter = 0
        data = []
        while self.valid_links:
            print(f'No. of pages_visited: {page_counter}', end='\r')
            page_url = self.valid_links[0]
            self.driver.get(page_url)
            self.page_visited.append(page_url)
            current_page = self.driver.page_source

            dat = self.parse_status_html(current_page)
            data.append(dat)
            self.oj_details = self.get_order_page_links(current_page)
            main_window = self.driver.current_window_handle

            for i, oj in enumerate(self.oj_details):
                if oj is None:
                    continue  # button without an onclick handler
                # The prefix names every order file of this case; fall back to
                # a positional name when no parsed row matches the button.
                record = dat.get('n_{}'.format(i))
                if record is not None:
                    order_prefix = slugify(record['diary_no'])
                else:
                    order_prefix = 'page-{}-order-{}'.format(page_counter, i)

                # Only this case's links are downloaded. Passing the running
                # list used to re-download all earlier orders under the
                # current case's prefix.
                case_links = self._scrape_order_page(oj, order_prefix, main_window)
                order_links.extend(case_links)
                pdf_links.extend(case_links)

            self._queue_nav_links()
            page_counter += 1

        frames = [pd.DataFrame(d).T for d in data]
        # pd.concat raises on an empty list, e.g. when the search had no pages.
        self.scraped_data = pd.concat(frames) if frames else pd.DataFrame()
        return data if self.ret else None

    def _scrape_order_page(self, url, order_prefix, main_window):
        """Open one order page in a new window, parse it, download its PDFs, return its links."""
        # Pass the URL as a script argument rather than formatting it into JavaScript source.
        self.driver.execute_script("window.open(arguments[0])", url)
        oj_window = self._other_window(main_window)
        self.driver.switch_to.window(oj_window)
        try:
            order_subpage = self.driver.page_source
            self.order_subpage_data.append(self.parse_orders_page(order_subpage))
            soup = bs(order_subpage, 'html.parser')
            case_links = [self._onclick_target(button.get('onclick'))
                          for button in soup.find_all('button', {'class': 'LongCaseNoBtn'})
                          if button.get('onclick') is not None]
            self.download_pdfs(case_links, order_prefix, oj_window, main_window)
            return case_links
        finally:
            # Always close the order window so a failure does not leak windows.
            self.driver.switch_to.window(oj_window)
            self.driver.close()
            self.driver.switch_to.window(main_window)

    def download_pdfs(self, pdf_links, order_prefix, oj_window, main_window):
        """Save every link in ``pdf_links`` as ``{order_prefix}_{n}.pdf`` in the working directory.

        Each link is opened in a new window, the ``src`` of the page's
        top-level iframe is downloaded, and the window is closed again. The
        driver is left focused on ``oj_window``.
        """
        for order_count, link in enumerate(pdf_links):
            self.driver.execute_script("window.open(arguments[0])", link)
            pdf_win_handle = self._other_window(oj_window, main_window)
            self.driver.switch_to.window(pdf_win_handle)
            try:
                url = self.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')
                urllib.request.urlretrieve(url, f"{order_prefix}_{order_count}.pdf")
            finally:
                self.driver.close()
                self.driver.switch_to.window(oj_window)

    # ----------------------------------------------------------------- parsing

    def clean_text(self, text):
        """Drop non-breaking spaces and newlines, collapse tab/space runs, and strip the ends."""
        text = text.replace(u'\xa0', u'').replace('\n', '')
        text = re.sub(' +', ' ', re.sub('\t+', '\t', text))
        return text.strip()

    def merge_alternate_list(self, lst1, lst2):
        """Interleave two lists as ``lst1[0], lst2[0], lst1[1], ...``.

        Leftover items of the longer list are kept. Previously the trailing
        item of ``lst1`` was dropped, and a longer ``lst2`` raised IndexError.
        """
        merged = []
        for idx in range(max(len(lst1), len(lst2))):
            if idx < len(lst1):
                merged.append(lst1[idx])
            if idx < len(lst2):
                merged.append(lst2[idx])
        return merged

    def parse_status_html(self, html):
        """Parse a result page into ``{'n_<i>': {...}}``, one entry per case row.

        Each entry holds sr_no, diary_no, case_status1, petitioner, respondent,
        advocate, court_no, case_status2 and judgement_date. The last three are
        empty strings when the fourth span does not match the expected layout.
        """
        soup = bs(html, 'html.parser')
        li_odd = soup.find_all('li', {"class": "clearfix odd"})
        li_even = soup.find_all('li', {"class": "clearfix even"})
        rows = self.merge_alternate_list(li_odd, li_even)

        case_status_data = {}
        for i, row in enumerate(rows):
            # NOTE(review): ``select`` takes a CSS selector; the ``attrs`` keyword
            # looks like it belongs to ``find_all`` - confirm whether it has any effect.
            spans = row.select('span', attrs={'class': re.compile('^title*')})

            sr_no = self.clean_text(spans[0].get_text()).replace(".", "")

            s1 = self.clean_text(spans[1].get_text())
            status_match = re.search(r'\[.*?\]', s1)
            if status_match:
                case_status1 = status_match.group(0)
                diary_no = s1[:status_match.start()].replace('\t', '')
            else:
                case_status1 = ''
                diary_no = s1.replace('\t', '')

            s2 = self.clean_text(spans[2].get_text())
            advocate_parts = s2.split('Advocate :')
            advocate = advocate_parts[-1]
            # partition tolerates a missing or repeated 'Vs.' instead of raising.
            petitioner, _, respondent = advocate_parts[0].partition('Vs.')

            s3 = self.clean_text(spans[3].get_text())
            try:
                court_no, case_status2, judgement_date = re.findall(r'^\D*(\d)\s(\D*)\s(\d+/\d+/\d+)', s3)[0]
            except IndexError:
                # No match: the row does not carry court / status / date details.
                court_no, case_status2, judgement_date = "", "", ""

            case_status_data[f'n_{i}'] = {
                'sr_no': sr_no,
                'diary_no': diary_no,
                'case_status1': case_status1,
                'petitioner': petitioner,
                'respondent': respondent,
                'advocate': advocate,
                'court_no': court_no,
                'case_status2': case_status2,
                'judgement_date': judgement_date,
            }
        return case_status_data

    def get_order_page_links(self, html):
        """Return one order-page URL per ``button pull-right`` button found in ``html``.

        The list is positional: a button without an onclick handler yields
        ``None`` so that indices still line up with the parsed case rows.
        """
        soup = bs(html, 'html.parser')
        links = []
        for button in soup.find_all('button', {'class': 'button pull-right'}):
            onclick = button.get('onclick')
            if onclick is None:
                links.append(None)
            else:
                links.append(self.BASE_URL + self._onclick_target(onclick))
        return links

    def parse_orders_page(self, html):
        """Parse the orders page of a case into ``{'n_<i+1>': {...}}``.

        The Delhi High court website has a case status page and a button which
        gives details of all the orders related to that case. Every
        ``clearfix odd`` row of that page becomes an entry with serial no,
        case no, date of order and corrigendum text.
        """
        soup = bs(html, 'html.parser')
        rows = soup.find_all('li', {"class": "clearfix odd"})

        orders = {}
        for i, row in enumerate(rows):
            spans = row.select('span', attrs={'class': re.compile('^title*')})
            orders[f'n_{i + 1}'] = {
                'sr_no': self.clean_text(spans[0].get_text()).replace(".", ""),
                'case_no': self.clean_text(spans[1].get_text()),
                'date_of_order': self.clean_text(spans[2].get_text()),
                'corrigendum': self.clean_text(spans[3].get_text()),
            }
        return orders

    # ----------------------------------------------------------------- browser

    def setup(self):
        """Create the Chrome driver used by every other method."""
        options = Options()
        if self.headless:
            options.add_argument('--headless')

        # NOTE(review): --disable-web-security and --ignore-certificate-errors
        # weaken browser security; keep this driver for the court site only.
        for flag in ("--disable-xss-auditor",
                     "--disable-web-security",
                     "--allow-running-insecure-content",
                     "--no-sandbox",
                     "--disable-setuid-sandbox",
                     "--disable-webgl",
                     "--ignore-certificate-errors",
                     "--disable-popup-blocking"):
            options.add_argument(flag)

        # All prefs go into ONE dict: a second add_experimental_option("prefs", ...)
        # call replaces the first, which used to discard the PDF-viewer settings.
        prefs = {"plugins.plugins_list": [{"enabled": False, "name": "Chrome PDF Viewer"}],
                 # Disable Chrome's PDF Viewer
                 "download.extensions_to_open": "applications/pdf"}
        if self.no_image:
            prefs["profile.managed_default_content_settings.images"] = 2
        options.add_experimental_option("prefs", prefs)

        # Always create the driver; it used to be created only when no_image was true.
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
        print('Browser setup complete')

    def close(self):
        """Quit the browser session, if one was started."""
        if self.driver is not None:
            self.driver.quit()
            self.driver = None


In [523]:
from soscipy.utilities import progress_bar

In [530]:
# Smoke-test the imported progress bar helper by rendering it at 10% completion.
progress_bar.update_progress(0.1)

Percent Completion: [#---------] 10.00% 

## Run the scraper with `start_scraping`

In [522]:
# Build a scraper for case-type index 4 and year 2020 in a visible (non-headless)
# browser with no delay between form steps, then crawl every result page.
delhi = delhi_hc_search(case_type=4,case_no=None,case_year=2020,headless=False,delay=0)
delhi.start_scraping()







[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/Users/saurabhkarn/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Browser setup complete
Selecting case type BAIL APPLN. - [BAILA]
Total search results: 4244


KeyboardInterrupt: 

In [511]:
# Scratch check of a chained difference, evaluated left to right.
# NOTE(review): a, b and c are not defined in any cell shown here - this cell relies on leftover kernel state.
a - b - c

{1}

In [476]:
# Quit the browser session owned by the scraper.
delhi.close()

In [447]:
# Step 1: open the search page, submit the filled-in form and report how many results matched.
delhi.get_search_results()
print(f'Total search results: {delhi.search_count}')







[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/Users/saurabhkarn/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Browser setup complete
Selecting case type BAIL APPLN. - [BAILA]
Total search results: 4244


In [None]:
# Draft of delhi_hc_search.start_scraping, run against the live `delhi` object and
# capped at three result pages. It needs delhi.valid_links, delhi.page_visited and
# delhi.page_not_visited to have been seeded by an earlier cell.
data = []
page_counter = 0
while delhi.valid_links:
    delhi.driver.get(delhi.valid_links[0])
    delhi.page_visited.append(delhi.valid_links[0])
    current_page = delhi.driver.page_source
    #Get case status data parsed
    dat = delhi.parse_status_html(current_page)
    data.append(dat)
    soup = bs(current_page,'html5lib')
    oj_details_elem = soup.find_all('button',{'class':'button pull-right'})
    oj_details = [str(elem.get('onclick')).replace(str(elem.get('onclick'))[-1],'').replace('location.href=','http://delhihighcourt.nic.in/')
                    for elem in oj_details_elem]

    main_window = delhi.driver.current_window_handle

    for i, oj_det in enumerate(oj_details):
        # Name the files after the case this button belongs to, not always the first case.
        order_prefix = slugify(dat['n_{}'.format(i)]['diary_no']) #This will later be used to name all judgement files
        order_count = 0

        #Get order's link
        delhi.driver.execute_script("window.open('{}')".format(oj_det))
        oj_window = [handle for handle in delhi.driver.window_handles if handle != main_window][0]
        delhi.driver.switch_to.window(oj_window)

        #get information of orders
        order_subpage = delhi.driver.page_source
        soup = bs(order_subpage,'html5lib')
        oj_subp_elem = soup.find_all('button',{'class':'LongCaseNoBtn'})
        oj_subp_details = [str(elem.get('onclick')) for elem in oj_subp_elem]
        order_links=[]

        #Clean string and create a list
        for string in oj_subp_details:
            string = string.replace('location.href=','')
            if string.startswith('\'') and string.endswith('\''):
                string = string[1:-1]
            order_links.append(string)

        #get links to PDFs (only this case's, so earlier orders are not downloaded again)
        pdf_links = list(order_links)

        #save PDFs in a folder
        for link in pdf_links:
            delhi.driver.execute_script("window.open('{}')".format(link))
            pdf_window = [handle for handle in delhi.driver.window_handles
                          if handle not in (main_window, oj_window)][0]
            delhi.driver.switch_to.window(pdf_window)
            url = delhi.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')
            urllib.request.urlretrieve(url, f"{order_prefix}_{order_count}.pdf")
            order_count += 1  # without this every order overwrote "<prefix>_0.pdf"
            delhi.driver.close()
            # Go back to the order window, so the close() below does not hit the main window.
            delhi.driver.switch_to.window(oj_window)

        delhi.driver.close()
        delhi.driver.switch_to.window(main_window)

    #Get other navigation links
    nav_links_curr_page = [element.get_attribute('href')
                           for element in
                           delhi.driver.find_elements_by_class_name('archivelink')]
    for link in nav_links_curr_page:
        if link not in delhi.page_visited:
            delhi.page_not_visited.append(link)
    delhi.valid_links = remove_none(list(set(delhi.page_not_visited) - set(delhi.page_visited)))
    page_counter += 1
    if page_counter == 3:
        break

In [451]:
# Parse the case-status rows of whatever page the live browser is currently showing.
html = delhi.driver.page_source
dat = delhi.parse_status_html(html)

In [466]:
# File-name prefix for downloaded orders, built from the diary number of the first parsed case (key 'n_0').
order_prefix = slugify(dat['n_{}'.format(0)]['diary_no'])

In [467]:
# Collect the order-page URL behind every 'button pull-right' button of the page held in `html`.
# Each onclick value has all occurrences of its last character removed, then its
# 'location.href=' prefix is swapped for the site root.
soup = bs(html,'html5lib')
oj_details_elem = soup.find_all('button',{'class':'button pull-right'})
oj_details = [str(elem.get('onclick')).replace(str(elem.get('onclick'))[-1],'').replace('location.href=','http://delhihighcourt.nic.in/')
                for elem in oj_details_elem]

In [468]:
# Open the first order-page URL in a new browser window.
delhi.driver.execute_script("window.open('{}')".format(oj_details[0]))

In [469]:
# Switch to the order window.
# NOTE(review): assumes the newly opened window is the second handle - confirm when more windows are open.
delhi.driver.switch_to.window(delhi.driver.window_handles[1])

In [470]:
# Grab the HTML of the window that currently has focus and parse it with html5lib.
order_subpage = delhi.driver.page_source
soup = bs(order_subpage,'html5lib')

In [471]:
# Read the onclick handler of every 'LongCaseNoBtn' button on the parsed order page.
oj_subp_elem = soup.find_all('button',{'class':'LongCaseNoBtn'})
oj_subp_details = [str(elem.get('onclick')) for elem in oj_subp_elem]
order_links=[]

# Drop the 'location.href=' prefix and the surrounding single quotes from each handler.
for string in oj_subp_details:
    string = string.replace('location.href=','')
    if string.startswith('\'') and string.endswith('\''):
        string = string[1:-1]
    order_links.append(string)

# Append the cleaned links to pdf_links.
# NOTE(review): pdf_links is not created in this cell - it must already exist in the kernel.
for order in order_links:
    pdf_links.append(order)

In [472]:
# Same extraction as the previous cell, but re-reading the page from the second window handle first.
delhi.driver.switch_to.window(delhi.driver.window_handles[1])
html = delhi.driver.page_source
soup = bs(html,'html5lib')
oj_details_elem = soup.find_all('button',{'class':'LongCaseNoBtn'})
oj_details = [str(elem.get('onclick')) for elem in oj_details_elem]

# Drop the 'location.href=' prefix and the surrounding single quotes from each handler.
order_links=[]
for string in oj_details:
    string = string.replace('location.href=','')
    if string.startswith('\'') and string.endswith('\''):
        string = string[1:-1]
    order_links.append(string)

# NOTE(review): pdf_links is not created in this cell - it must already exist in the kernel.
for order in order_links:
    pdf_links.append(order)

In [None]:
# Open the first PDF link in a new window and focus the last window handle.
link = pdf_links[0]
delhi.driver.execute_script("window.open('{}')".format(link))
window_handles = delhi.driver.window_handles
delhi.driver.switch_to.window(window_handles[len(window_handles)-1])

# Download the document referenced by the page's top-level iframe.
# NOTE(review): `c` and `urllib` are not defined in this cell - both come from other cells of the notebook.
url = delhi.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')
urllib.request.urlretrieve(url, f"{order_prefix}_{c}.pdf")

In [442]:
# Close the browser window that currently has focus.
delhi.driver.close()

In [437]:
# Download every link in pdf_links as filename_<c>.pdf, one temporary window per link.
c = 0
for link in pdf_links:
    delhi.driver.execute_script("window.open('{}')".format(link))
    window_handles = delhi.driver.window_handles
    # Focus the last handle - presumably the window just opened; TODO confirm handle order.
    delhi.driver.switch_to.window(window_handles[len(window_handles)-1])
    url = delhi.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')
    urllib.request.urlretrieve(url, f"filename_{c}.pdf")
    c += 1
    # Close the PDF window and return to the first remaining handle.
    delhi.driver.close()
    delhi.driver.switch_to.window(delhi.driver.window_handles[0])

In [None]:
# Step 2: seed the crawl queue with the pagination links of the current result page.
# The state lives on the scraper object (`delhi`), not on its WebDriver:
# `delhi.driver.driver` does not exist and raised AttributeError.
delhi.page_visited = []
delhi.page_not_visited = []
# Seed the pages not visited yet.
nav_links_curr_page = [element.get_attribute('href')
                       for element in
                       delhi.driver.find_elements_by_class_name('archivelink')]

for link in nav_links_curr_page:
    delhi.page_not_visited.append(link)

delhi.valid_links = remove_none(list(set(delhi.page_not_visited) - set(delhi.page_visited)))

In [None]:
# Step 3: visit the valid links. For testing, only the first page is fetched here.
# `self` only exists inside the class, so this cell works on the live `delhi` object.
data = []
delhi.driver.get(delhi.valid_links[0])
delhi.page_visited.append(delhi.valid_links[0])
current_page = delhi.driver.page_source
# Parse the case-status rows of that page.
data.append(delhi.parse_status_html(current_page))


In [None]:
# Draft of the pagination loop of delhi_hc_search.start_scraping, capped at three pages.
# `self` and `return` only work inside a method, so this cell works on the live `delhi`
# object. Its valid_links / page_visited / page_not_visited must have been seeded first.
page_counter = 0
data = []
while delhi.valid_links:
    delhi.driver.get(delhi.valid_links[0])
    delhi.page_visited.append(delhi.valid_links[0])
    current_page = delhi.driver.page_source
    #Get case status data parsed
    data.append(delhi.parse_status_html(current_page))

    #Get order's link

    #get information of orders

    #get links to PDFs

    #save PDFs in a folder

    #Get other navigation links
    nav_links_curr_page = [element.get_attribute('href')
                           for element in
                           delhi.driver.find_elements_by_class_name('archivelink')]
    for link in nav_links_curr_page:
        if link not in delhi.page_visited:
            delhi.page_not_visited.append(link)
    delhi.valid_links = remove_none(list(set(delhi.page_not_visited) - set(delhi.page_visited)))
    page_counter += 1
    if page_counter == 3:
        break

# One row per parsed case; show only a preview.
delhi.scraped_data = pd.concat([pd.DataFrame(d).T for d in data])
delhi.scraped_data.head()

In [323]:
# Quit the browser session owned by the scraper.
delhi.close()

In [410]:
# Fresh scraper: case-type index 4, year 2020, visible browser, no delay between form steps.
delhi = delhi_hc_search(case_type=4,case_no=None,case_year=2020,headless=False,delay=0)

In [411]:
# Run the full scrape. The scraper above was built with the default ret=False,
# so start_scraping() returns None here.
data = delhi.start_scraping()

[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430






[WDM] - Driver [/Users/saurabhkarn/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache


Browser setup complete
Selecting case type BAIL APPLN. - [BAILA]
Total search results: 4244


In [390]:
# Collect the order-page URL behind every 'button pull-right' button of the page currently shown.
html = delhi.driver.page_source
soup = bs(html,'html5lib')
oj_details_elem = soup.find_all('button',{'class':'button pull-right'})
oj_details = [str(elem.get('onclick')).replace(str(elem.get('onclick'))[-1],'').replace('location.href=','http://delhihighcourt.nic.in/')
                for elem in oj_details_elem]

In [391]:
# Open every order page in its own new window.
for link in oj_details:
    delhi.driver.execute_script("window.open('{}')".format(link))

In [None]:
def get_order_pdf():
    """Placeholder for an order-PDF download helper that was never written.

    The original cell held only the ``def`` line, which is a SyntaxError.
    The stub keeps the name importable and fails loudly if it is called.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError("get_order_pdf has not been implemented yet")

In [392]:
# Harvest the PDF links from every open order window, then download them one by one.
windows = delhi.driver.window_handles
c = 0
pdf_links = []
for win in windows[1:]:
    delhi.driver.switch_to.window(win)
    html = delhi.driver.page_source
    soup = bs(html,'html5lib')
    oj_details_elem = soup.find_all('button',{'class':'LongCaseNoBtn'})
    oj_details = [str(elem.get('onclick')) for elem in oj_details_elem]

    # Drop the 'location.href=' prefix and the surrounding single quotes from each handler.
    order_links=[]
    for string in oj_details:
        string = string.replace('location.href=','')
        if string.startswith('\'') and string.endswith('\''):
            string = string[1:-1]
        order_links.append(string)

    for order in order_links:
        pdf_links.append(order)

    # Close the order window that was just parsed, THEN go back to the main window.
    # The reverse order closed the main window (windows[0]) on the first pass.
    delhi.driver.close()
    delhi.driver.switch_to.window(windows[0])

for link in pdf_links:
    delhi.driver.execute_script("window.open('{}')".format(link))
    window_handles = delhi.driver.window_handles
    delhi.driver.switch_to.window(window_handles[len(window_handles)-1])
    url = delhi.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')
    urllib.request.urlretrieve(url, f"filename_{c}.pdf")
    c += 1
    delhi.driver.close()
    delhi.driver.switch_to.window(delhi.driver.window_handles[0])

In [397]:
# Inspect the handles of the windows that are still open.
delhi.driver.window_handles

['CDwindow-95098CBFD64872591C5043BCCCFB54DD']

In [399]:
# Focus the first window and open `link` in a new one.
# NOTE(review): `link` is not set in this cell - it is whatever an earlier loop left behind in the kernel.
delhi.driver.switch_to.window(delhi.driver.window_handles[0])
delhi.driver.execute_script("window.open('{}')".format(link))

In [402]:
# Close the focused window, then focus the first of the remaining handles.
delhi.driver.close()
delhi.driver.switch_to.window(delhi.driver.window_handles[0])

In [403]:
# Download every link in pdf_links as filename_<c>.pdf, one temporary window per link.
# NOTE(review): `c` is not reset here, so numbering continues from its current kernel value.
for link in pdf_links:
    delhi.driver.execute_script("window.open('{}')".format(link))
    window_handles = delhi.driver.window_handles
    # Focus the last handle - presumably the window just opened; TODO confirm handle order.
    delhi.driver.switch_to.window(window_handles[len(window_handles)-1])
    url = delhi.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')
    urllib.request.urlretrieve(url, f"filename_{c}.pdf")
    c += 1
    # Close the PDF window and return to the first remaining handle.
    delhi.driver.close()
    delhi.driver.switch_to.window(delhi.driver.window_handles[0])

In [388]:
# Quit the whole browser session by calling the driver directly.
delhi.driver.quit()

In [335]:
# Snapshot the open window handles, focus the second one and close it.
windows = delhi.driver.window_handles
delhi.driver.switch_to.window(windows[1])
delhi.driver.close()

In [321]:
# Focus the first handle of the `windows` snapshot taken in another cell.
delhi.driver.switch_to.window(windows[0])

In [193]:
# Step through every window except the first, waiting five seconds before each switch.
for i in range(1,len(windows)):
    time.sleep(5)
    delhi.driver.switch_to.window(windows[i])

In [204]:
# Focus the fifth handle of the `windows` snapshot (index 4 is hard-coded).
delhi.driver.switch_to.window(windows[4])

In [316]:
# Read the onclick handler of every 'LongCaseNoBtn' button on the page that currently has focus.
html = delhi.driver.page_source
soup = bs(html,'html5lib')
oj_details_elem = soup.find_all('button',{'class':'LongCaseNoBtn'})
oj_details = [str(elem.get('onclick')) for elem in oj_details_elem]

# Drop the 'location.href=' prefix and the surrounding single quotes from each handler.
order_links=[]
for string in oj_details:
    string = string.replace('location.href=','')
    if string.startswith('\'') and string.endswith('\''):
        string = string[1:-1]
    order_links.append(string)

# Print the cleaned links for inspection.
for order in order_links:
    print(order)

In [310]:
# Open the first entry of `x` in a new window.
# NOTE(review): `x` is not defined in any cell shown here - it relies on leftover kernel state.
delhi.driver.execute_script("window.open('{}')".format(x[0]))

In [288]:
# Snapshot the open window handles and focus the second one.
windows = delhi.driver.window_handles
delhi.driver.switch_to.window(windows[1])

In [301]:
# Read the `src` of the top-level iframe of the focused page; later cells download from this URL.
url = delhi.driver.find_element_by_xpath("/html/body/iframe").get_attribute('src')

In [303]:
import urllib.request
# Download `url` to filename.pdf in the working directory.
urllib.request.urlretrieve(url, "filename.pdf")

('filename.pdf', <http.client.HTTPMessage at 0x7fb294916c10>)

In [None]:
# Display the parsed document currently bound to `soup`.
soup

In [279]:
# XPath pointing at an <embed> element below a <pdf-viewer> element.
# NOTE(review): no cell shown here uses pdf_elem - it looks like a leftover experiment.
pdf_elem = '/html/body/pdf-viewer//div/div[2]/div[1]/div[2]/embed'

In [274]:
# NOTE(review): neither download_pdf nor `x` is defined in any cell shown here - both rely on leftover kernel state.
download_pdf(x[0])



Downloading file from link: http://delhihighcourt.nic.in/dhcqrydisp_o.asp?pn=82812&yr=2020




[WDM] - Current google-chrome version is 90.0.4430
[WDM] - Get LATEST driver version for 90.0.4430
[WDM] - Driver [/Users/saurabhkarn/.wdm/drivers/chromedriver/mac64/90.0.4430.24/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install(),chrome_options = options)


File: test_file
Status: Download Complete.
Folder: /Users/saurabhkarn


In [None]:
# Look for <plugin> elements with class 'LongCaseNoBtn' on the page that currently has focus.
html = delhi.driver.page_source
soup = bs(html,'html5lib')
oj_details_elem = soup.find_all('plugin',{'class':'LongCaseNoBtn'})

In [205]:
# Parse the orders listed on the page that currently has focus and display the resulting dict.
delhi.parse_orders_page(delhi.driver.page_source)

{'n_1': {'sr_no': '1',
  'case_no': 'BAIL APPLN. 973/2020',
  'date_of_order': '09/06/2020',
  'corrigendum': ''},
 'n_2': {'sr_no': '2',
  'case_no': 'BAIL APPLN. 973/2020',
  'date_of_order': '08/06/2020',
  'corrigendum': ''},
 'n_3': {'sr_no': '3',
  'case_no': 'BAIL APPLN. 973/2020',
  'date_of_order': '05/06/2020',
  'corrigendum': ''},
 'n_4': {'sr_no': '4',
  'case_no': 'BAIL APPLN. 973/2020',
  'date_of_order': '28/05/2020',
  'corrigendum': ''},
 'n_5': {'sr_no': '5',
  'case_no': 'BAIL APPLN. 973/2020',
  'date_of_order': '21/05/2020',
  'corrigendum': ''}}