# Imports

In [30]:
#Imports
import requests
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

# Set to uniquely store internal and external urls

In [31]:

internal_urls = set()
external_urls = set()

# Function to check if the URL is valid

In [32]:
def is_url_valid(url):
   #Checks if the url is valid or not
    parsedURL = urlparse(url)
    return bool(parsedURL.netloc) and bool(parsedURL.scheme)

#The above function works as follows. We check if the scheme is present and there is a value in the network location part
# url = "https://umd.edu/virusinfo"
# urlparse(url)
# ParseResult(scheme='https', netloc='umd.edu', path='/virusinfo', 
# params='', query='', fragment='')

# Function to Get all URLs

In [33]:

#The function gives all urls
def get_all_urls(url):
    urls = set()
    # domain name of the URL without the protocol (umd.edu in this case)
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            #href is empty and we don't need that a element
            continue
        #if the link is not absolute, make it by joining relative to the base
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        #constructing an absolute URL from parsed data
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_url_valid(href):
            #in valid url
            continue
        if href in internal_urls:
            #it is already in the set, so we don't need to add
            continue
        if domain_name not in href:
            #it is an external link. i.e
            # Check if it is already there 
            if href not in external_urls:
                print(f"[EXT] External link: {href}")
                external_urls.add(href)
            continue
        print(f"[INT] Internal link: {href}")
        urls.add(href)
        internal_urls.add(href)
    return urls


# Recursive function for crawling

In [34]:
total_urls_visited = 0
def crawl(url, max_urls=50):
    #Max URL is just to decrease the time if there are a lot of pages.
    #The following code was openly available of github and I found this
    #idea useful to inhibit crawling time
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_urls(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


# Setting the base URL

In [35]:
#base_url = "https://umd.edu/virusinfo"
#Input
base_url = input("Enter the URL : ")
# base_url = "https://www.ucf.edu/coronavirus/"

Enter the URL : https://umd.edu/virusinfo


# Splitting URL

In [36]:
parsedurl = urlparse(base_url)
# base_url_text = base_url.split("//",1)[1]
base_url_text = parsedurl.netloc+parsedurl.path
#print(base_url_text)
# base_url_text_domain = base_url_text.split("/",1)[0]
base_url_text_domain = parsedurl.netloc
#print(base_url_text_domain)

# Crawler

In [37]:
#Crawler call
crawl(base_url)
print("[+] Total External links:", len(external_urls))
print("[+] Total Internal links:", len(internal_urls))
print("[+] Total:", len(external_urls) + len(internal_urls))

[INT] Internal link: https://umd.edu/virusinfo
[INT] Internal link: https://umd.edu/
[EXT] External link: https://www.youtube.com/user/UMD2101
[EXT] External link: https://www.facebook.com/UnivofMaryland
[EXT] External link: https://twitter.com/UofMaryland
[EXT] External link: https://www.instagram.com/univofmaryland/
[INT] Internal link: https://umd.edu/virusinfo/campus-guidance
[INT] Internal link: https://umd.edu/virusinfo/stay-healthy
[INT] Internal link: https://umd.edu/virusinfo/communications-news
[INT] Internal link: https://umd.edu/virusinfo/confirmed-cases
[INT] Internal link: https://umd.edu/virusinfo/campus-resources
[INT] Internal link: https://umd.edu/virusinfo/help-students-crisis
[INT] Internal link: https://umd.edu/virusinfo/contact-information
[EXT] External link: https://www.cdc.gov/coronavirus/2019-ncov/index.html
[EXT] External link: https://phpa.health.maryland.gov/Pages/Novel-coronavirus.aspx
[INT] Internal link: https://umd.edu/covid-19-updates-and-resources/cas

[INT] Internal link: http://www.cmns.umd.edu/undergraduate
[INT] Internal link: http://www.cmns.umd.edu/graduate
[INT] Internal link: http://www.cmns.umd.edu/alumni-friends
[INT] Internal link: http://www.cmns.umd.edu/faculty-staff
[EXT] External link: https://advancement.umd.edu/giving/showschool.php
[INT] Internal link: http://www.cmns.umd.edu/about-cmns
[INT] Internal link: http://www.cmns.umd.edu/departments
[INT] Internal link: http://www.cmns.umd.edu/research
[INT] Internal link: http://www.cmns.umd.edu/news-events/news
[EXT] External link: https://cmns.umd.edu/news-events/features/4617
[EXT] External link: https://cmns.umd.edu/news-events/features/4587
[EXT] External link: https://cmns.umd.edu/news-events/news/coronavirus
[EXT] External link: https://cmns.umd.edu/news-events/features/4575
[EXT] External link: https://cmns.umd.edu/news-events/features/4561
[INT] Internal link: http://www.cmns.umd.edu/undergraduate/majors-minors
[INT] Internal link: http://www.cmns.umd.edu/undergr

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


[EXT] External link: http://www.umdrightnow.umd.edu/university-maryland-rankings
[EXT] External link: http://careers.umd.edu/careers4terps
[EXT] External link: mailto://ucc-recruiting@umd.edu
[EXT] External link: http://careers.umd.edu/employers/recruit-campus
[EXT] External link: http://careers.umd.edu/employers/hire-students-alumni
[EXT] External link: http://careers.umd.edu/
[EXT] External link: http://www-math.umd.edu/undergraduate/resources.html
[EXT] External link: https://www-math.umd.edu/testbank.html
[EXT] External link: http://www.resnet.umd.edu/programs/math_success/
[EXT] External link: https://calendly.com/tab6ep/math-learning-program/
[EXT] External link: http://www2.chem.umd.edu/axe
[EXT] External link: https://www.tutoring.umd.edu/chemistry.html
[EXT] External link: http://umdphysics.umd.edu/academics/academic-support.html
[EXT] External link: http://www.counseling.umd.edu/LAS
[EXT] External link: https://tltc.umd.edu/learn/
[INT] Internal link: http://www.cmns.umd.edu/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


[EXT] External link: http://www2.chem.umd.edu/umd-nobcche/
[EXT] External link: http://umd.orgsync.com/show_profile/64632-society-for-the-advancement-of-chicanos-and-native-americans-in-science-sacnas-chapter
[EXT] External link: http://www2.chem.umd.edu/gsohome/index.php
[EXT] External link: http://entomology.umd.edu/entomology-student-organization.html
[EXT] External link: http://gradmap.astro.umd.edu/
[EXT] External link: http://gsg.umd.edu/
[EXT] External link: http://umd.orgsync.com/org/gsg/n_gsgevents
[EXT] External link: http://thestamp.umd.edu/student_involvement/graduate_student_life
[EXT] External link: https://commencement.umd.edu/speakers-honorees/student-speaker
[EXT] External link: https://commencement.umd.edu/
[INT] Internal link: http://www.cmns.umd.edu/news-events/features/4584
[EXT] External link: https://commencement.umd.edu/school-college-virtual-ceremonies
[EXT] External link: https://umdsurvey.umd.edu/jfe/form/SV_b1I1AyBXKrm5LFP
[EXT] External link: http://registr

[EXT] External link: http://cmns.umd.edu/news-events/features/4566
[EXT] External link: http://cmns.umd.edu/taxonomy/term/734/all/feed
[EXT] External link: http://cmns.umd.edu/news-events/features/4606
[EXT] External link: http://cmns.umd.edu/news-events/features/4600
[EXT] External link: http://cmns.umd.edu/news-events/features/4599
[EXT] External link: http://cmns.umd.edu/news-events/features/4564
[EXT] External link: http://cmns.umd.edu/taxonomy/term/724/all/feed
[EXT] External link: http://cmns.umd.edu/news-events/features/4434
[EXT] External link: http://cmns.umd.edu/news-events/features/4404
[EXT] External link: http://cmns.umd.edu/news-events/features/4403
[EXT] External link: http://cmns.umd.edu/news-events/features/4410
[EXT] External link: http://cmns.umd.edu/news-events/features/4411
[EXT] External link: http://cmns.umd.edu/news-events/features/4374
[EXT] External link: http://cmns.umd.edu/news-events/features/4369
[EXT] External link: http://cmns.umd.edu/news-events/feature

[EXT] External link: https://youtu.be/byNP1Qt6nYE
[EXT] External link: http://cmns.umd.edu/news-events/features/4408
[EXT] External link: http://cmns.umd.edu/news-events/features/4407
[EXT] External link: http://cmns.umd.edu/news-events/features/4413
[EXT] External link: http://cmns.umd.edu/news-events/features/4400
[EXT] External link: http://cmns.umd.edu/news-events/features/4399
[EXT] External link: http://www.umiacs.umd.edu/about-us/news/hajiaghayi-wins-prestigious-guggenheim-fellowship
[EXT] External link: http://cmns.umd.edu/taxonomy/term/61/all/feed
[EXT] External link: http://cmns.umd.edu/news-events/features/4570
[EXT] External link: http://cmns.umd.edu/news-events/features/4526
[EXT] External link: http://cmns.umd.edu/news-events/features/4513
[EXT] External link: http://cmns.umd.edu/news-events/features/4489
[EXT] External link: http://cmns.umd.edu/news-events/features/4488
[EXT] External link: http://cmns.umd.edu/news-events/features/4426
[EXT] External link: http://cmns.um

[INT] Internal link: http://www.cmns.umd.edu/news-events/features/2466
[EXT] External link: https://www.umiacs.umd.edu
[EXT] External link: https://umd.box.com/shared/static/zu0uo17kxk3zc5me3ghap5wp0v9gi69t.jpg
[EXT] External link: http://commerce.maryland.gov/fund/maryland-e-nnovation-initiative-fund-(meif)
[EXT] External link: http://visisonics.com
[EXT] External link: https://www.cs.umd.edu/people/samirk
[EXT] External link: mailto://mewright@umd.edu
[INT] Internal link: http://www.cmns.umd.edu
[INT] Internal link: http://www.cmns.umd.edu/other-tags/elizabeth-iribe
[INT] Internal link: http://www.cmns.umd.edu/other-tags/brendan-iribe
[INT] Internal link: http://www.cmns.umd.edu/other-tags/matthias-zwicker
[INT] Internal link: http://www.cmns.umd.edu/other-tags/reginald-allan-hahne
[INT] Internal link: http://www.cmns.umd.edu/taxonomycmnstagsxmlxml/augmentedvirtual-reality
[EXT] External link: http://cmns.umd.edu/news-events/features/4132
[EXT] External link: http://cmns.umd.edu/news

# Screenshot following pages

In [38]:
count = 0
immediate_urls = [] #Linked Associated with the current page, https://umd.edu/virusinfo/ in this case

for url in internal_urls:
    if base_url_text in url:
        immediate_urls.append(url)

for immediate_url in immediate_urls:
    count = count+1
    print(f'{immediate_url}\n')

print(count)

https://umd.edu/virusinfo

https://umd.edu/virusinfo/contact-information

https://umd.edu/virusinfo/academic-faqs

https://umd.edu/virusinfo/campus-guidance

https://umd.edu/virusinfo/stay-healthy

https://umd.edu/virusinfo/communications-news

https://umd.edu/virusinfo/campus-resources

https://umd.edu/virusinfo/confirmed-cases

https://umd.edu/virusinfo/help-students-crisis

9


# Automation

In [39]:
count = 0
name_of_folder = f'{base_url_text_domain}_Screenshots'
for i in immediate_urls:
    options = webdriver.ChromeOptions()
    options.headless = True
    driver = webdriver.Chrome(options=options,executable_path='chromedriver.exe') # #Local Path of chrome driver
    url = i
    file_name = url.replace(base_url,'')
    file_name = file_name.replace('/','')
    print(f'Visiting {base_url_text}/{file_name}')
    print(f'...Taking a screenshot')
    driver.get(url)
    if not os.path.exists(name_of_folder):
        os.makedirs(name_of_folder)
    S = lambda X: driver.execute_script('return document.body.parentNode.scroll'+X)
    driver.set_window_size(S('Width'),S('Height')) # May need manual adjustment                                                                                                                
    driver.find_element_by_tag_name('body').screenshot(f'{name_of_folder}\\{base_url_text_domain}-{file_name}.png')
    print(f'Screenshot of {file_name} page taken! \n')
    driver.quit()

print(f'Task Completed! Files stored in the {name_of_folder} Folder')

Visiting umd.edu/virusinfo/
...Taking a screenshot
Screenshot of  page taken! 

Visiting umd.edu/virusinfo/contact-information
...Taking a screenshot
Screenshot of contact-information page taken! 

Visiting umd.edu/virusinfo/academic-faqs
...Taking a screenshot
Screenshot of academic-faqs page taken! 

Visiting umd.edu/virusinfo/campus-guidance
...Taking a screenshot
Screenshot of campus-guidance page taken! 

Visiting umd.edu/virusinfo/stay-healthy
...Taking a screenshot
Screenshot of stay-healthy page taken! 

Visiting umd.edu/virusinfo/communications-news
...Taking a screenshot
Screenshot of communications-news page taken! 

Visiting umd.edu/virusinfo/campus-resources
...Taking a screenshot
Screenshot of campus-resources page taken! 

Visiting umd.edu/virusinfo/confirmed-cases
...Taking a screenshot
Screenshot of confirmed-cases page taken! 

Visiting umd.edu/virusinfo/help-students-crisis
...Taking a screenshot
Screenshot of help-students-crisis page taken! 

Task Completed! Files 

In [28]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')