In [1]:
# YouTube Link:

# Ensure that beautifulsoup4, requests, and lxml are installed
# (lxml supplies the 'lxml' parser that BeautifulSoup is asked to use later on):
#   pip install beautifulsoup4
#   pip install requests
#   pip install lxml

import requests
from bs4 import BeautifulSoup

In [2]:
# Fetch the page with requests.get(); the URL to download is passed as the argument.
# A timeout is given explicitly because requests applies none by default, so an
# unresponsive server would otherwise leave this cell waiting indefinitely.
result = requests.get("https://www.google.com/", timeout=10)

# Confirm that the website was reachable: a status code of 200 (OK)
# indicates that the request succeeded and the page is present.
print(result.status_code)


200


In [3]:
# The other status codes you may run into are described on this Wikipedia page:
# https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

# The HTTP headers sent back with the response offer a further check that
# the correct page was accessed; take them from the response and print them.
response_headers = result.headers
print(response_headers)

{'Date': 'Sun, 21 Apr 2019 12:20:18 GMT', 'Expires': '-1', 'Cache-Control': 'private, max-age=0', 'Content-Type': 'text/html; charset=ISO-8859-1', 'P3P': 'CP="This is not a P3P policy! See g.co/p3phelp for more info."', 'Content-Encoding': 'gzip', 'Server': 'gws', 'X-XSS-Protection': '0', 'X-Frame-Options': 'SAMEORIGIN', 'Set-Cookie': '1P_JAR=2019-04-21-12; expires=Tue, 21-May-2019 12:20:18 GMT; path=/; domain=.google.com, NID=181=kqxLLmeA4snhkG2-jg-OJEssgDqtXRqdMLcP881xrz13MudtbpYy8xVELenNz1X_5EfEWPiQAbPhZOhVRZM7sD8kJuvHezL-GDm9-xMSLB8l_HWRucJj1ThjMFUkXBFNeXxWC3StK6Rgoi-yTMAdSN0YM0vOReLoyFpW_aVjdqY; expires=Mon, 21-Oct-2019 12:20:18 GMT; path=/; domain=.google.com; HttpOnly', 'Alt-Svc': 'quic=":443"; ma=2592000; v="46,44,43,39"', 'Transfer-Encoding': 'chunked'}


In [4]:
# Background on HTTP headers and the information they carry:
# https://en.wikipedia.org/wiki/List_of_HTTP_header_fields

# Keep the body of the response fetched by requests in its own variable.
src = result.content

# Hand the stored page source to BeautifulSoup, which parses it
# (using the 'lxml' parser) into an object that can be queried.
soup = BeautifulSoup(src, features='lxml')

# With the source parsed, specific pieces of the page can be pulled out
# directly. As an example, collect every <a> (link) tag on the page and
# print the resulting list, followed by blank lines as a separator.
links = soup.find_all(name="a")
print(links)
print("\n")


[<a class="gb1" href="https://www.google.co.il/imghp?hl=iw&amp;tab=wi">חיפוש תמונות</a>, <a class="gb1" href="https://maps.google.co.il/maps?hl=iw&amp;tab=wl">מפות</a>, <a class="gb1" href="https://www.youtube.com/?gl=IL&amp;tab=w1">YouTube</a>, <a class="gb1" href="https://news.google.co.il/nwshp?hl=iw&amp;tab=wn">חדשות</a>, <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a>, <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a>, <a class="gb1" href="https://www.google.com/calendar?tab=wc">יומן</a>, <a class="gb1" href="https://www.google.co.il/intl/iw/about/products?tab=wh" style="text-decoration:none"><u>עוד</u> »</a>, <a class="gb4" href="http://www.google.co.il/history/optout?hl=iw">היסטוריית אתרים</a>, <a class="gb4" href="/preferences?hl=iw">הגדרות</a>, <a class="gb4" href="https://accounts.google.com/ServiceLogin?hl=iw&amp;passive=true&amp;continue=https://www.google.com/" id="gb_70" target="_top">כניסה</a>, <a href="/advanced_search?hl=iw&amp;authu

In [8]:

# Rather than every link, suppose we only want the links whose visible text
# contains "Google". The .text attribute of a tag gives the text found
# between its opening <a> and closing </a> tags.
for link in links:
    if "Google" in link.text:
        print(link)
        # .get() returns None for an anchor that has no href attribute, whereas
        # indexing link.attrs['href'] would raise KeyError and abort the loop.
        print(link.get('href'))

<a href="/intl/iw/ads/"> פרסום ב-Google</a>
/intl/iw/ads/
<a href="/intl/iw/about.html">הכל על Google</a>
/intl/iw/about.html
<a dir="ltr" href="https://www.google.com/setprefdomain?prefdom=IL&amp;prev=https://www.google.co.il/&amp;sig=K_CugbAHsVv5fwF6fnFMRhBuDsd5s%3D">Google.co.il</a>
https://www.google.com/setprefdomain?prefdom=IL&prev=https://www.google.co.il/&sig=K_CugbAHsVv5fwF6fnFMRhBuDsd5s%3D
