Part 1: Play with a small sample of an HTML document and extract all page links from it
    
Part 2: Parse one file to get all of its links

In [2]:
import string
import os
from bs4 import BeautifulSoup

### BeautifulSoup documentation
https://www.crummy.com/software/BeautifulSoup/bs4/doc/

## Part 1: Play with a small sample of an HTML document

In [34]:
# html_doc is an excerpt pulled out of index_pages/a.html; the "... skipped ..." lines
# stand in for the parts of the page that were left out of this sample.
# The purpose here is to use BeautifulSoup to parse html_doc and find the relevant page
# links, i.e. the <a> elements inside <ul class="vitamins-list">.

html_doc = """
<html><head><title>Whatever the title</title></head>
<body>
... skipped ...

<div class="vitamins-list-container" data-metrics-module="vs-az">
  <ul class="vitamins-list">
    <li><a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-266/abscess-root">ABSCESS ROOT</a></li>
    <li><a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-267/abuta">ABUTA</a></li>
    <li><a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-268/acacia">ACACIA</a></li>
    <li><a data-metrics-link="link" href="https://www.webmd.com/vitamins-supplements/ingredientmono-1550-AYAHUASCA.aspx?activeIngredientId=1550&amp;activeIngredientName=AYAHUASCA&amp;source=0">AYAHUASCA</a></li>
  </ul>
</div><!--END of vitamin_supplementsaz_widget--><!--vitamin_searchresult_widget:  091e9c5e81809003--><!--END of vitamin_searchresult_widget--></div><div id="ContentPane31" class="pane" tabindex="-1"><!--$$ContentPane31$$--></div><div id="ContentPane32" class="pane" tabindex="-1"><!--$$ContentPane32$$--></div><div id="ContentPane33" class="pane" tabindex="-1"><!--$$ContentPane33$$--></div><div id="ContentPane34" class="pane" tabindex="-1"><!--$$ContentPane34$$--></div><div id="ContentPane35" class="pane" tabindex="-1"></div><div id="ContentPane36" class="pane" tabindex="-1"><!--$$ContentPane36$$--></div></div>
            <div class="main-container main-container-3" tabindex="-1" data-js="main-container-3"><div id="ContentPane37" class="pane" tabindex="-1"><aside class="module module-unified-right-rail module-ed-urr oas-enabled" id="ed-urr" data-metrics-module="ed-urr" role="complementary">

... skipped ...
"""

In [37]:
# Build the BeautifulSoup parse tree for html_doc using the "html.parser" backend
soup = BeautifulSoup(markup=html_doc, features="html.parser")

In [44]:
# Narrow the search to the first element matching the CSS selector ".vitamins-list",
# which in this sample is the <ul class="vitamins-list">...</ul> element
scope = soup.select_one(selector=".vitamins-list")
scope

<ul class="vitamins-list">
<li><a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-266/abscess-root">ABSCESS ROOT</a></li>
<li><a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-267/abuta">ABUTA</a></li>
<li><a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-268/acacia">ACACIA</a></li>
<li><a data-metrics-link="link" href="https://www.webmd.com/vitamins-supplements/ingredientmono-1550-AYAHUASCA.aspx?activeIngredientId=1550&amp;activeIngredientName=AYAHUASCA&amp;source=0">AYAHUASCA</a></li>
</ul>

In [45]:
# Collect every <a> element found inside the selected list
anchors = scope.find_all(name="a")
anchors

[<a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-266/abscess-root">ABSCESS ROOT</a>,
 <a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-267/abuta">ABUTA</a>,
 <a data-metrics-link="link" href="https://www.webmd.com/vitamins/ai/ingredientmono-268/acacia">ACACIA</a>,
 <a data-metrics-link="link" href="https://www.webmd.com/vitamins-supplements/ingredientmono-1550-AYAHUASCA.aspx?activeIngredientId=1550&amp;activeIngredientName=AYAHUASCA&amp;source=0">AYAHUASCA</a>]

In [46]:
# Print each anchor's text (the name) next to the URL held in its href attribute
for anchor in anchors:
    name, href = anchor.string, anchor['href']
    print(name, href)

ABSCESS ROOT https://www.webmd.com/vitamins/ai/ingredientmono-266/abscess-root
ABUTA https://www.webmd.com/vitamins/ai/ingredientmono-267/abuta
ACACIA https://www.webmd.com/vitamins/ai/ingredientmono-268/acacia
AYAHUASCA https://www.webmd.com/vitamins-supplements/ingredientmono-1550-AYAHUASCA.aspx?activeIngredientId=1550&activeIngredientName=AYAHUASCA&source=0


## Part 2: Parse one whole file

In [58]:
# Read the saved index page for one letter, e.g. index_pages/a.html
folder = "../HW1/index_pages"
letter = "a"
filename = letter + ".html"
filepath = os.path.join(folder, filename)

# Use a context manager so the file handle is closed as soon as the read finishes;
# a bare open(filepath).read() leaves the handle open until it is garbage-collected.
# NOTE(review): the platform default text encoding is still used here - confirm how the
# pages were saved and pass encoding=... explicitly if it is known.
with open(filepath) as html_file:
    html_doc = html_file.read()

In [48]:
# Parse the whole page and print the name and URL of every link in the vitamins list
soup = BeautifulSoup(html_doc, 'html.parser')
scope = soup.select_one(".vitamins-list")
if scope is None:
    # select_one returns None when nothing matches; fail with a clear message instead of
    # an AttributeError on the find_all call below.
    raise ValueError("no element with class 'vitamins-list' found in html_doc")

# href=True keeps only anchors that carry an href attribute, so anchor['href'] below
# cannot raise KeyError on a link-less <a> element.
anchors = scope.find_all('a', href=True)
for anchor in anchors:
    name = anchor.string
    href = anchor['href']

    print(name, href)

ABSCESS ROOT https://www.webmd.com/vitamins/ai/ingredientmono-266/abscess-root
ABUTA https://www.webmd.com/vitamins/ai/ingredientmono-267/abuta
ACACIA https://www.webmd.com/vitamins/ai/ingredientmono-268/acacia
ACACIA RIGIDULA https://www.webmd.com/vitamins/ai/ingredientmono-1411/acacia-rigidula
ACAI https://www.webmd.com/vitamins/ai/ingredientmono-1109/acai
ACEROLA https://www.webmd.com/vitamins/ai/ingredientmono-608/acerola
ACETYL-L-CARNITINE https://www.webmd.com/vitamins/ai/ingredientmono-834/acetyl-l-carnitine
ACKEE https://www.webmd.com/vitamins/ai/ingredientmono-817/ackee
ACONITE https://www.webmd.com/vitamins/ai/ingredientmono-609/aconite
ACTIVATED CHARCOAL https://www.webmd.com/vitamins/ai/ingredientmono-269/activated-charcoal
ADENOSINE https://www.webmd.com/vitamins/ai/ingredientmono-1067/adenosine
ADRENAL EXTRACT https://www.webmd.com/vitamins/ai/ingredientmono-941/adrenal-extract
ADRUE https://www.webmd.com/vitamins/ai/ingredientmono-48/adrue
AFRICAN WILD POTATO https://www